firecrawl-py 2.2.0__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl-py might be problematic. Click here for more details.

Files changed (29) hide show
  1. firecrawl/__init__.py +1 -1
  2. firecrawl/firecrawl.py +6 -11
  3. {firecrawl_py-2.2.0.dist-info → firecrawl_py-2.4.0.dist-info}/METADATA +1 -1
  4. firecrawl_py-2.4.0.dist-info/RECORD +12 -0
  5. {firecrawl_py-2.2.0.dist-info → firecrawl_py-2.4.0.dist-info}/top_level.txt +0 -2
  6. build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  7. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  8. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  9. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  10. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  11. build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4376
  12. build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  13. build/lib/build/lib/firecrawl/__init__.py +0 -79
  14. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  15. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  16. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  17. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  18. build/lib/build/lib/firecrawl/firecrawl.py +0 -4376
  19. build/lib/build/lib/tests/test_change_tracking.py +0 -98
  20. build/lib/firecrawl/__init__.py +0 -79
  21. build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  22. build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  23. build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  24. build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  25. build/lib/firecrawl/firecrawl.py +0 -4376
  26. build/lib/tests/test_change_tracking.py +0 -98
  27. firecrawl_py-2.2.0.dist-info/RECORD +0 -33
  28. {firecrawl_py-2.2.0.dist-info → firecrawl_py-2.4.0.dist-info}/LICENSE +0 -0
  29. {firecrawl_py-2.2.0.dist-info → firecrawl_py-2.4.0.dist-info}/WHEEL +0 -0
@@ -1,4376 +0,0 @@
1
- """
2
- FirecrawlApp Module
3
-
4
- This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
5
- It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
6
- and check the status of these jobs. The module uses requests for HTTP communication
7
- and handles retries for certain HTTP status codes.
8
-
9
- Classes:
10
- - FirecrawlApp: Main class for interacting with the Firecrawl API.
11
- """
12
- import logging
13
- import os
14
- import time
15
- from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
16
- import json
17
- from datetime import datetime
18
- import re
19
- import warnings
20
- import requests
21
- import pydantic
22
- import websockets
23
- import aiohttp
24
- import asyncio
25
- from pydantic import Field
26
-
27
- # Suppress Pydantic warnings about attribute shadowing
28
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
29
- warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
30
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
31
- warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
32
-
33
-
34
- def get_version():
35
- try:
36
- from pathlib import Path
37
- package_path = os.path.dirname(__file__)
38
- version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
39
- version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
40
- if version_match:
41
- return version_match.group(1).strip()
42
- except Exception:
43
- print("Failed to get version from __init__.py")
44
- return None
45
-
46
- version = get_version()
47
-
48
- logger : logging.Logger = logging.getLogger("firecrawl")
49
-
50
- T = TypeVar('T')
51
-
52
- # class FirecrawlDocumentMetadata(pydantic.BaseModel):
53
- # """Metadata for a Firecrawl document."""
54
- # title: Optional[str] = None
55
- # description: Optional[str] = None
56
- # language: Optional[str] = None
57
- # keywords: Optional[str] = None
58
- # robots: Optional[str] = None
59
- # ogTitle: Optional[str] = None
60
- # ogDescription: Optional[str] = None
61
- # ogUrl: Optional[str] = None
62
- # ogImage: Optional[str] = None
63
- # ogAudio: Optional[str] = None
64
- # ogDeterminer: Optional[str] = None
65
- # ogLocale: Optional[str] = None
66
- # ogLocaleAlternate: Optional[List[str]] = None
67
- # ogSiteName: Optional[str] = None
68
- # ogVideo: Optional[str] = None
69
- # dctermsCreated: Optional[str] = None
70
- # dcDateCreated: Optional[str] = None
71
- # dcDate: Optional[str] = None
72
- # dctermsType: Optional[str] = None
73
- # dcType: Optional[str] = None
74
- # dctermsAudience: Optional[str] = None
75
- # dctermsSubject: Optional[str] = None
76
- # dcSubject: Optional[str] = None
77
- # dcDescription: Optional[str] = None
78
- # dctermsKeywords: Optional[str] = None
79
- # modifiedTime: Optional[str] = None
80
- # publishedTime: Optional[str] = None
81
- # articleTag: Optional[str] = None
82
- # articleSection: Optional[str] = None
83
- # sourceURL: Optional[str] = None
84
- # statusCode: Optional[int] = None
85
- # error: Optional[str] = None
86
-
87
- class AgentOptions(pydantic.BaseModel):
88
- """Configuration for the agent."""
89
- model: Literal["FIRE-1"] = "FIRE-1"
90
- prompt: Optional[str] = None
91
-
92
- class AgentOptionsExtract(pydantic.BaseModel):
93
- """Configuration for the agent in extract operations."""
94
- model: Literal["FIRE-1"] = "FIRE-1"
95
-
96
- class ActionsResult(pydantic.BaseModel):
97
- """Result of actions performed during scraping."""
98
- screenshots: List[str]
99
-
100
- class ChangeTrackingData(pydantic.BaseModel):
101
- """
102
- Data for the change tracking format.
103
- """
104
- previousScrapeAt: Optional[str] = None
105
- changeStatus: str # "new" | "same" | "changed" | "removed"
106
- visibility: str # "visible" | "hidden"
107
- diff: Optional[Dict[str, Any]] = None
108
- json: Optional[Any] = None
109
-
110
- class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
111
- """Document retrieved or processed by Firecrawl."""
112
- url: Optional[str] = None
113
- markdown: Optional[str] = None
114
- html: Optional[str] = None
115
- rawHtml: Optional[str] = None
116
- links: Optional[List[str]] = None
117
- extract: Optional[T] = None
118
- json: Optional[T] = None
119
- screenshot: Optional[str] = None
120
- metadata: Optional[Any] = None
121
- actions: Optional[ActionsResult] = None
122
- title: Optional[str] = None # v1 search only
123
- description: Optional[str] = None # v1 search only
124
- changeTracking: Optional[ChangeTrackingData] = None
125
-
126
- class LocationConfig(pydantic.BaseModel):
127
- """Location configuration for scraping."""
128
- country: Optional[str] = None
129
- languages: Optional[List[str]] = None
130
-
131
- class WebhookConfig(pydantic.BaseModel):
132
- """Configuration for webhooks."""
133
- url: str
134
- headers: Optional[Dict[str, str]] = None
135
- metadata: Optional[Dict[str, str]] = None
136
- events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
137
-
138
- class ScrapeOptions(pydantic.BaseModel):
139
- """Parameters for scraping operations."""
140
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
141
- headers: Optional[Dict[str, str]] = None
142
- includeTags: Optional[List[str]] = None
143
- excludeTags: Optional[List[str]] = None
144
- onlyMainContent: Optional[bool] = None
145
- waitFor: Optional[int] = None
146
- timeout: Optional[int] = None
147
- location: Optional[LocationConfig] = None
148
- mobile: Optional[bool] = None
149
- skipTlsVerification: Optional[bool] = None
150
- removeBase64Images: Optional[bool] = None
151
- blockAds: Optional[bool] = None
152
- proxy: Optional[Literal["basic", "stealth"]] = None
153
-
154
- class WaitAction(pydantic.BaseModel):
155
- """Wait action to perform during scraping."""
156
- type: Literal["wait"]
157
- milliseconds: int
158
- selector: Optional[str] = None
159
-
160
- class ScreenshotAction(pydantic.BaseModel):
161
- """Screenshot action to perform during scraping."""
162
- type: Literal["screenshot"]
163
- fullPage: Optional[bool] = None
164
-
165
- class ClickAction(pydantic.BaseModel):
166
- """Click action to perform during scraping."""
167
- type: Literal["click"]
168
- selector: str
169
-
170
- class WriteAction(pydantic.BaseModel):
171
- """Write action to perform during scraping."""
172
- type: Literal["write"]
173
- text: str
174
-
175
- class PressAction(pydantic.BaseModel):
176
- """Press action to perform during scraping."""
177
- type: Literal["press"]
178
- key: str
179
-
180
- class ScrollAction(pydantic.BaseModel):
181
- """Scroll action to perform during scraping."""
182
- type: Literal["scroll"]
183
- direction: Literal["up", "down"]
184
- selector: Optional[str] = None
185
-
186
- class ScrapeAction(pydantic.BaseModel):
187
- """Scrape action to perform during scraping."""
188
- type: Literal["scrape"]
189
-
190
- class ExecuteJavascriptAction(pydantic.BaseModel):
191
- """Execute javascript action to perform during scraping."""
192
- type: Literal["executeJavascript"]
193
- script: str
194
-
195
-
196
- class ExtractAgent(pydantic.BaseModel):
197
- """Configuration for the agent in extract operations."""
198
- model: Literal["FIRE-1"] = "FIRE-1"
199
-
200
- class JsonConfig(pydantic.BaseModel):
201
- """Configuration for extraction."""
202
- prompt: Optional[str] = None
203
- schema: Optional[Any] = None
204
- systemPrompt: Optional[str] = None
205
- agent: Optional[ExtractAgent] = None
206
-
207
- class ScrapeParams(ScrapeOptions):
208
- """Parameters for scraping operations."""
209
- extract: Optional[JsonConfig] = None
210
- jsonOptions: Optional[JsonConfig] = None
211
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
212
- agent: Optional[AgentOptions] = None
213
-
214
- class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
215
- """Response from scraping operations."""
216
- success: bool = True
217
- warning: Optional[str] = None
218
- error: Optional[str] = None
219
-
220
- class BatchScrapeResponse(pydantic.BaseModel):
221
- """Response from batch scrape operations."""
222
- id: Optional[str] = None
223
- url: Optional[str] = None
224
- success: bool = True
225
- error: Optional[str] = None
226
- invalidURLs: Optional[List[str]] = None
227
-
228
- class BatchScrapeStatusResponse(pydantic.BaseModel):
229
- """Response from batch scrape status checks."""
230
- success: bool = True
231
- status: Literal["scraping", "completed", "failed", "cancelled"]
232
- completed: int
233
- total: int
234
- creditsUsed: int
235
- expiresAt: datetime
236
- next: Optional[str] = None
237
- data: List[FirecrawlDocument]
238
-
239
- class CrawlParams(pydantic.BaseModel):
240
- """Parameters for crawling operations."""
241
- includePaths: Optional[List[str]] = None
242
- excludePaths: Optional[List[str]] = None
243
- maxDepth: Optional[int] = None
244
- maxDiscoveryDepth: Optional[int] = None
245
- limit: Optional[int] = None
246
- allowBackwardLinks: Optional[bool] = None
247
- allowExternalLinks: Optional[bool] = None
248
- ignoreSitemap: Optional[bool] = None
249
- scrapeOptions: Optional[ScrapeOptions] = None
250
- webhook: Optional[Union[str, WebhookConfig]] = None
251
- deduplicateSimilarURLs: Optional[bool] = None
252
- ignoreQueryParameters: Optional[bool] = None
253
- regexOnFullURL: Optional[bool] = None
254
-
255
- class CrawlResponse(pydantic.BaseModel):
256
- """Response from crawling operations."""
257
- id: Optional[str] = None
258
- url: Optional[str] = None
259
- success: bool = True
260
- error: Optional[str] = None
261
-
262
- class CrawlStatusResponse(pydantic.BaseModel):
263
- """Response from crawl status checks."""
264
- success: bool = True
265
- status: Literal["scraping", "completed", "failed", "cancelled"]
266
- completed: int
267
- total: int
268
- creditsUsed: int
269
- expiresAt: datetime
270
- next: Optional[str] = None
271
- data: List[FirecrawlDocument]
272
-
273
- class CrawlErrorsResponse(pydantic.BaseModel):
274
- """Response from crawl/batch scrape error monitoring."""
275
- errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
276
- robotsBlocked: List[str]
277
-
278
- class MapParams(pydantic.BaseModel):
279
- """Parameters for mapping operations."""
280
- search: Optional[str] = None
281
- ignoreSitemap: Optional[bool] = None
282
- includeSubdomains: Optional[bool] = None
283
- sitemapOnly: Optional[bool] = None
284
- limit: Optional[int] = None
285
- timeout: Optional[int] = None
286
-
287
- class MapResponse(pydantic.BaseModel):
288
- """Response from mapping operations."""
289
- success: bool = True
290
- links: Optional[List[str]] = None
291
- error: Optional[str] = None
292
-
293
- class ExtractParams(pydantic.BaseModel):
294
- """Parameters for extracting information from URLs."""
295
- prompt: Optional[str] = None
296
- schema: Optional[Any] = None
297
- systemPrompt: Optional[str] = None
298
- allowExternalLinks: Optional[bool] = None
299
- enableWebSearch: Optional[bool] = None
300
- includeSubdomains: Optional[bool] = None
301
- origin: Optional[str] = None
302
- showSources: Optional[bool] = None
303
- scrapeOptions: Optional[ScrapeOptions] = None
304
-
305
- class ExtractResponse(pydantic.BaseModel, Generic[T]):
306
- """Response from extract operations."""
307
- success: bool = True
308
- data: Optional[T] = None
309
- error: Optional[str] = None
310
- warning: Optional[str] = None
311
- sources: Optional[List[str]] = None
312
-
313
- class SearchParams(pydantic.BaseModel):
314
- query: str
315
- limit: Optional[int] = 5
316
- tbs: Optional[str] = None
317
- filter: Optional[str] = None
318
- lang: Optional[str] = "en"
319
- country: Optional[str] = "us"
320
- location: Optional[str] = None
321
- origin: Optional[str] = "api"
322
- timeout: Optional[int] = 60000
323
- scrapeOptions: Optional[ScrapeOptions] = None
324
-
325
- class SearchResponse(pydantic.BaseModel):
326
- """Response from search operations."""
327
- success: bool = True
328
- data: List[FirecrawlDocument]
329
- warning: Optional[str] = None
330
- error: Optional[str] = None
331
-
332
- class GenerateLLMsTextParams(pydantic.BaseModel):
333
- """
334
- Parameters for the LLMs.txt generation operation.
335
- """
336
- maxUrls: Optional[int] = 10
337
- showFullText: Optional[bool] = False
338
- __experimental_stream: Optional[bool] = None
339
-
340
- class DeepResearchParams(pydantic.BaseModel):
341
- """
342
- Parameters for the deep research operation.
343
- """
344
- maxDepth: Optional[int] = 7
345
- timeLimit: Optional[int] = 270
346
- maxUrls: Optional[int] = 20
347
- analysisPrompt: Optional[str] = None
348
- systemPrompt: Optional[str] = None
349
- __experimental_streamSteps: Optional[bool] = None
350
-
351
- class DeepResearchResponse(pydantic.BaseModel):
352
- """
353
- Response from the deep research operation.
354
- """
355
- success: bool
356
- id: str
357
- error: Optional[str] = None
358
-
359
- class DeepResearchStatusResponse(pydantic.BaseModel):
360
- """
361
- Status response from the deep research operation.
362
- """
363
- success: bool
364
- data: Optional[Dict[str, Any]] = None
365
- status: str
366
- error: Optional[str] = None
367
- expiresAt: str
368
- currentDepth: int
369
- maxDepth: int
370
- activities: List[Dict[str, Any]]
371
- sources: List[Dict[str, Any]]
372
- summaries: List[str]
373
-
374
- class GenerateLLMsTextResponse(pydantic.BaseModel):
375
- """Response from LLMs.txt generation operations."""
376
- success: bool = True
377
- id: str
378
- error: Optional[str] = None
379
-
380
- class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
381
- llmstxt: str
382
- llmsfulltxt: Optional[str] = None
383
-
384
- class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
385
- """Status response from LLMs.txt generation operations."""
386
- success: bool = True
387
- data: Optional[GenerateLLMsTextStatusResponseData] = None
388
- status: Literal["processing", "completed", "failed"]
389
- error: Optional[str] = None
390
- expiresAt: str
391
-
392
- class SearchResponse(pydantic.BaseModel):
393
- """
394
- Response from the search operation.
395
- """
396
- success: bool
397
- data: List[Dict[str, Any]]
398
- warning: Optional[str] = None
399
- error: Optional[str] = None
400
-
401
- class ExtractParams(pydantic.BaseModel):
402
- """
403
- Parameters for the extract operation.
404
- """
405
- prompt: Optional[str] = None
406
- schema: Optional[Any] = pydantic.Field(None, alias='schema')
407
- system_prompt: Optional[str] = None
408
- allow_external_links: Optional[bool] = False
409
- enable_web_search: Optional[bool] = False
410
- # Just for backwards compatibility
411
- enableWebSearch: Optional[bool] = False
412
- show_sources: Optional[bool] = False
413
- agent: Optional[Dict[str, Any]] = None
414
-
415
- class ExtractResponse(pydantic.BaseModel, Generic[T]):
416
- """
417
- Response from the extract operation.
418
- """
419
- success: bool
420
- data: Optional[T] = None
421
- error: Optional[str] = None
422
-
423
- class FirecrawlApp:
424
- def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
425
- """
426
- Initialize the FirecrawlApp instance with API key, API URL.
427
-
428
- Args:
429
- api_key (Optional[str]): API key for authenticating with the Firecrawl API.
430
- api_url (Optional[str]): Base URL for the Firecrawl API.
431
- """
432
- self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
433
- self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
434
-
435
- # Only require API key when using cloud service
436
- if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
437
- logger.warning("No API key provided for cloud service")
438
- raise ValueError('No API key provided')
439
-
440
- logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
441
-
442
- def scrape_url(
443
- self,
444
- url: str,
445
- *,
446
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
447
- include_tags: Optional[List[str]] = None,
448
- exclude_tags: Optional[List[str]] = None,
449
- only_main_content: Optional[bool] = None,
450
- wait_for: Optional[int] = None,
451
- timeout: Optional[int] = None,
452
- location: Optional[LocationConfig] = None,
453
- mobile: Optional[bool] = None,
454
- skip_tls_verification: Optional[bool] = None,
455
- remove_base64_images: Optional[bool] = None,
456
- block_ads: Optional[bool] = None,
457
- proxy: Optional[Literal["basic", "stealth"]] = None,
458
- extract: Optional[JsonConfig] = None,
459
- json_options: Optional[JsonConfig] = None,
460
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
461
- **kwargs) -> ScrapeResponse[Any]:
462
- """
463
- Scrape and extract content from a URL.
464
-
465
- Args:
466
- url (str): Target URL to scrape
467
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
468
- include_tags (Optional[List[str]]): HTML tags to include
469
- exclude_tags (Optional[List[str]]): HTML tags to exclude
470
- only_main_content (Optional[bool]): Extract main content only
471
- wait_for (Optional[int]): Wait for a specific element to appear
472
- timeout (Optional[int]): Request timeout (ms)
473
- location (Optional[LocationConfig]): Location configuration
474
- mobile (Optional[bool]): Use mobile user agent
475
- skip_tls_verification (Optional[bool]): Skip TLS verification
476
- remove_base64_images (Optional[bool]): Remove base64 images
477
- block_ads (Optional[bool]): Block ads
478
- proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
479
- extract (Optional[JsonConfig]): Content extraction settings
480
- json_options (Optional[JsonConfig]): JSON extraction settings
481
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
482
-
483
-
484
- Returns:
485
- ScrapeResponse with:
486
- * Requested content formats
487
- * Page metadata
488
- * Extraction results
489
- * Success/error status
490
-
491
- Raises:
492
- Exception: If scraping fails
493
- """
494
- headers = self._prepare_headers()
495
-
496
- # Build scrape parameters
497
- scrape_params = {
498
- 'url': url,
499
- 'origin': f"python-sdk@{version}"
500
- }
501
-
502
- # Add optional parameters if provided
503
- if formats:
504
- scrape_params['formats'] = formats
505
- if include_tags:
506
- scrape_params['includeTags'] = include_tags
507
- if exclude_tags:
508
- scrape_params['excludeTags'] = exclude_tags
509
- if only_main_content is not None:
510
- scrape_params['onlyMainContent'] = only_main_content
511
- if wait_for:
512
- scrape_params['waitFor'] = wait_for
513
- if timeout:
514
- scrape_params['timeout'] = timeout
515
- if location:
516
- scrape_params['location'] = location.dict(exclude_none=True)
517
- if mobile is not None:
518
- scrape_params['mobile'] = mobile
519
- if skip_tls_verification is not None:
520
- scrape_params['skipTlsVerification'] = skip_tls_verification
521
- if remove_base64_images is not None:
522
- scrape_params['removeBase64Images'] = remove_base64_images
523
- if block_ads is not None:
524
- scrape_params['blockAds'] = block_ads
525
- if proxy:
526
- scrape_params['proxy'] = proxy
527
- if extract:
528
- if hasattr(extract.schema, 'schema'):
529
- extract.schema = extract.schema.schema()
530
- scrape_params['extract'] = extract.dict(exclude_none=True)
531
- if json_options:
532
- if hasattr(json_options.schema, 'schema'):
533
- json_options.schema = json_options.schema.schema()
534
- scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
535
- if actions:
536
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
537
- scrape_params.update(kwargs)
538
-
539
- # Make request
540
- response = requests.post(
541
- f'{self.api_url}/v1/scrape',
542
- headers=headers,
543
- json=scrape_params,
544
- timeout=(timeout + 5000 if timeout else None)
545
- )
546
-
547
- if response.status_code == 200:
548
- try:
549
- response_json = response.json()
550
- if response_json.get('success') and 'data' in response_json:
551
- return ScrapeResponse(**response_json['data'])
552
- elif "error" in response_json:
553
- raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
554
- else:
555
- raise Exception(f'Failed to scrape URL. Error: {response_json}')
556
- except ValueError:
557
- raise Exception('Failed to parse Firecrawl response as JSON.')
558
- else:
559
- self._handle_error(response, 'scrape URL')
560
-
561
- def search(
562
- self,
563
- query: str,
564
- *,
565
- limit: Optional[int] = None,
566
- tbs: Optional[str] = None,
567
- filter: Optional[str] = None,
568
- lang: Optional[str] = None,
569
- country: Optional[str] = None,
570
- location: Optional[str] = None,
571
- timeout: Optional[int] = None,
572
- scrape_options: Optional[ScrapeOptions] = None,
573
- **kwargs) -> SearchResponse:
574
- """
575
- Search for content using Firecrawl.
576
-
577
- Args:
578
- query (str): Search query string
579
- limit (Optional[int]): Max results (default: 5)
580
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
581
- filter (Optional[str]): Custom result filter
582
- lang (Optional[str]): Language code (default: "en")
583
- country (Optional[str]): Country code (default: "us")
584
- location (Optional[str]): Geo-targeting
585
- timeout (Optional[int]): Request timeout in milliseconds
586
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
587
- **kwargs: Additional keyword arguments for future compatibility
588
-
589
- Returns:
590
- SearchResponse: Response containing:
591
- * success (bool): Whether request succeeded
592
- * data (List[FirecrawlDocument]): Search results
593
- * warning (Optional[str]): Warning message if any
594
- * error (Optional[str]): Error message if any
595
-
596
- Raises:
597
- Exception: If search fails or response cannot be parsed
598
- """
599
- # Validate any additional kwargs
600
- self._validate_kwargs(kwargs, "search")
601
-
602
- # Build search parameters
603
- search_params = {}
604
-
605
- # Add individual parameters
606
- if limit is not None:
607
- search_params['limit'] = limit
608
- if tbs is not None:
609
- search_params['tbs'] = tbs
610
- if filter is not None:
611
- search_params['filter'] = filter
612
- if lang is not None:
613
- search_params['lang'] = lang
614
- if country is not None:
615
- search_params['country'] = country
616
- if location is not None:
617
- search_params['location'] = location
618
- if timeout is not None:
619
- search_params['timeout'] = timeout
620
- if scrape_options is not None:
621
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
622
-
623
- # Add any additional kwargs
624
- search_params.update(kwargs)
625
-
626
- # Create final params object
627
- final_params = SearchParams(query=query, **search_params)
628
- params_dict = final_params.dict(exclude_none=True)
629
- params_dict['origin'] = f"python-sdk@{version}"
630
-
631
- # Make request
632
- response = requests.post(
633
- f"{self.api_url}/v1/search",
634
- headers={"Authorization": f"Bearer {self.api_key}"},
635
- json=params_dict
636
- )
637
-
638
- if response.status_code == 200:
639
- try:
640
- response_json = response.json()
641
- if response_json.get('success') and 'data' in response_json:
642
- return SearchResponse(**response_json)
643
- elif "error" in response_json:
644
- raise Exception(f'Search failed. Error: {response_json["error"]}')
645
- else:
646
- raise Exception(f'Search failed. Error: {response_json}')
647
- except ValueError:
648
- raise Exception('Failed to parse Firecrawl response as JSON.')
649
- else:
650
- self._handle_error(response, 'search')
651
-
652
- def crawl_url(
653
- self,
654
- url: str,
655
- *,
656
- include_paths: Optional[List[str]] = None,
657
- exclude_paths: Optional[List[str]] = None,
658
- max_depth: Optional[int] = None,
659
- max_discovery_depth: Optional[int] = None,
660
- limit: Optional[int] = None,
661
- allow_backward_links: Optional[bool] = None,
662
- allow_external_links: Optional[bool] = None,
663
- ignore_sitemap: Optional[bool] = None,
664
- scrape_options: Optional[ScrapeOptions] = None,
665
- webhook: Optional[Union[str, WebhookConfig]] = None,
666
- deduplicate_similar_urls: Optional[bool] = None,
667
- ignore_query_parameters: Optional[bool] = None,
668
- regex_on_full_url: Optional[bool] = None,
669
- poll_interval: Optional[int] = 2,
670
- idempotency_key: Optional[str] = None,
671
- **kwargs
672
- ) -> CrawlStatusResponse:
673
- """
674
- Crawl a website starting from a URL.
675
-
676
- Args:
677
- url (str): Target URL to start crawling from
678
- include_paths (Optional[List[str]]): Patterns of URLs to include
679
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
680
- max_depth (Optional[int]): Maximum crawl depth
681
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
682
- limit (Optional[int]): Maximum pages to crawl
683
- allow_backward_links (Optional[bool]): Follow parent directory links
684
- allow_external_links (Optional[bool]): Follow external domain links
685
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
686
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
687
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
688
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
689
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
690
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
691
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
692
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
693
- **kwargs: Additional parameters to pass to the API
694
-
695
- Returns:
696
- CrawlStatusResponse with:
697
- * Crawling status and progress
698
- * Crawled page contents
699
- * Success/error information
700
-
701
- Raises:
702
- Exception: If crawl fails
703
- """
704
- # Validate any additional kwargs
705
- self._validate_kwargs(kwargs, "crawl_url")
706
-
707
- crawl_params = {}
708
-
709
- # Add individual parameters
710
- if include_paths is not None:
711
- crawl_params['includePaths'] = include_paths
712
- if exclude_paths is not None:
713
- crawl_params['excludePaths'] = exclude_paths
714
- if max_depth is not None:
715
- crawl_params['maxDepth'] = max_depth
716
- if max_discovery_depth is not None:
717
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
718
- if limit is not None:
719
- crawl_params['limit'] = limit
720
- if allow_backward_links is not None:
721
- crawl_params['allowBackwardLinks'] = allow_backward_links
722
- if allow_external_links is not None:
723
- crawl_params['allowExternalLinks'] = allow_external_links
724
- if ignore_sitemap is not None:
725
- crawl_params['ignoreSitemap'] = ignore_sitemap
726
- if scrape_options is not None:
727
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
728
- if webhook is not None:
729
- crawl_params['webhook'] = webhook
730
- if deduplicate_similar_urls is not None:
731
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
732
- if ignore_query_parameters is not None:
733
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
734
- if regex_on_full_url is not None:
735
- crawl_params['regexOnFullURL'] = regex_on_full_url
736
-
737
- # Add any additional kwargs
738
- crawl_params.update(kwargs)
739
-
740
- # Create final params object
741
- final_params = CrawlParams(**crawl_params)
742
- params_dict = final_params.dict(exclude_none=True)
743
- params_dict['url'] = url
744
- params_dict['origin'] = f"python-sdk@{version}"
745
-
746
- # Make request
747
- headers = self._prepare_headers(idempotency_key)
748
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
749
-
750
- if response.status_code == 200:
751
- try:
752
- id = response.json().get('id')
753
- except:
754
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
755
- return self._monitor_job_status(id, headers, poll_interval)
756
- else:
757
- self._handle_error(response, 'start crawl job')
758
-
759
- def async_crawl_url(
760
- self,
761
- url: str,
762
- *,
763
- include_paths: Optional[List[str]] = None,
764
- exclude_paths: Optional[List[str]] = None,
765
- max_depth: Optional[int] = None,
766
- max_discovery_depth: Optional[int] = None,
767
- limit: Optional[int] = None,
768
- allow_backward_links: Optional[bool] = None,
769
- allow_external_links: Optional[bool] = None,
770
- ignore_sitemap: Optional[bool] = None,
771
- scrape_options: Optional[ScrapeOptions] = None,
772
- webhook: Optional[Union[str, WebhookConfig]] = None,
773
- deduplicate_similar_urls: Optional[bool] = None,
774
- ignore_query_parameters: Optional[bool] = None,
775
- regex_on_full_url: Optional[bool] = None,
776
- idempotency_key: Optional[str] = None,
777
- **kwargs
778
- ) -> CrawlResponse:
779
- """
780
- Start an asynchronous crawl job.
781
-
782
- Args:
783
- url (str): Target URL to start crawling from
784
- include_paths (Optional[List[str]]): Patterns of URLs to include
785
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
786
- max_depth (Optional[int]): Maximum crawl depth
787
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
788
- limit (Optional[int]): Maximum pages to crawl
789
- allow_backward_links (Optional[bool]): Follow parent directory links
790
- allow_external_links (Optional[bool]): Follow external domain links
791
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
792
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
793
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
794
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
795
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
796
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
797
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
798
- **kwargs: Additional parameters to pass to the API
799
-
800
- Returns:
801
- CrawlResponse with:
802
- * success - Whether crawl started successfully
803
- * id - Unique identifier for the crawl job
804
- * url - Status check URL for the crawl
805
- * error - Error message if start failed
806
-
807
- Raises:
808
- Exception: If crawl initiation fails
809
- """
810
- # Validate any additional kwargs
811
- self._validate_kwargs(kwargs, "async_crawl_url")
812
-
813
- crawl_params = {}
814
-
815
- # Add individual parameters
816
- if include_paths is not None:
817
- crawl_params['includePaths'] = include_paths
818
- if exclude_paths is not None:
819
- crawl_params['excludePaths'] = exclude_paths
820
- if max_depth is not None:
821
- crawl_params['maxDepth'] = max_depth
822
- if max_discovery_depth is not None:
823
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
824
- if limit is not None:
825
- crawl_params['limit'] = limit
826
- if allow_backward_links is not None:
827
- crawl_params['allowBackwardLinks'] = allow_backward_links
828
- if allow_external_links is not None:
829
- crawl_params['allowExternalLinks'] = allow_external_links
830
- if ignore_sitemap is not None:
831
- crawl_params['ignoreSitemap'] = ignore_sitemap
832
- if scrape_options is not None:
833
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
834
- if webhook is not None:
835
- crawl_params['webhook'] = webhook
836
- if deduplicate_similar_urls is not None:
837
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
838
- if ignore_query_parameters is not None:
839
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
840
- if regex_on_full_url is not None:
841
- crawl_params['regexOnFullURL'] = regex_on_full_url
842
-
843
- # Add any additional kwargs
844
- crawl_params.update(kwargs)
845
-
846
- # Create final params object
847
- final_params = CrawlParams(**crawl_params)
848
- params_dict = final_params.dict(exclude_none=True)
849
- params_dict['url'] = url
850
- params_dict['origin'] = f"python-sdk@{version}"
851
-
852
- # Make request
853
- headers = self._prepare_headers(idempotency_key)
854
- response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
855
-
856
- if response.status_code == 200:
857
- try:
858
- return CrawlResponse(**response.json())
859
- except:
860
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
861
- else:
862
- self._handle_error(response, 'start crawl job')
863
-
864
- def check_crawl_status(self, id: str) -> CrawlStatusResponse:
865
- """
866
- Check the status and results of a crawl job.
867
-
868
- Args:
869
- id: Unique identifier for the crawl job
870
-
871
- Returns:
872
- CrawlStatusResponse containing:
873
-
874
- Status Information:
875
- * status - Current state (scraping/completed/failed/cancelled)
876
- * completed - Number of pages crawled
877
- * total - Total pages to crawl
878
- * creditsUsed - API credits consumed
879
- * expiresAt - Data expiration timestamp
880
-
881
- Results:
882
- * data - List of crawled documents
883
- * next - URL for next page of results (if paginated)
884
- * success - Whether status check succeeded
885
- * error - Error message if failed
886
-
887
- Raises:
888
- Exception: If status check fails
889
- """
890
- endpoint = f'/v1/crawl/{id}'
891
-
892
- headers = self._prepare_headers()
893
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
894
- if response.status_code == 200:
895
- try:
896
- status_data = response.json()
897
- except:
898
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
899
- if status_data['status'] == 'completed':
900
- if 'data' in status_data:
901
- data = status_data['data']
902
- while 'next' in status_data:
903
- if len(status_data['data']) == 0:
904
- break
905
- next_url = status_data.get('next')
906
- if not next_url:
907
- logger.warning("Expected 'next' URL is missing.")
908
- break
909
- try:
910
- status_response = self._get_request(next_url, headers)
911
- if status_response.status_code != 200:
912
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
913
- break
914
- try:
915
- next_data = status_response.json()
916
- except:
917
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
918
- data.extend(next_data.get('data', []))
919
- status_data = next_data
920
- except Exception as e:
921
- logger.error(f"Error during pagination request: {e}")
922
- break
923
- status_data['data'] = data
924
-
925
- response = {
926
- 'status': status_data.get('status'),
927
- 'total': status_data.get('total'),
928
- 'completed': status_data.get('completed'),
929
- 'creditsUsed': status_data.get('creditsUsed'),
930
- 'expiresAt': status_data.get('expiresAt'),
931
- 'data': status_data.get('data')
932
- }
933
-
934
- if 'error' in status_data:
935
- response['error'] = status_data['error']
936
-
937
- if 'next' in status_data:
938
- response['next'] = status_data['next']
939
-
940
- return CrawlStatusResponse(
941
- success=False if 'error' in status_data else True,
942
- **response
943
- )
944
- else:
945
- self._handle_error(response, 'check crawl status')
946
-
947
- def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
948
- """
949
- Returns information about crawl errors.
950
-
951
- Args:
952
- id (str): The ID of the crawl job
953
-
954
- Returns:
955
- CrawlErrorsResponse containing:
956
- * errors (List[Dict[str, str]]): List of errors with fields:
957
- - id (str): Error ID
958
- - timestamp (str): When the error occurred
959
- - url (str): URL that caused the error
960
- - error (str): Error message
961
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
962
-
963
- Raises:
964
- Exception: If error check fails
965
- """
966
- headers = self._prepare_headers()
967
- response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
968
- if response.status_code == 200:
969
- try:
970
- return CrawlErrorsResponse(**response.json())
971
- except:
972
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
973
- else:
974
- self._handle_error(response, "check crawl errors")
975
-
976
- def cancel_crawl(self, id: str) -> Dict[str, Any]:
977
- """
978
- Cancel an asynchronous crawl job.
979
-
980
- Args:
981
- id (str): The ID of the crawl job to cancel
982
-
983
- Returns:
984
- Dict[str, Any] containing:
985
- * success (bool): Whether cancellation was successful
986
- * error (str, optional): Error message if cancellation failed
987
-
988
- Raises:
989
- Exception: If cancellation fails
990
- """
991
- headers = self._prepare_headers()
992
- response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
993
- if response.status_code == 200:
994
- try:
995
- return response.json()
996
- except:
997
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
998
- else:
999
- self._handle_error(response, "cancel crawl job")
1000
-
1001
- def crawl_url_and_watch(
1002
- self,
1003
- url: str,
1004
- *,
1005
- include_paths: Optional[List[str]] = None,
1006
- exclude_paths: Optional[List[str]] = None,
1007
- max_depth: Optional[int] = None,
1008
- max_discovery_depth: Optional[int] = None,
1009
- limit: Optional[int] = None,
1010
- allow_backward_links: Optional[bool] = None,
1011
- allow_external_links: Optional[bool] = None,
1012
- ignore_sitemap: Optional[bool] = None,
1013
- scrape_options: Optional[ScrapeOptions] = None,
1014
- webhook: Optional[Union[str, WebhookConfig]] = None,
1015
- deduplicate_similar_urls: Optional[bool] = None,
1016
- ignore_query_parameters: Optional[bool] = None,
1017
- regex_on_full_url: Optional[bool] = None,
1018
- idempotency_key: Optional[str] = None,
1019
- **kwargs
1020
- ) -> 'CrawlWatcher':
1021
- """
1022
- Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
1023
-
1024
- Args:
1025
- url (str): Target URL to start crawling from
1026
- include_paths (Optional[List[str]]): Patterns of URLs to include
1027
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
1028
- max_depth (Optional[int]): Maximum crawl depth
1029
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
1030
- limit (Optional[int]): Maximum pages to crawl
1031
- allow_backward_links (Optional[bool]): Follow parent directory links
1032
- allow_external_links (Optional[bool]): Follow external domain links
1033
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1034
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
1035
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
1036
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
1037
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
1038
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
1039
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1040
- **kwargs: Additional parameters to pass to the API
1041
-
1042
- Returns:
1043
- CrawlWatcher: An instance to monitor the crawl job via WebSocket
1044
-
1045
- Raises:
1046
- Exception: If crawl job fails to start
1047
- """
1048
- crawl_response = self.async_crawl_url(
1049
- url,
1050
- include_paths=include_paths,
1051
- exclude_paths=exclude_paths,
1052
- max_depth=max_depth,
1053
- max_discovery_depth=max_discovery_depth,
1054
- limit=limit,
1055
- allow_backward_links=allow_backward_links,
1056
- allow_external_links=allow_external_links,
1057
- ignore_sitemap=ignore_sitemap,
1058
- scrape_options=scrape_options,
1059
- webhook=webhook,
1060
- deduplicate_similar_urls=deduplicate_similar_urls,
1061
- ignore_query_parameters=ignore_query_parameters,
1062
- regex_on_full_url=regex_on_full_url,
1063
- idempotency_key=idempotency_key,
1064
- **kwargs
1065
- )
1066
- if crawl_response.success and crawl_response.id:
1067
- return CrawlWatcher(crawl_response.id, self)
1068
- else:
1069
- raise Exception("Crawl job failed to start")
1070
-
1071
- def map_url(
1072
- self,
1073
- url: str,
1074
- *,
1075
- search: Optional[str] = None,
1076
- ignore_sitemap: Optional[bool] = None,
1077
- include_subdomains: Optional[bool] = None,
1078
- sitemap_only: Optional[bool] = None,
1079
- limit: Optional[int] = None,
1080
- timeout: Optional[int] = None,
1081
- **kwargs) -> MapResponse:
1082
- """
1083
- Map and discover links from a URL.
1084
-
1085
- Args:
1086
- url (str): Target URL to map
1087
- search (Optional[str]): Filter pattern for URLs
1088
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
1089
- include_subdomains (Optional[bool]): Include subdomain links
1090
- sitemap_only (Optional[bool]): Only use sitemap.xml
1091
- limit (Optional[int]): Maximum URLs to return
1092
- timeout (Optional[int]): Request timeout in milliseconds
1093
- **kwargs: Additional parameters to pass to the API
1094
-
1095
- Returns:
1096
- MapResponse: Response containing:
1097
- * success (bool): Whether request succeeded
1098
- * links (List[str]): Discovered URLs
1099
- * error (Optional[str]): Error message if any
1100
-
1101
- Raises:
1102
- Exception: If mapping fails or response cannot be parsed
1103
- """
1104
- # Validate any additional kwargs
1105
- self._validate_kwargs(kwargs, "map_url")
1106
-
1107
- # Build map parameters
1108
- map_params = {}
1109
-
1110
- # Add individual parameters
1111
- if search is not None:
1112
- map_params['search'] = search
1113
- if ignore_sitemap is not None:
1114
- map_params['ignoreSitemap'] = ignore_sitemap
1115
- if include_subdomains is not None:
1116
- map_params['includeSubdomains'] = include_subdomains
1117
- if sitemap_only is not None:
1118
- map_params['sitemapOnly'] = sitemap_only
1119
- if limit is not None:
1120
- map_params['limit'] = limit
1121
- if timeout is not None:
1122
- map_params['timeout'] = timeout
1123
-
1124
- # Add any additional kwargs
1125
- map_params.update(kwargs)
1126
-
1127
- # Create final params object
1128
- final_params = MapParams(**map_params)
1129
- params_dict = final_params.dict(exclude_none=True)
1130
- params_dict['url'] = url
1131
- params_dict['origin'] = f"python-sdk@{version}"
1132
-
1133
- # Make request
1134
- response = requests.post(
1135
- f"{self.api_url}/v1/map",
1136
- headers={"Authorization": f"Bearer {self.api_key}"},
1137
- json=params_dict
1138
- )
1139
-
1140
- if response.status_code == 200:
1141
- try:
1142
- response_json = response.json()
1143
- if response_json.get('success') and 'links' in response_json:
1144
- return MapResponse(**response_json)
1145
- elif "error" in response_json:
1146
- raise Exception(f'Map failed. Error: {response_json["error"]}')
1147
- else:
1148
- raise Exception(f'Map failed. Error: {response_json}')
1149
- except ValueError:
1150
- raise Exception('Failed to parse Firecrawl response as JSON.')
1151
- else:
1152
- self._handle_error(response, 'map')
1153
-
1154
- def batch_scrape_urls(
1155
- self,
1156
- urls: List[str],
1157
- *,
1158
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1159
- headers: Optional[Dict[str, str]] = None,
1160
- include_tags: Optional[List[str]] = None,
1161
- exclude_tags: Optional[List[str]] = None,
1162
- only_main_content: Optional[bool] = None,
1163
- wait_for: Optional[int] = None,
1164
- timeout: Optional[int] = None,
1165
- location: Optional[LocationConfig] = None,
1166
- mobile: Optional[bool] = None,
1167
- skip_tls_verification: Optional[bool] = None,
1168
- remove_base64_images: Optional[bool] = None,
1169
- block_ads: Optional[bool] = None,
1170
- proxy: Optional[Literal["basic", "stealth"]] = None,
1171
- extract: Optional[JsonConfig] = None,
1172
- json_options: Optional[JsonConfig] = None,
1173
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1174
- agent: Optional[AgentOptions] = None,
1175
- poll_interval: Optional[int] = 2,
1176
- idempotency_key: Optional[str] = None,
1177
- **kwargs
1178
- ) -> BatchScrapeStatusResponse:
1179
- """
1180
- Batch scrape multiple URLs and monitor until completion.
1181
-
1182
- Args:
1183
- urls (List[str]): URLs to scrape
1184
- formats (Optional[List[Literal]]): Content formats to retrieve
1185
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1186
- include_tags (Optional[List[str]]): HTML tags to include
1187
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1188
- only_main_content (Optional[bool]): Extract main content only
1189
- wait_for (Optional[int]): Wait time in milliseconds
1190
- timeout (Optional[int]): Request timeout in milliseconds
1191
- location (Optional[LocationConfig]): Location configuration
1192
- mobile (Optional[bool]): Use mobile user agent
1193
- skip_tls_verification (Optional[bool]): Skip TLS verification
1194
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1195
- block_ads (Optional[bool]): Block advertisements
1196
- proxy (Optional[Literal]): Proxy type to use
1197
- extract (Optional[JsonConfig]): Content extraction config
1198
- json_options (Optional[JsonConfig]): JSON extraction config
1199
- actions (Optional[List[Union]]): Actions to perform
1200
- agent (Optional[AgentOptions]): Agent configuration
1201
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
1202
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1203
- **kwargs: Additional parameters to pass to the API
1204
-
1205
- Returns:
1206
- BatchScrapeStatusResponse with:
1207
- * Scraping status and progress
1208
- * Scraped content for each URL
1209
- * Success/error information
1210
-
1211
- Raises:
1212
- Exception: If batch scrape fails
1213
- """
1214
- # Validate any additional kwargs
1215
- self._validate_kwargs(kwargs, "batch_scrape_urls")
1216
-
1217
- scrape_params = {}
1218
-
1219
- # Add individual parameters
1220
- if formats is not None:
1221
- scrape_params['formats'] = formats
1222
- if headers is not None:
1223
- scrape_params['headers'] = headers
1224
- if include_tags is not None:
1225
- scrape_params['includeTags'] = include_tags
1226
- if exclude_tags is not None:
1227
- scrape_params['excludeTags'] = exclude_tags
1228
- if only_main_content is not None:
1229
- scrape_params['onlyMainContent'] = only_main_content
1230
- if wait_for is not None:
1231
- scrape_params['waitFor'] = wait_for
1232
- if timeout is not None:
1233
- scrape_params['timeout'] = timeout
1234
- if location is not None:
1235
- scrape_params['location'] = location.dict(exclude_none=True)
1236
- if mobile is not None:
1237
- scrape_params['mobile'] = mobile
1238
- if skip_tls_verification is not None:
1239
- scrape_params['skipTlsVerification'] = skip_tls_verification
1240
- if remove_base64_images is not None:
1241
- scrape_params['removeBase64Images'] = remove_base64_images
1242
- if block_ads is not None:
1243
- scrape_params['blockAds'] = block_ads
1244
- if proxy is not None:
1245
- scrape_params['proxy'] = proxy
1246
- if extract is not None:
1247
- if hasattr(extract.schema, 'schema'):
1248
- extract.schema = extract.schema.schema()
1249
- scrape_params['extract'] = extract.dict(exclude_none=True)
1250
- if json_options is not None:
1251
- if hasattr(json_options.schema, 'schema'):
1252
- json_options.schema = json_options.schema.schema()
1253
- scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1254
- if actions is not None:
1255
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1256
- if agent is not None:
1257
- scrape_params['agent'] = agent.dict(exclude_none=True)
1258
-
1259
- # Add any additional kwargs
1260
- scrape_params.update(kwargs)
1261
-
1262
- # Create final params object
1263
- final_params = ScrapeParams(**scrape_params)
1264
- params_dict = final_params.dict(exclude_none=True)
1265
- params_dict['urls'] = urls
1266
- params_dict['origin'] = f"python-sdk@{version}"
1267
-
1268
- # Make request
1269
- headers = self._prepare_headers(idempotency_key)
1270
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1271
-
1272
- if response.status_code == 200:
1273
- try:
1274
- id = response.json().get('id')
1275
- except:
1276
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1277
- return self._monitor_job_status(id, headers, poll_interval)
1278
- else:
1279
- self._handle_error(response, 'start batch scrape job')
1280
-
1281
- def async_batch_scrape_urls(
1282
- self,
1283
- urls: List[str],
1284
- *,
1285
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1286
- headers: Optional[Dict[str, str]] = None,
1287
- include_tags: Optional[List[str]] = None,
1288
- exclude_tags: Optional[List[str]] = None,
1289
- only_main_content: Optional[bool] = None,
1290
- wait_for: Optional[int] = None,
1291
- timeout: Optional[int] = None,
1292
- location: Optional[LocationConfig] = None,
1293
- mobile: Optional[bool] = None,
1294
- skip_tls_verification: Optional[bool] = None,
1295
- remove_base64_images: Optional[bool] = None,
1296
- block_ads: Optional[bool] = None,
1297
- proxy: Optional[Literal["basic", "stealth"]] = None,
1298
- extract: Optional[JsonConfig] = None,
1299
- json_options: Optional[JsonConfig] = None,
1300
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1301
- agent: Optional[AgentOptions] = None,
1302
- idempotency_key: Optional[str] = None,
1303
- **kwargs
1304
- ) -> BatchScrapeResponse:
1305
- """
1306
- Initiate a batch scrape job asynchronously.
1307
-
1308
- Args:
1309
- urls (List[str]): URLs to scrape
1310
- formats (Optional[List[Literal]]): Content formats to retrieve
1311
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1312
- include_tags (Optional[List[str]]): HTML tags to include
1313
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1314
- only_main_content (Optional[bool]): Extract main content only
1315
- wait_for (Optional[int]): Wait time in milliseconds
1316
- timeout (Optional[int]): Request timeout in milliseconds
1317
- location (Optional[LocationConfig]): Location configuration
1318
- mobile (Optional[bool]): Use mobile user agent
1319
- skip_tls_verification (Optional[bool]): Skip TLS verification
1320
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1321
- block_ads (Optional[bool]): Block advertisements
1322
- proxy (Optional[Literal]): Proxy type to use
1323
- extract (Optional[JsonConfig]): Content extraction config
1324
- json_options (Optional[JsonConfig]): JSON extraction config
1325
- actions (Optional[List[Union]]): Actions to perform
1326
- agent (Optional[AgentOptions]): Agent configuration
1327
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1328
- **kwargs: Additional parameters to pass to the API
1329
-
1330
- Returns:
1331
- BatchScrapeResponse with:
1332
- * success - Whether job started successfully
1333
- * id - Unique identifier for the job
1334
- * url - Status check URL
1335
- * error - Error message if start failed
1336
-
1337
- Raises:
1338
- Exception: If job initiation fails
1339
- """
1340
- # Validate any additional kwargs
1341
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
1342
-
1343
- scrape_params = {}
1344
-
1345
- # Add individual parameters
1346
- if formats is not None:
1347
- scrape_params['formats'] = formats
1348
- if headers is not None:
1349
- scrape_params['headers'] = headers
1350
- if include_tags is not None:
1351
- scrape_params['includeTags'] = include_tags
1352
- if exclude_tags is not None:
1353
- scrape_params['excludeTags'] = exclude_tags
1354
- if only_main_content is not None:
1355
- scrape_params['onlyMainContent'] = only_main_content
1356
- if wait_for is not None:
1357
- scrape_params['waitFor'] = wait_for
1358
- if timeout is not None:
1359
- scrape_params['timeout'] = timeout
1360
- if location is not None:
1361
- scrape_params['location'] = location.dict(exclude_none=True)
1362
- if mobile is not None:
1363
- scrape_params['mobile'] = mobile
1364
- if skip_tls_verification is not None:
1365
- scrape_params['skipTlsVerification'] = skip_tls_verification
1366
- if remove_base64_images is not None:
1367
- scrape_params['removeBase64Images'] = remove_base64_images
1368
- if block_ads is not None:
1369
- scrape_params['blockAds'] = block_ads
1370
- if proxy is not None:
1371
- scrape_params['proxy'] = proxy
1372
- if extract is not None:
1373
- if hasattr(extract.schema, 'schema'):
1374
- extract.schema = extract.schema.schema()
1375
- scrape_params['extract'] = extract.dict(exclude_none=True)
1376
- if json_options is not None:
1377
- if hasattr(json_options.schema, 'schema'):
1378
- json_options.schema = json_options.schema.schema()
1379
- scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1380
- if actions is not None:
1381
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1382
- if agent is not None:
1383
- scrape_params['agent'] = agent.dict(exclude_none=True)
1384
-
1385
- # Add any additional kwargs
1386
- scrape_params.update(kwargs)
1387
-
1388
- # Create final params object
1389
- final_params = ScrapeParams(**scrape_params)
1390
- params_dict = final_params.dict(exclude_none=True)
1391
- params_dict['urls'] = urls
1392
- params_dict['origin'] = f"python-sdk@{version}"
1393
-
1394
- # Make request
1395
- headers = self._prepare_headers(idempotency_key)
1396
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1397
-
1398
- if response.status_code == 200:
1399
- try:
1400
- return BatchScrapeResponse(**response.json())
1401
- except:
1402
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1403
- else:
1404
- self._handle_error(response, 'start batch scrape job')
1405
-
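# Usage sketch (illustrative, not from the packaged source): start a batch job
# with async_batch_scrape_urls and poll it with check_batch_scrape_status.
# Assumes a valid API key; the key and URLs are placeholders.
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")
job = app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
while True:
    status = app.check_batch_scrape_status(job.id)
    if status.status in ("completed", "failed"):
        break
    time.sleep(5)  # batch jobs run server-side; poll every few seconds
print(status.status, len(status.data or []))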
1406
- def batch_scrape_urls_and_watch(
1407
- self,
1408
- urls: List[str],
1409
- *,
1410
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
1411
- headers: Optional[Dict[str, str]] = None,
1412
- include_tags: Optional[List[str]] = None,
1413
- exclude_tags: Optional[List[str]] = None,
1414
- only_main_content: Optional[bool] = None,
1415
- wait_for: Optional[int] = None,
1416
- timeout: Optional[int] = None,
1417
- location: Optional[LocationConfig] = None,
1418
- mobile: Optional[bool] = None,
1419
- skip_tls_verification: Optional[bool] = None,
1420
- remove_base64_images: Optional[bool] = None,
1421
- block_ads: Optional[bool] = None,
1422
- proxy: Optional[Literal["basic", "stealth"]] = None,
1423
- extract: Optional[JsonConfig] = None,
1424
- json_options: Optional[JsonConfig] = None,
1425
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1426
- agent: Optional[AgentOptions] = None,
1427
- idempotency_key: Optional[str] = None,
1428
- **kwargs
1429
- ) -> 'CrawlWatcher':
1430
- """
1431
- Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
1432
-
1433
- Args:
1434
- urls (List[str]): URLs to scrape
1435
- formats (Optional[List[Literal]]): Content formats to retrieve
1436
- headers (Optional[Dict[str, str]]): Custom HTTP headers
1437
- include_tags (Optional[List[str]]): HTML tags to include
1438
- exclude_tags (Optional[List[str]]): HTML tags to exclude
1439
- only_main_content (Optional[bool]): Extract main content only
1440
- wait_for (Optional[int]): Wait time in milliseconds
1441
- timeout (Optional[int]): Request timeout in milliseconds
1442
- location (Optional[LocationConfig]): Location configuration
1443
- mobile (Optional[bool]): Use mobile user agent
1444
- skip_tls_verification (Optional[bool]): Skip TLS verification
1445
- remove_base64_images (Optional[bool]): Remove base64 encoded images
1446
- block_ads (Optional[bool]): Block advertisements
1447
- proxy (Optional[Literal]): Proxy type to use
1448
- extract (Optional[JsonConfig]): Content extraction config
1449
- json_options (Optional[JsonConfig]): JSON extraction config
1450
- actions (Optional[List[Union]]): Actions to perform
1451
- agent (Optional[AgentOptions]): Agent configuration
1452
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
1453
- **kwargs: Additional parameters to pass to the API
1454
-
1455
- Returns:
1456
- CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
1457
-
1458
- Raises:
1459
- Exception: If batch scrape job fails to start
1460
- """
1461
- # Validate any additional kwargs
1462
- self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
1463
-
1464
- scrape_params = {}
1465
-
1466
- # Add individual parameters
1467
- if formats is not None:
1468
- scrape_params['formats'] = formats
1469
- if headers is not None:
1470
- scrape_params['headers'] = headers
1471
- if include_tags is not None:
1472
- scrape_params['includeTags'] = include_tags
1473
- if exclude_tags is not None:
1474
- scrape_params['excludeTags'] = exclude_tags
1475
- if only_main_content is not None:
1476
- scrape_params['onlyMainContent'] = only_main_content
1477
- if wait_for is not None:
1478
- scrape_params['waitFor'] = wait_for
1479
- if timeout is not None:
1480
- scrape_params['timeout'] = timeout
1481
- if location is not None:
1482
- scrape_params['location'] = location.dict(exclude_none=True)
1483
- if mobile is not None:
1484
- scrape_params['mobile'] = mobile
1485
- if skip_tls_verification is not None:
1486
- scrape_params['skipTlsVerification'] = skip_tls_verification
1487
- if remove_base64_images is not None:
1488
- scrape_params['removeBase64Images'] = remove_base64_images
1489
- if block_ads is not None:
1490
- scrape_params['blockAds'] = block_ads
1491
- if proxy is not None:
1492
- scrape_params['proxy'] = proxy
1493
- if extract is not None:
1494
- if hasattr(extract.schema, 'schema'):
1495
- extract.schema = extract.schema.schema()
1496
- scrape_params['extract'] = extract.dict(exclude_none=True)
1497
- if json_options is not None:
1498
- if hasattr(json_options.schema, 'schema'):
1499
- json_options.schema = json_options.schema.schema()
1500
- scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
1501
- if actions is not None:
1502
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
1503
- if agent is not None:
1504
- scrape_params['agent'] = agent.dict(exclude_none=True)
1505
-
1506
- # Add any additional kwargs
1507
- scrape_params.update(kwargs)
1508
-
1509
- # Create final params object
1510
- final_params = ScrapeParams(**scrape_params)
1511
- params_dict = final_params.dict(exclude_none=True)
1512
- params_dict['urls'] = urls
1513
- params_dict['origin'] = f"python-sdk@{version}"
1514
-
1515
- # Make request
1516
- headers = self._prepare_headers(idempotency_key)
1517
- response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
1518
-
1519
- if response.status_code == 200:
1520
- try:
1521
- crawl_response = BatchScrapeResponse(**response.json())
1522
- if crawl_response.success and crawl_response.id:
1523
- return CrawlWatcher(crawl_response.id, self)
1524
- else:
1525
- raise Exception("Batch scrape job failed to start")
1526
- except ValueError:  # JSON/validation errors only; don't mask the start failure raised above
1527
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1528
- else:
1529
- self._handle_error(response, 'start batch scrape job')
1530
-
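# Usage sketch (illustrative, not from the packaged source): consume the
# CrawlWatcher returned above. The 'document' and 'done' event names match the
# CrawlWatcher class defined later in this module.
import asyncio
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")
watcher = app.batch_scrape_urls_and_watch(["https://example.com"], formats=["markdown"])
watcher.add_event_listener("document", lambda event: print("got document for job", event["id"]))
watcher.add_event_listener("done", lambda event: print("finished:", len(event["data"]), "documents"))
asyncio.run(watcher.connect())  # blocks until the WebSocket reports done or error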
1531
- def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
1532
- """
1533
- Check the status of a batch scrape job using the Firecrawl API.
1534
-
1535
- Args:
1536
- id (str): The ID of the batch scrape job.
1537
-
1538
- Returns:
1539
- BatchScrapeStatusResponse: The status of the batch scrape job.
1540
-
1541
- Raises:
1542
- Exception: If the status check request fails.
1543
- """
1544
- endpoint = f'/v1/batch/scrape/{id}'
1545
-
1546
- headers = self._prepare_headers()
1547
- response = self._get_request(f'{self.api_url}{endpoint}', headers)
1548
- if response.status_code == 200:
1549
- try:
1550
- status_data = response.json()
1551
- except:
1552
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1553
- if status_data['status'] == 'completed':
1554
- if 'data' in status_data:
1555
- data = status_data['data']
1556
- while 'next' in status_data:
1557
- if len(status_data['data']) == 0:
1558
- break
1559
- next_url = status_data.get('next')
1560
- if not next_url:
1561
- logger.warning("Expected 'next' URL is missing.")
1562
- break
1563
- try:
1564
- status_response = self._get_request(next_url, headers)
1565
- if status_response.status_code != 200:
1566
- logger.error(f"Failed to fetch next page: {status_response.status_code}")
1567
- break
1568
- try:
1569
- next_data = status_response.json()
1570
- except:
1571
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1572
- data.extend(next_data.get('data', []))
1573
- status_data = next_data
1574
- except Exception as e:
1575
- logger.error(f"Error during pagination request: {e}")
1576
- break
1577
- status_data['data'] = data
1578
-
1579
- return BatchScrapeStatusResponse(**{
1580
- 'success': False if 'error' in status_data else True,
1581
- 'status': status_data.get('status'),
1582
- 'total': status_data.get('total'),
1583
- 'completed': status_data.get('completed'),
1584
- 'creditsUsed': status_data.get('creditsUsed'),
1585
- 'expiresAt': status_data.get('expiresAt'),
1586
- 'data': status_data.get('data'),
1587
- 'next': status_data.get('next'),
1588
- 'error': status_data.get('error')
1589
- })
1590
- else:
1591
- self._handle_error(response, 'check batch scrape status')
1592
-
1593
- def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
1594
- """
1595
- Returns information about batch scrape errors.
1596
-
1597
- Args:
1598
- id (str): The ID of the crawl job.
1599
-
1600
- Returns:
1601
- CrawlErrorsResponse: A response containing:
1602
- * errors (List[Dict[str, str]]): List of errors with fields:
1603
- * id (str): Error ID
1604
- * timestamp (str): When the error occurred
1605
- * url (str): URL that caused the error
1606
- * error (str): Error message
1607
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
1608
-
1609
- Raises:
1610
- Exception: If the error check request fails
1611
- """
1612
- headers = self._prepare_headers()
1613
- response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
1614
- if response.status_code == 200:
1615
- try:
1616
- return CrawlErrorsResponse(**response.json())
1617
- except:
1618
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1619
- else:
1620
- self._handle_error(response, "check batch scrape errors")
1621
-
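# Usage sketch (illustrative, not from the packaged source): inspect failures
# after a batch job. Field names follow the CrawlErrorsResponse docstring above;
# the job id is a placeholder.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")
report = app.check_batch_scrape_errors("your-batch-job-id")
for err in report.errors:
    print(err["url"], "->", err["error"])
print("blocked by robots.txt:", report.robotsBlocked)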
1622
- def extract(
1623
- self,
1624
- urls: Optional[List[str]] = None,
1625
- *,
1626
- prompt: Optional[str] = None,
1627
- schema: Optional[Any] = None,
1628
- system_prompt: Optional[str] = None,
1629
- allow_external_links: Optional[bool] = False,
1630
- enable_web_search: Optional[bool] = False,
1631
- show_sources: Optional[bool] = False,
1632
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1633
- """
1634
- Extract structured information from URLs.
1635
-
1636
- Args:
1637
- urls (Optional[List[str]]): URLs to extract from
1638
- prompt (Optional[str]): Custom extraction prompt
1639
- schema (Optional[Any]): JSON schema/Pydantic model
1640
- system_prompt (Optional[str]): System context
1641
- allow_external_links (Optional[bool]): Follow external links
1642
- enable_web_search (Optional[bool]): Enable web search
1643
- show_sources (Optional[bool]): Include source URLs
1644
- agent (Optional[Dict[str, Any]]): Agent configuration
1645
-
1646
- Returns:
1647
- ExtractResponse[Any] with:
1648
- * success (bool): Whether request succeeded
1649
- * data (Optional[Any]): Extracted data matching schema
1650
- * error (Optional[str]): Error message if any
1651
-
1652
- Raises:
1653
- ValueError: If prompt/schema missing or extraction fails
1654
- """
1655
- headers = self._prepare_headers()
1656
-
1657
- if not prompt and not schema:
1658
- raise ValueError("Either prompt or schema is required")
1659
-
1660
- if not urls and not prompt:
1661
- raise ValueError("Either urls or prompt is required")
1662
-
1663
- if schema:
1664
- if hasattr(schema, 'model_json_schema'):
1665
- # Convert Pydantic model to JSON schema
1666
- schema = schema.model_json_schema()
1667
- # Otherwise assume it's already a JSON schema dict
1668
-
1669
- request_data = {
1670
- 'urls': urls or [],
1671
- 'allowExternalLinks': allow_external_links,
1672
- 'enableWebSearch': enable_web_search,
1673
- 'showSources': show_sources,
1674
- 'schema': schema,
1675
- 'origin': f'python-sdk@{get_version()}'
1676
- }
1677
-
1678
- # Only add prompt and systemPrompt if they exist
1679
- if prompt:
1680
- request_data['prompt'] = prompt
1681
- if system_prompt:
1682
- request_data['systemPrompt'] = system_prompt
1683
-
1684
- if agent:
1685
- request_data['agent'] = agent
1686
-
1687
- try:
1688
- # Send the initial extract request
1689
- response = self._post_request(
1690
- f'{self.api_url}/v1/extract',
1691
- request_data,
1692
- headers
1693
- )
1694
- if response.status_code == 200:
1695
- try:
1696
- data = response.json()
1697
- except:
1698
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1699
- if data['success']:
1700
- job_id = data.get('id')
1701
- if not job_id:
1702
- raise Exception('Job ID not returned from extract request.')
1703
-
1704
- # Poll for the extract status
1705
- while True:
1706
- status_response = self._get_request(
1707
- f'{self.api_url}/v1/extract/{job_id}',
1708
- headers
1709
- )
1710
- if status_response.status_code == 200:
1711
- try:
1712
- status_data = status_response.json()
1713
- except:
1714
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1715
- if status_data['status'] == 'completed':
1716
- return ExtractResponse(**status_data)
1717
- elif status_data['status'] in ['failed', 'cancelled']:
1718
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
1719
- else:
1720
- self._handle_error(status_response, "extract-status")
1721
-
1722
- time.sleep(2) # Polling interval
1723
- else:
1724
- raise Exception(f'Failed to extract. Error: {data["error"]}')
1725
- else:
1726
- self._handle_error(response, "extract")
1727
- except Exception as e:
1728
- raise ValueError(str(e), 500)
1729
-
1730
- return ExtractResponse(success=False, error="Internal server error.")
1731
-
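# Usage sketch (illustrative, not from the packaged source): schema-driven
# extraction. A Pydantic model can be passed directly because extract() converts
# it via model_json_schema() (pydantic v2) as shown above.
from pydantic import BaseModel
from firecrawl import FirecrawlApp

class CompanyInfo(BaseModel):
    name: str
    mission: str

app = FirecrawlApp(api_key="fc-YOUR-KEY")
result = app.extract(
    urls=["https://example.com"],
    prompt="Extract the company name and mission statement.",
    schema=CompanyInfo,
)
if result.success:
    print(result.data)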
1732
- def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
1733
- """
1734
- Retrieve the status of an extract job.
1735
-
1736
- Args:
1737
- job_id (str): The ID of the extract job.
1738
-
1739
- Returns:
1740
- ExtractResponse[Any]: The status of the extract job.
1741
-
1742
- Raises:
1743
- ValueError: If there is an error retrieving the status.
1744
- """
1745
- headers = self._prepare_headers()
1746
- try:
1747
- response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
1748
- if response.status_code == 200:
1749
- try:
1750
- return ExtractResponse(**response.json())
1751
- except:
1752
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1753
- else:
1754
- self._handle_error(response, "get extract status")
1755
- except Exception as e:
1756
- raise ValueError(str(e), 500)
1757
-
1758
- def async_extract(
1759
- self,
1760
- urls: Optional[List[str]] = None,
1761
- *,
1762
- prompt: Optional[str] = None,
1763
- schema: Optional[Any] = None,
1764
- system_prompt: Optional[str] = None,
1765
- allow_external_links: Optional[bool] = False,
1766
- enable_web_search: Optional[bool] = False,
1767
- show_sources: Optional[bool] = False,
1768
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
1769
- """
1770
- Initiate an asynchronous extract job.
1771
-
1772
- Args:
1773
- urls (Optional[List[str]]): URLs to extract information from
1774
- prompt (Optional[str]): Custom extraction prompt
1775
- schema (Optional[Any]): JSON schema/Pydantic model
1776
- system_prompt (Optional[str]): System context
1777
- allow_external_links (Optional[bool]): Follow external links
1778
- enable_web_search (Optional[bool]): Enable web search
1779
- show_sources (Optional[bool]): Include source URLs
1780
- agent (Optional[Dict[str, Any]]): Agent configuration
1782
-
1783
- Returns:
1784
- ExtractResponse[Any] with:
1785
- * success (bool): Whether request succeeded
1786
- * data (Optional[Any]): Extracted data matching schema
1787
- * error (Optional[str]): Error message if any
1788
-
1789
- Raises:
1790
- ValueError: If job initiation fails
1791
- """
1792
- headers = self._prepare_headers()
1793
-
1795
- if schema:
1796
- if hasattr(schema, 'model_json_schema'):
1797
- # Convert Pydantic model to JSON schema
1798
- schema = schema.model_json_schema()
1799
- # Otherwise assume it's already a JSON schema dict
1800
-
1801
- request_data = {
1802
- 'urls': urls,
1803
- 'allowExternalLinks': allow_external_links,
1804
- 'enableWebSearch': enable_web_search,
1805
- 'showSources': show_sources,
1806
- 'schema': schema,
1807
- 'origin': f'python-sdk@{version}'
1808
- }
1809
-
1810
- if prompt:
1811
- request_data['prompt'] = prompt
1812
- if system_prompt:
1813
- request_data['systemPrompt'] = system_prompt
1814
- if agent:
1815
- request_data['agent'] = agent
1816
-
1817
- try:
1818
- response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
1819
- if response.status_code == 200:
1820
- try:
1821
- return ExtractResponse(**response.json())
1822
- except:
1823
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
1824
- else:
1825
- self._handle_error(response, "async extract")
1826
- except Exception as e:
1827
- raise ValueError(str(e), 500)
1828
-
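# Usage sketch (illustrative, not from the packaged source): the non-blocking
# variant paired with get_extract_status. Assumes the ExtractResponse model
# exposes the `id` and `status` fields present in the API payload above.
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")
job = app.async_extract(urls=["https://example.com"], prompt="Summarize this page.")
while True:
    status = app.get_extract_status(job.id)
    if status.status in ("completed", "failed", "cancelled"):
        break
    time.sleep(2)
print(status.status, status.data)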
1829
- def generate_llms_text(
1830
- self,
1831
- url: str,
1832
- *,
1833
- max_urls: Optional[int] = None,
1834
- show_full_text: Optional[bool] = None,
1835
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
1836
- """
1837
- Generate LLMs.txt for a given URL and poll until completion.
1838
-
1839
- Args:
1840
- url (str): Target URL to generate LLMs.txt from
1841
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
1842
- show_full_text (Optional[bool]): Include full text in output (default: False)
1843
- experimental_stream (Optional[bool]): Enable experimental streaming
1844
-
1845
- Returns:
1846
- GenerateLLMsTextStatusResponse with:
1847
- * Generated LLMs.txt content
1848
- * Full version if requested
1849
- * Generation status
1850
- * Success/error information
1851
-
1852
- Raises:
1853
- Exception: If generation fails
1854
- """
1855
- params = GenerateLLMsTextParams(
1856
- maxUrls=max_urls,
1857
- showFullText=show_full_text,
1858
- __experimental_stream=experimental_stream
1859
- )
1860
-
1861
- response = self.async_generate_llms_text(
1862
- url,
1863
- max_urls=max_urls,
1864
- show_full_text=show_full_text,
1865
- experimental_stream=experimental_stream
1866
- )
1867
-
1868
- if not response.success or not response.id:
1869
- return GenerateLLMsTextStatusResponse(
1870
- success=False,
1871
- error='Failed to start LLMs.txt generation',
1872
- status='failed',
1873
- expiresAt=''
1874
- )
1875
-
1876
- job_id = response.id
1877
- while True:
1878
- status = self.check_generate_llms_text_status(job_id)
1879
-
1880
- if status.status == 'completed':
1881
- return status
1882
- elif status.status == 'failed':
1883
- return status
1884
- elif status.status != 'processing':
1885
- return GenerateLLMsTextStatusResponse(
1886
- success=False,
1887
- error='LLMs.txt generation job terminated unexpectedly',
1888
- status='failed',
1889
- expiresAt=''
1890
- )
1891
-
1892
- time.sleep(2) # Polling interval
1893
-
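# Usage sketch (illustrative, not from the packaged source): the blocking helper
# above polls until the LLMs.txt job finishes and returns the final status object.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")
result = app.generate_llms_text("https://example.com", max_urls=5, show_full_text=True)
if result.success:
    print(result.status)
    print(result.data)  # llmstxt (and llmsfulltxt when show_full_text=True)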
1894
- def async_generate_llms_text(
1895
- self,
1896
- url: str,
1897
- *,
1898
- max_urls: Optional[int] = None,
1899
- show_full_text: Optional[bool] = None,
1900
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
1901
- """
1902
- Initiate an asynchronous LLMs.txt generation operation.
1903
-
1904
- Args:
1905
- url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
1906
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
1907
- show_full_text (Optional[bool]): Include full text in output (default: False)
1908
- experimental_stream (Optional[bool]): Enable experimental streaming
1909
-
1910
- Returns:
1911
- GenerateLLMsTextResponse: A response containing:
1912
- * success (bool): Whether the generation initiation was successful
1913
- * id (str): The unique identifier for the generation job
1914
- * error (str, optional): Error message if initiation failed
1915
-
1916
- Raises:
1917
- Exception: If the generation job initiation fails.
1918
- """
1919
- params = GenerateLLMsTextParams(
1920
- maxUrls=max_urls,
1921
- showFullText=show_full_text,
1922
- __experimental_stream=experimental_stream
1923
- )
1924
-
1925
- headers = self._prepare_headers()
1926
- json_data = {'url': url, **params.dict(exclude_none=True)}
1927
- json_data['origin'] = f"python-sdk@{version}"
1928
-
1929
- try:
1930
- req = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
1931
- response = req.json()
1932
- print("json_data", json_data)
1933
- print("response", response)
1934
- if response.get('success'):
1935
- try:
1936
- return GenerateLLMsTextResponse(**response)
1937
- except:
1938
- raise Exception('Failed to parse Firecrawl response as JSON.')
1939
- else:
1940
- self._handle_error(response, 'start LLMs.txt generation')
1941
- except Exception as e:
1942
- raise ValueError(str(e))
1943
-
1944
- return GenerateLLMsTextResponse(
1945
- success=False,
1946
- error='Internal server error'
1947
- )
1948
-
1949
- def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
1950
- """
1951
- Check the status of a LLMs.txt generation operation.
1952
-
1953
- Args:
1954
- id (str): The unique identifier of the LLMs.txt generation job to check status for.
1955
-
1956
- Returns:
1957
- GenerateLLMsTextStatusResponse: A response containing:
1958
- * success (bool): Whether the generation was successful
1959
- * status (str): Status of generation ("processing", "completed", "failed")
1960
- * data (Dict[str, str], optional): Generated text with fields:
1961
- * llmstxt (str): Generated LLMs.txt content
1962
- * llmsfulltxt (str, optional): Full version if requested
1963
- * error (str, optional): Error message if generation failed
1964
- * expiresAt (str): When the generated data expires
1965
-
1966
- Raises:
1967
- Exception: If the status check fails.
1968
- """
1969
- headers = self._prepare_headers()
1970
- try:
1971
- response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers)
1972
- if response.status_code == 200:
1973
- try:
1974
- json_data = response.json()
1975
- return GenerateLLMsTextStatusResponse(**json_data)
1976
- except Exception as e:
1977
- raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
1978
- elif response.status_code == 404:
1979
- raise Exception('LLMs.txt generation job not found')
1980
- else:
1981
- self._handle_error(response, 'check LLMs.txt generation status')
1982
- except Exception as e:
1983
- raise ValueError(str(e))
1984
-
1985
- return GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')
1986
-
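# Usage sketch (illustrative, not from the packaged source): the fire-and-check
# variant; the id on the start response is used for later status checks.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")
started = app.async_generate_llms_text("https://example.com", max_urls=5)
if started.success:
    print(app.check_generate_llms_text_status(started.id).status)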
1987
- def _prepare_headers(
1988
- self,
1989
- idempotency_key: Optional[str] = None) -> Dict[str, str]:
1990
- """
1991
- Prepare the headers for API requests.
1992
-
1993
- Args:
1994
- idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
1995
-
1996
- Returns:
1997
- Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
1998
- """
1999
- if idempotency_key:
2000
- return {
2001
- 'Content-Type': 'application/json',
2002
- 'Authorization': f'Bearer {self.api_key}',
2003
- 'x-idempotency-key': idempotency_key
2004
- }
2005
-
2006
- return {
2007
- 'Content-Type': 'application/json',
2008
- 'Authorization': f'Bearer {self.api_key}',
2009
- }
2010
-
2011
- def _post_request(
2012
- self,
2013
- url: str,
2014
- data: Dict[str, Any],
2015
- headers: Dict[str, str],
2016
- retries: int = 3,
2017
- backoff_factor: float = 0.5) -> requests.Response:
2018
- """
2019
- Make a POST request with retries.
2020
-
2021
- Args:
2022
- url (str): The URL to send the POST request to.
2023
- data (Dict[str, Any]): The JSON data to include in the POST request.
2024
- headers (Dict[str, str]): The headers to include in the POST request.
2025
- retries (int): Number of retries for the request.
2026
- backoff_factor (float): Backoff factor for retries.
2027
-
2028
- Returns:
2029
- requests.Response: The response from the POST request.
2030
-
2031
- Raises:
2032
- requests.RequestException: If the request fails after the specified retries.
2033
- """
2034
- for attempt in range(retries):
2035
- response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] + 5000) if "timeout" in data else None))
2036
- if response.status_code == 502:
2037
- time.sleep(backoff_factor * (2 ** attempt))
2038
- else:
2039
- return response
2040
- return response
2041
-
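# Note on the retry behaviour above (it also applies to the GET/DELETE helpers
# below): only HTTP 502 responses are retried, and the sleep between attempts is
# backoff_factor * (2 ** attempt). With the defaults (retries=3,
# backoff_factor=0.5) the waits are 0.5s, 1.0s and 2.0s.
delays = [0.5 * (2 ** attempt) for attempt in range(3)]
assert delays == [0.5, 1.0, 2.0]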
2042
- def _get_request(
2043
- self,
2044
- url: str,
2045
- headers: Dict[str, str],
2046
- retries: int = 3,
2047
- backoff_factor: float = 0.5) -> requests.Response:
2048
- """
2049
- Make a GET request with retries.
2050
-
2051
- Args:
2052
- url (str): The URL to send the GET request to.
2053
- headers (Dict[str, str]): The headers to include in the GET request.
2054
- retries (int): Number of retries for the request.
2055
- backoff_factor (float): Backoff factor for retries.
2056
-
2057
- Returns:
2058
- requests.Response: The response from the GET request.
2059
-
2060
- Raises:
2061
- requests.RequestException: If the request fails after the specified retries.
2062
- """
2063
- for attempt in range(retries):
2064
- response = requests.get(url, headers=headers)
2065
- if response.status_code == 502:
2066
- time.sleep(backoff_factor * (2 ** attempt))
2067
- else:
2068
- return response
2069
- return response
2070
-
2071
- def _delete_request(
2072
- self,
2073
- url: str,
2074
- headers: Dict[str, str],
2075
- retries: int = 3,
2076
- backoff_factor: float = 0.5) -> requests.Response:
2077
- """
2078
- Make a DELETE request with retries.
2079
-
2080
- Args:
2081
- url (str): The URL to send the DELETE request to.
2082
- headers (Dict[str, str]): The headers to include in the DELETE request.
2083
- retries (int): Number of retries for the request.
2084
- backoff_factor (float): Backoff factor for retries.
2085
-
2086
- Returns:
2087
- requests.Response: The response from the DELETE request.
2088
-
2089
- Raises:
2090
- requests.RequestException: If the request fails after the specified retries.
2091
- """
2092
- for attempt in range(retries):
2093
- response = requests.delete(url, headers=headers)
2094
- if response.status_code == 502:
2095
- time.sleep(backoff_factor * (2 ** attempt))
2096
- else:
2097
- return response
2098
- return response
2099
-
2100
- def _monitor_job_status(
2101
- self,
2102
- id: str,
2103
- headers: Dict[str, str],
2104
- poll_interval: int) -> CrawlStatusResponse:
2105
- """
2106
- Monitor the status of a crawl job until completion.
2107
-
2108
- Args:
2109
- id (str): The ID of the crawl job.
2110
- headers (Dict[str, str]): The headers to include in the status check requests.
2111
- poll_interval (int): Seconds between status checks.
2112
-
2113
- Returns:
2114
- CrawlStatusResponse: The crawl results if the job is completed successfully.
2115
-
2116
- Raises:
2117
- Exception: If the job fails or an error occurs during status checks.
2118
- """
2119
- while True:
2120
- api_url = f'{self.api_url}/v1/crawl/{id}'
2121
-
2122
- status_response = self._get_request(api_url, headers)
2123
- if status_response.status_code == 200:
2124
- try:
2125
- status_data = status_response.json()
2126
- except:
2127
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2128
- if status_data['status'] == 'completed':
2129
- if 'data' in status_data:
2130
- data = status_data['data']
2131
- while 'next' in status_data:
2132
- if len(status_data['data']) == 0:
2133
- break
2134
- status_response = self._get_request(status_data['next'], headers)
2135
- try:
2136
- status_data = status_response.json()
2137
- except:
2138
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
2139
- data.extend(status_data.get('data', []))
2140
- status_data['data'] = data
2141
- return CrawlStatusResponse(**status_data)
2142
- else:
2143
- raise Exception('Crawl job completed but no data was returned')
2144
- elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
2145
- poll_interval = max(poll_interval, 2)
2146
- time.sleep(poll_interval) # Wait for the specified interval before checking again
2147
- else:
2148
- raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
2149
- else:
2150
- self._handle_error(status_response, 'check crawl status')
2151
-
2152
- def _handle_error(
2153
- self,
2154
- response: requests.Response,
2155
- action: str) -> None:
2156
- """
2157
- Handle errors from API responses.
2158
-
2159
- Args:
2160
- response (requests.Response): The response object from the API request.
2161
- action (str): Description of the action that was being performed.
2162
-
2163
- Raises:
2164
- Exception: An exception with a message containing the status code and error details from the response.
2165
- """
2166
- try:
2167
- error_message = response.json().get('error', 'No error message provided.')
2168
- error_details = response.json().get('details', 'No additional error details provided.')
2169
- except:
2170
- raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
2171
-
2172
- message = self._get_error_message(response.status_code, action, error_message, error_details)
2173
-
2174
- # Raise an HTTPError with the custom message and attach the response
2175
- raise requests.exceptions.HTTPError(message, response=response)
2176
-
2177
- def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2178
- """
2179
- Generate a standardized error message based on HTTP status code.
2180
-
2181
- Args:
2182
- status_code (int): The HTTP status code from the response
2183
- action (str): Description of the action that was being performed
2184
- error_message (str): The error message from the API response
2185
- error_details (str): Additional error details from the API response
2186
-
2187
- Returns:
2188
- str: A formatted error message
2189
- """
2190
- if status_code == 402:
2191
- return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
2192
- elif status_code == 403:
2193
- message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
2194
- elif status_code == 408:
2195
- return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
2196
- elif status_code == 409:
2197
- return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
2198
- elif status_code == 500:
2199
- return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
2200
- else:
2201
- return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
2202
-
2203
- def deep_research(
2204
- self,
2205
- query: str,
2206
- *,
2207
- max_depth: Optional[int] = None,
2208
- time_limit: Optional[int] = None,
2209
- max_urls: Optional[int] = None,
2210
- analysis_prompt: Optional[str] = None,
2211
- system_prompt: Optional[str] = None,
2212
- __experimental_stream_steps: Optional[bool] = None,
2213
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
2214
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
2215
- """
2216
- Initiates a deep research operation on a given query and polls until completion.
2217
-
2218
- Args:
2219
- query (str): Research query or topic to investigate
2220
- max_depth (Optional[int]): Maximum depth of research exploration
2221
- time_limit (Optional[int]): Time limit in seconds for research
2222
- max_urls (Optional[int]): Maximum number of URLs to process
2223
- analysis_prompt (Optional[str]): Custom prompt for analysis
2224
- system_prompt (Optional[str]): Custom system prompt
2225
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2226
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
2227
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
2228
-
2229
- Returns:
2230
- DeepResearchStatusResponse containing:
2231
- * success (bool): Whether research completed successfully
2232
- * status (str): Current state (processing/completed/failed)
2233
- * error (Optional[str]): Error message if failed
2234
- * id (str): Unique identifier for the research job
2235
- * data (Any): Research findings and analysis
2236
- * sources (List[Dict]): List of discovered sources
2237
- * activities (List[Dict]): Research progress log
2238
- * summaries (List[str]): Generated research summaries
2239
-
2240
- Raises:
2241
- Exception: If research fails
2242
- """
2243
- research_params = {}
2244
- if max_depth is not None:
2245
- research_params['maxDepth'] = max_depth
2246
- if time_limit is not None:
2247
- research_params['timeLimit'] = time_limit
2248
- if max_urls is not None:
2249
- research_params['maxUrls'] = max_urls
2250
- if analysis_prompt is not None:
2251
- research_params['analysisPrompt'] = analysis_prompt
2252
- if system_prompt is not None:
2253
- research_params['systemPrompt'] = system_prompt
2254
- if __experimental_stream_steps is not None:
2255
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2256
- research_params = DeepResearchParams(**research_params)
2257
-
2258
- response = self.async_deep_research(
2259
- query,
2260
- max_depth=max_depth,
2261
- time_limit=time_limit,
2262
- max_urls=max_urls,
2263
- analysis_prompt=analysis_prompt,
2264
- system_prompt=system_prompt
2265
- )
2266
- if not response.get('success') or 'id' not in response:
2267
- return response
2268
-
2269
- job_id = response['id']
2270
- last_activity_count = 0
2271
- last_source_count = 0
2272
-
2273
- while True:
2274
- status = self.check_deep_research_status(job_id)
2275
-
2276
- if on_activity and 'activities' in status:
2277
- new_activities = status['activities'][last_activity_count:]
2278
- for activity in new_activities:
2279
- on_activity(activity)
2280
- last_activity_count = len(status['activities'])
2281
-
2282
- if on_source and 'sources' in status:
2283
- new_sources = status['sources'][last_source_count:]
2284
- for source in new_sources:
2285
- on_source(source)
2286
- last_source_count = len(status['sources'])
2287
-
2288
- if status['status'] == 'completed':
2289
- return status
2290
- elif status['status'] == 'failed':
2291
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
2292
- elif status['status'] != 'processing':
2293
- break
2294
-
2295
- time.sleep(2) # Polling interval
2296
-
2297
- return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
2298
-
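# Usage sketch (illustrative, not from the packaged source): deep_research with
# progress callbacks. The polling loop above works with plain dicts, so the
# result is read the same way here.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")
result = app.deep_research(
    "How do open-source web crawlers handle JavaScript-heavy sites?",
    max_depth=2,
    time_limit=120,
    on_activity=lambda activity: print("[activity]", activity.get("message")),
    on_source=lambda source: print("[source]", source.get("url")),
)
if result.get("success"):
    print(result.get("data"))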
2299
- def async_deep_research(
2300
- self,
2301
- query: str,
2302
- *,
2303
- max_depth: Optional[int] = None,
2304
- time_limit: Optional[int] = None,
2305
- max_urls: Optional[int] = None,
2306
- analysis_prompt: Optional[str] = None,
2307
- system_prompt: Optional[str] = None,
2308
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
2309
- """
2310
- Initiates an asynchronous deep research operation.
2311
-
2312
- Args:
2313
- query (str): Research query or topic to investigate
2314
- max_depth (Optional[int]): Maximum depth of research exploration
2315
- time_limit (Optional[int]): Time limit in seconds for research
2316
- max_urls (Optional[int]): Maximum number of URLs to process
2317
- analysis_prompt (Optional[str]): Custom prompt for analysis
2318
- system_prompt (Optional[str]): Custom system prompt
2319
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
2320
-
2321
- Returns:
2322
- Dict[str, Any]: A response containing:
2323
- * success (bool): Whether the research initiation was successful
2324
- * id (str): The unique identifier for the research job
2325
- * error (str, optional): Error message if initiation failed
2326
-
2327
- Raises:
2328
- Exception: If the research initiation fails.
2329
- """
2330
- research_params = {}
2331
- if max_depth is not None:
2332
- research_params['maxDepth'] = max_depth
2333
- if time_limit is not None:
2334
- research_params['timeLimit'] = time_limit
2335
- if max_urls is not None:
2336
- research_params['maxUrls'] = max_urls
2337
- if analysis_prompt is not None:
2338
- research_params['analysisPrompt'] = analysis_prompt
2339
- if system_prompt is not None:
2340
- research_params['systemPrompt'] = system_prompt
2341
- if __experimental_stream_steps is not None:
2342
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
2343
- research_params = DeepResearchParams(**research_params)
2344
-
2345
- headers = self._prepare_headers()
2346
-
2347
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
2348
- json_data['origin'] = f"python-sdk@{version}"
2349
-
2350
- # Handle json options schema if present
2351
- if 'jsonOptions' in json_data:
2352
- json_opts = json_data['jsonOptions']
2353
- if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
2354
- json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
2355
-
2356
- try:
2357
- response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
2358
- if response.status_code == 200:
2359
- try:
2360
- return response.json()
2361
- except:
2362
- raise Exception('Failed to parse Firecrawl response as JSON.')
2363
- else:
2364
- self._handle_error(response, 'start deep research')
2365
- except Exception as e:
2366
- raise ValueError(str(e))
2367
-
2368
- return {'success': False, 'error': 'Internal server error'}
2369
-
2370
- def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
2371
- """
2372
- Check the status of a deep research operation.
2373
-
2374
- Args:
2375
- id (str): The ID of the deep research operation.
2376
-
2377
- Returns:
2378
- DeepResearchStatusResponse containing:
2379
-
2380
- Status:
2381
- * success - Whether research completed successfully
2382
- * status - Current state (processing/completed/failed)
2383
- * error - Error message if failed
2384
-
2385
- Results:
2386
- * id - Unique identifier for the research job
2387
- * data - Research findings and analysis
2388
- * sources - List of discovered sources
2389
- * activities - Research progress log
2390
- * summaries - Generated research summaries
2391
-
2392
- Raises:
2393
- Exception: If the status check fails.
2394
- """
2395
- headers = self._prepare_headers()
2396
- try:
2397
- response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
2398
- if response.status_code == 200:
2399
- try:
2400
- return response.json()
2401
- except:
2402
- raise Exception('Failed to parse Firecrawl response as JSON.')
2403
- elif response.status_code == 404:
2404
- raise Exception('Deep research job not found')
2405
- else:
2406
- self._handle_error(response, 'check deep research status')
2407
- except Exception as e:
2408
- raise ValueError(str(e))
2409
-
2410
- return {'success': False, 'error': 'Internal server error'}
2411
-
2412
- def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
2413
- """
2414
- Validate additional keyword arguments before they are passed to the API.
2415
- This provides early validation before the Pydantic model validation.
2416
-
2417
- Args:
2418
- kwargs (Dict[str, Any]): Additional keyword arguments to validate
2419
- method_name (str): Name of the method these kwargs are for
2420
-
2421
- Raises:
2422
- ValueError: If kwargs contain invalid or unsupported parameters
2423
- """
2424
- if not kwargs:
2425
- return
2426
-
2427
- # Known parameter mappings for each method
2428
- method_params = {
2429
- "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
2430
- "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
2431
- "block_ads", "proxy", "extract", "json_options", "actions"},
2432
- "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
2433
- "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
2434
- "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
2435
- "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url"},
2436
- "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout"},
2437
- "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2438
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2439
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2440
- "actions", "agent"},
2441
- "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2442
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2443
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2444
- "actions", "agent"},
2445
- "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
2446
- "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
2447
- "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
2448
- "actions", "agent"}
2449
- }
2450
-
2451
- # Get allowed parameters for this method
2452
- allowed_params = method_params.get(method_name, set())
2453
-
2454
- # Check for unknown parameters
2455
- unknown_params = set(kwargs.keys()) - allowed_params
2456
- if unknown_params:
2457
- raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
2458
-
2459
- # Additional type validation can be added here if needed
2460
- # For now, we rely on Pydantic models for detailed type validation
2461
-
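# Behaviour sketch (illustrative, not from the packaged source): unsupported
# keyword arguments are rejected before any HTTP request is made.
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-KEY")
try:
    app._validate_kwargs({"not_a_real_option": True}, "scrape_url")
except ValueError as exc:
    print(exc)  # "Unsupported parameter(s) for scrape_url: not_a_real_option. ..."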
2462
- class CrawlWatcher:
2463
- """
2464
- A class to watch and handle crawl job events via WebSocket connection.
2465
-
2466
- Attributes:
2467
- id (str): The ID of the crawl job to watch
2468
- app (FirecrawlApp): The FirecrawlApp instance
2469
- data (List[Dict[str, Any]]): List of crawled documents/data
2470
- status (str): Current status of the crawl job
2471
- ws_url (str): WebSocket URL for the crawl job
2472
- event_handlers (dict): Dictionary of event type to list of handler functions
2473
- """
2474
- def __init__(self, id: str, app: FirecrawlApp):
2475
- self.id = id
2476
- self.app = app
2477
- self.data: List[Dict[str, Any]] = []
2478
- self.status = "scraping"
2479
- self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
2480
- self.event_handlers = {
2481
- 'done': [],
2482
- 'error': [],
2483
- 'document': []
2484
- }
2485
-
2486
- async def connect(self) -> None:
2487
- """
2488
- Establishes WebSocket connection and starts listening for messages.
2489
- """
2490
- async with websockets.connect(
2491
- self.ws_url,
2492
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
2493
- ) as websocket:
2494
- await self._listen(websocket)
2495
-
2496
- async def _listen(self, websocket) -> None:
2497
- """
2498
- Listens for incoming WebSocket messages and handles them.
2499
-
2500
- Args:
2501
- websocket: The WebSocket connection object
2502
- """
2503
- async for message in websocket:
2504
- msg = json.loads(message)
2505
- await self._handle_message(msg)
2506
-
2507
- def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
2508
- """
2509
- Adds an event handler function for a specific event type.
2510
-
2511
- Args:
2512
- event_type (str): Type of event to listen for ('done', 'error', or 'document')
2513
- handler (Callable): Function to handle the event
2514
- """
2515
- if event_type in self.event_handlers:
2516
- self.event_handlers[event_type].append(handler)
2517
-
2518
- def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
2519
- """
2520
- Dispatches an event to all registered handlers for that event type.
2521
-
2522
- Args:
2523
- event_type (str): Type of event to dispatch
2524
- detail (Dict[str, Any]): Event details/data to pass to handlers
2525
- """
2526
- if event_type in self.event_handlers:
2527
- for handler in self.event_handlers[event_type]:
2528
- handler(detail)
2529
-
2530
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
2531
- """
2532
- Handles incoming WebSocket messages based on their type.
2533
-
2534
- Args:
2535
- msg (Dict[str, Any]): The message to handle
2536
- """
2537
- if msg['type'] == 'done':
2538
- self.status = 'completed'
2539
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
2540
- elif msg['type'] == 'error':
2541
- self.status = 'failed'
2542
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
2543
- elif msg['type'] == 'catchup':
2544
- self.status = msg['data']['status']
2545
- self.data.extend(msg['data'].get('data', []))
2546
- for doc in self.data:
2547
- self.dispatch_event('document', {'data': doc, 'id': self.id})
2548
- elif msg['type'] == 'document':
2549
- self.data.append(msg['data'])
2550
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
2551
-
2552
- class AsyncFirecrawlApp(FirecrawlApp):
2553
- """
2554
- Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
2555
- Provides non-blocking alternatives to all FirecrawlApp operations.
2556
- """
2557
-
2558
- async def _async_request(
2559
- self,
2560
- method: str,
2561
- url: str,
2562
- headers: Dict[str, str],
2563
- data: Optional[Dict[str, Any]] = None,
2564
- retries: int = 3,
2565
- backoff_factor: float = 0.5) -> Dict[str, Any]:
2566
- """
2567
- Generic async request method with exponential backoff retry logic.
2568
-
2569
- Args:
2570
- method (str): The HTTP method to use (e.g., "GET" or "POST").
2571
- url (str): The URL to send the request to.
2572
- headers (Dict[str, str]): Headers to include in the request.
2573
- data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
2574
- retries (int): Maximum number of retry attempts (default: 3).
2575
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2576
- Delay will be backoff_factor * (2 ** retry_count).
2577
-
2578
- Returns:
2579
- Dict[str, Any]: The parsed JSON response from the server.
2580
-
2581
- Raises:
2582
- aiohttp.ClientError: If the request fails after all retries.
2583
- Exception: If max retries are exceeded or other errors occur.
2584
- """
2585
- async with aiohttp.ClientSession() as session:
2586
- for attempt in range(retries):
2587
- try:
2588
- async with session.request(
2589
- method=method, url=url, headers=headers, json=data
2590
- ) as response:
2591
- if response.status == 502:
2592
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2593
- continue
2594
- if response.status >= 300:
2595
- await self._handle_error(response, f"make {method} request")
2596
- return await response.json()
2597
- except aiohttp.ClientError as e:
2598
- if attempt == retries - 1:
2599
- raise e
2600
- await asyncio.sleep(backoff_factor * (2 ** attempt))
2601
- raise Exception("Max retries exceeded")
2602
-
2603
- async def _async_post_request(
2604
- self, url: str, data: Dict[str, Any], headers: Dict[str, str],
2605
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2606
- """
2607
- Make an async POST request with exponential backoff retry logic.
2608
-
2609
- Args:
2610
- url (str): The URL to send the POST request to.
2611
- data (Dict[str, Any]): The JSON data to include in the request body.
2612
- headers (Dict[str, str]): Headers to include in the request.
2613
- retries (int): Maximum number of retry attempts (default: 3).
2614
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2615
- Delay will be backoff_factor * (2 ** retry_count).
2616
-
2617
- Returns:
2618
- Dict[str, Any]: The parsed JSON response from the server.
2619
-
2620
- Raises:
2621
- aiohttp.ClientError: If the request fails after all retries.
2622
- Exception: If max retries are exceeded or other errors occur.
2623
- """
2624
- return await self._async_request("POST", url, headers, data, retries, backoff_factor)
2625
-
2626
- async def _async_get_request(
2627
- self, url: str, headers: Dict[str, str],
2628
- retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
2629
- """
2630
- Make an async GET request with exponential backoff retry logic.
2631
-
2632
- Args:
2633
- url (str): The URL to send the GET request to.
2634
- headers (Dict[str, str]): Headers to include in the request.
2635
- retries (int): Maximum number of retry attempts (default: 3).
2636
- backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
2637
- Delay will be backoff_factor * (2 ** retry_count).
2638
-
2639
- Returns:
2640
- Dict[str, Any]: The parsed JSON response from the server.
2641
-
2642
- Raises:
2643
- aiohttp.ClientError: If the request fails after all retries.
2644
- Exception: If max retries are exceeded or other errors occur.
2645
- """
2646
- return await self._async_request("GET", url, headers, None, retries, backoff_factor)
2647
-
2648
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
2649
- """
2650
- Handle errors from async API responses with detailed error messages.
2651
-
2652
- Args:
2653
- response (aiohttp.ClientResponse): The response object from the failed request
2654
- action (str): Description of the action that was being attempted
2655
-
2656
- Raises:
2657
- aiohttp.ClientError: With a detailed error message based on the response status:
2658
- - 402: Payment Required
2659
- - 408: Request Timeout
2660
- - 409: Conflict
2661
- - 500: Internal Server Error
2662
- - Other: Unexpected error with status code
2663
- """
2664
- try:
2665
- error_data = await response.json()
2666
- error_message = error_data.get('error', 'No error message provided.')
2667
- error_details = error_data.get('details', 'No additional error details provided.')
2668
- except:
2669
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
2670
-
2671
- message = await self._get_async_error_message(response.status, action, error_message, error_details)
2672
-
2673
- raise aiohttp.ClientError(message)
2674
-
2675
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
2676
- """
2677
- Generate a standardized error message based on HTTP status code for async operations.
2678
-
2679
- Args:
2680
- status_code (int): The HTTP status code from the response
2681
- action (str): Description of the action that was being performed
2682
- error_message (str): The error message from the API response
2683
- error_details (str): Additional error details from the API response
2684
-
2685
- Returns:
2686
- str: A formatted error message
2687
- """
2688
- return self._get_error_message(status_code, action, error_message, error_details)
2689
-
2690
- async def crawl_url_and_watch(
2691
- self,
2692
- url: str,
2693
- params: Optional[CrawlParams] = None,
2694
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2695
- """
2696
- Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
2697
-
2698
- Args:
2699
- url (str): Target URL to start crawling from
2700
- params (Optional[CrawlParams]): See CrawlParams model for configuration:
2701
- URL Discovery:
2702
- * includePaths - Patterns of URLs to include
2703
- * excludePaths - Patterns of URLs to exclude
2704
- * maxDepth - Maximum crawl depth
2705
- * maxDiscoveryDepth - Maximum depth for finding new URLs
2706
- * limit - Maximum pages to crawl
2707
-
2708
- Link Following:
2709
- * allowBackwardLinks - Follow parent directory links
2710
- * allowExternalLinks - Follow external domain links
2711
- * ignoreSitemap - Skip sitemap.xml processing
2712
-
2713
- Advanced:
2714
- * scrapeOptions - Page scraping configuration
2715
- * webhook - Notification webhook settings
2716
- * deduplicateSimilarURLs - Remove similar URLs
2717
- * ignoreQueryParameters - Ignore URL parameters
2718
- * regexOnFullURL - Apply regex to full URLs
2719
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2720
-
2721
- Returns:
2722
- AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
2723
-
2724
- Raises:
2725
- Exception: If crawl job fails to start
2726
- """
2727
- crawl_response = await self.async_crawl_url(url, params, idempotency_key)
2728
- if crawl_response.get('success') and 'id' in crawl_response:
2729
- return AsyncCrawlWatcher(crawl_response['id'], self)
2730
- else:
2731
- raise Exception("Crawl job failed to start")
2732
-
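# Async usage sketch (illustrative, not from the packaged source). Assumes
# AsyncFirecrawlApp is exported at package level and that AsyncCrawlWatcher
# exposes the same add_event_listener/connect interface as CrawlWatcher.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")
    watcher = await app.crawl_url_and_watch("https://example.com")
    watcher.add_event_listener("done", lambda event: print("crawl finished:", event["status"]))
    await watcher.connect()

asyncio.run(main())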
2733
- async def batch_scrape_urls_and_watch(
2734
- self,
2735
- urls: List[str],
2736
- params: Optional[ScrapeParams] = None,
2737
- idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
2738
- """
2739
- Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
2740
-
2741
- Args:
2742
- urls (List[str]): List of URLs to scrape
2743
- params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
2744
-
2745
- Content Options:
2746
- * formats - Content formats to retrieve
2747
- * includeTags - HTML tags to include
2748
- * excludeTags - HTML tags to exclude
2749
- * onlyMainContent - Extract main content only
2750
-
2751
- Request Options:
2752
- * headers - Custom HTTP headers
2753
- * timeout - Request timeout (ms)
2754
- * mobile - Use mobile user agent
2755
- * proxy - Proxy type
2756
-
2757
- Extraction Options:
2758
- * extract - Content extraction config
2759
- * jsonOptions - JSON extraction config
2760
- * actions - Actions to perform
2761
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2762
-
2763
- Returns:
2764
- AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
2765
-
2766
- Raises:
2767
- Exception: If batch scrape job fails to start
2768
- """
2769
- batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
2770
- if batch_response.get('success') and 'id' in batch_response:
2771
- return AsyncCrawlWatcher(batch_response['id'], self)
2772
- else:
2773
- raise Exception("Batch scrape job failed to start")
2774
-
2775
- async def scrape_url(
2776
- self,
2777
- url: str,
2778
- *,
2779
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
2780
- include_tags: Optional[List[str]] = None,
2781
- exclude_tags: Optional[List[str]] = None,
2782
- only_main_content: Optional[bool] = None,
2783
- wait_for: Optional[int] = None,
2784
- timeout: Optional[int] = None,
2785
- location: Optional[LocationConfig] = None,
2786
- mobile: Optional[bool] = None,
2787
- skip_tls_verification: Optional[bool] = None,
2788
- remove_base64_images: Optional[bool] = None,
2789
- block_ads: Optional[bool] = None,
2790
- proxy: Optional[Literal["basic", "stealth"]] = None,
2791
- extract: Optional[JsonConfig] = None,
2792
- json_options: Optional[JsonConfig] = None,
2793
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2794
- **kwargs) -> ScrapeResponse[Any]:
2795
- """
2796
- Scrape a single URL asynchronously.
2797
-
2798
- Args:
2799
- url (str): Target URL to scrape
2800
- formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
2801
- include_tags (Optional[List[str]]): HTML tags to include
2802
- exclude_tags (Optional[List[str]]): HTML tags to exclude
2803
- only_main_content (Optional[bool]): Extract main content only
2804
- wait_for (Optional[int]): Wait time in milliseconds before scraping
2805
- timeout (Optional[int]): Request timeout (ms)
2806
- location (Optional[LocationConfig]): Location configuration
2807
- mobile (Optional[bool]): Use mobile user agent
2808
- skip_tls_verification (Optional[bool]): Skip TLS verification
2809
- remove_base64_images (Optional[bool]): Remove base64 images
2810
- block_ads (Optional[bool]): Block ads
2811
- proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
2812
- extract (Optional[JsonConfig]): Content extraction settings
2813
- json_options (Optional[JsonConfig]): JSON extraction settings
2814
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2815
- **kwargs: Additional parameters to pass to the API
2816
-
2817
- Returns:
2818
- ScrapeResponse with:
2819
- * success - Whether scrape was successful
2820
- * markdown - Markdown content if requested
2821
- * html - HTML content if requested
2822
- * rawHtml - Raw HTML content if requested
2823
- * links - Extracted links if requested
2824
- * screenshot - Screenshot if requested
2825
- * extract - Extracted data if requested
2826
- * json - JSON data if requested
2827
- * error - Error message if scrape failed
2828
-
2829
- Raises:
2830
- Exception: If scraping fails
2831
- """
2832
- # Validate any additional kwargs
2833
- self._validate_kwargs(kwargs, "scrape_url")
2834
-
2835
- headers = self._prepare_headers()
2836
-
2837
- # Build scrape parameters
2838
- scrape_params = {
2839
- 'url': url,
2840
- 'origin': f"python-sdk@{version}"
2841
- }
2842
-
2843
- # Add optional parameters if provided and not None
2844
- if formats:
2845
- scrape_params['formats'] = formats
2846
- if include_tags:
2847
- scrape_params['includeTags'] = include_tags
2848
- if exclude_tags:
2849
- scrape_params['excludeTags'] = exclude_tags
2850
- if only_main_content is not None:
2851
- scrape_params['onlyMainContent'] = only_main_content
2852
- if wait_for:
2853
- scrape_params['waitFor'] = wait_for
2854
- if timeout:
2855
- scrape_params['timeout'] = timeout
2856
- if location:
2857
- scrape_params['location'] = location.dict(exclude_none=True)
2858
- if mobile is not None:
2859
- scrape_params['mobile'] = mobile
2860
- if skip_tls_verification is not None:
2861
- scrape_params['skipTlsVerification'] = skip_tls_verification
2862
- if remove_base64_images is not None:
2863
- scrape_params['removeBase64Images'] = remove_base64_images
2864
- if block_ads is not None:
2865
- scrape_params['blockAds'] = block_ads
2866
- if proxy:
2867
- scrape_params['proxy'] = proxy
2868
- if extract:
2869
- extract_dict = extract.dict(exclude_none=True)
2870
- if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
2871
- extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted
2872
- scrape_params['extract'] = extract_dict
2873
- if json_options:
2874
- json_options_dict = json_options.dict(exclude_none=True)
2875
- if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
2876
- json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted
2877
- scrape_params['jsonOptions'] = json_options_dict
2878
- if actions:
2879
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
2880
-
2881
- # Make async request
2882
- endpoint = f'/v1/scrape'
2883
- response = await self._async_post_request(
2884
- f'{self.api_url}{endpoint}',
2885
- scrape_params,
2886
- headers
2887
- )
2888
-
2889
- if response.get('success') and 'data' in response:
2890
- return ScrapeResponse(**response['data'])
2891
- elif "error" in response:
2892
- raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
2893
- else:
2894
- # Use the response content directly if possible, otherwise a generic message
2895
- error_content = response.get('error', str(response))
2896
- raise Exception(f'Failed to scrape URL. Error: {error_content}')
2897
-
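# Usage sketch (illustrative, not part of the original file): scraping a single page with
# the async client. Assumes AsyncFirecrawlApp is importable from `firecrawl` and accepts
# an `api_key` argument like the sync FirecrawlApp; the key and URL are placeholders.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _scrape_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    doc = await app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,  # milliseconds, matching the docstring above
    )
    print(doc.markdown)

# asyncio.run(_scrape_example())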
2898
- async def batch_scrape_urls(
2899
- self,
2900
- urls: List[str],
2901
- *,
2902
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
2903
- headers: Optional[Dict[str, str]] = None,
2904
- include_tags: Optional[List[str]] = None,
2905
- exclude_tags: Optional[List[str]] = None,
2906
- only_main_content: Optional[bool] = None,
2907
- wait_for: Optional[int] = None,
2908
- timeout: Optional[int] = None,
2909
- location: Optional[LocationConfig] = None,
2910
- mobile: Optional[bool] = None,
2911
- skip_tls_verification: Optional[bool] = None,
2912
- remove_base64_images: Optional[bool] = None,
2913
- block_ads: Optional[bool] = None,
2914
- proxy: Optional[Literal["basic", "stealth"]] = None,
2915
- extract: Optional[JsonConfig] = None,
2916
- json_options: Optional[JsonConfig] = None,
2917
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2918
- agent: Optional[AgentOptions] = None,
2919
- poll_interval: Optional[int] = 2,
2920
- idempotency_key: Optional[str] = None,
2921
- **kwargs
2922
- ) -> BatchScrapeStatusResponse:
2923
- """
2924
- Asynchronously scrape multiple URLs and monitor until completion.
2925
-
2926
- Args:
2927
- urls (List[str]): URLs to scrape
2928
- formats (Optional[List[Literal]]): Content formats to retrieve
2929
- headers (Optional[Dict[str, str]]): Custom HTTP headers
2930
- include_tags (Optional[List[str]]): HTML tags to include
2931
- exclude_tags (Optional[List[str]]): HTML tags to exclude
2932
- only_main_content (Optional[bool]): Extract main content only
2933
- wait_for (Optional[int]): Wait time in milliseconds
2934
- timeout (Optional[int]): Request timeout in milliseconds
2935
- location (Optional[LocationConfig]): Location configuration
2936
- mobile (Optional[bool]): Use mobile user agent
2937
- skip_tls_verification (Optional[bool]): Skip TLS verification
2938
- remove_base64_images (Optional[bool]): Remove base64 encoded images
2939
- block_ads (Optional[bool]): Block advertisements
2940
- proxy (Optional[Literal]): Proxy type to use
2941
- extract (Optional[JsonConfig]): Content extraction config
2942
- json_options (Optional[JsonConfig]): JSON extraction config
2943
- actions (Optional[List[Union]]): Actions to perform
2944
- agent (Optional[AgentOptions]): Agent configuration
2945
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
2946
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
2947
- **kwargs: Additional parameters to pass to the API
2948
-
2949
- Returns:
2950
- BatchScrapeStatusResponse with:
2951
- * Scraping status and progress
2952
- * Scraped content for each URL
2953
- * Success/error information
2954
-
2955
- Raises:
2956
- Exception: If batch scrape fails
2957
- """
2958
- # Validate any additional kwargs
2959
- self._validate_kwargs(kwargs, "batch_scrape_urls")
2960
-
2961
- scrape_params = {}
2962
-
2963
- # Add individual parameters
2964
- if formats is not None:
2965
- scrape_params['formats'] = formats
2966
- if headers is not None:
2967
- scrape_params['headers'] = headers
2968
- if include_tags is not None:
2969
- scrape_params['includeTags'] = include_tags
2970
- if exclude_tags is not None:
2971
- scrape_params['excludeTags'] = exclude_tags
2972
- if only_main_content is not None:
2973
- scrape_params['onlyMainContent'] = only_main_content
2974
- if wait_for is not None:
2975
- scrape_params['waitFor'] = wait_for
2976
- if timeout is not None:
2977
- scrape_params['timeout'] = timeout
2978
- if location is not None:
2979
- scrape_params['location'] = location.dict(exclude_none=True)
2980
- if mobile is not None:
2981
- scrape_params['mobile'] = mobile
2982
- if skip_tls_verification is not None:
2983
- scrape_params['skipTlsVerification'] = skip_tls_verification
2984
- if remove_base64_images is not None:
2985
- scrape_params['removeBase64Images'] = remove_base64_images
2986
- if block_ads is not None:
2987
- scrape_params['blockAds'] = block_ads
2988
- if proxy is not None:
2989
- scrape_params['proxy'] = proxy
2990
- if extract is not None:
2991
- if hasattr(extract.schema, 'schema'):
2992
- extract.schema = extract.schema.schema()
2993
- scrape_params['extract'] = extract.dict(exclude_none=True)
2994
- if json_options is not None:
2995
- if hasattr(json_options.schema, 'schema'):
2996
- json_options.schema = json_options.schema.schema()
2997
- scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
2998
- if actions is not None:
2999
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3000
- if agent is not None:
3001
- scrape_params['agent'] = agent.dict(exclude_none=True)
3002
-
3003
- # Add any additional kwargs
3004
- scrape_params.update(kwargs)
3005
-
3006
- # Create final params object
3007
- final_params = ScrapeParams(**scrape_params)
3008
- params_dict = final_params.dict(exclude_none=True)
3009
- params_dict['urls'] = urls
3010
- params_dict['origin'] = f"python-sdk@{version}"
3011
-
3012
- # Make request
3013
- headers = self._prepare_headers(idempotency_key)
3014
- response = await self._async_post_request(
3015
- f'{self.api_url}/v1/batch/scrape',
3016
- params_dict,
3017
- headers
3018
- )
3019
-
3020
-        if response.get('success') and 'id' in response:
-            status_data = await self._async_monitor_job_status(response['id'], headers, poll_interval)
-            return BatchScrapeStatusResponse(**status_data)
-        else:
-            self._handle_error(response, 'start batch scrape job')
3028
-
3029
-
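# Usage sketch (illustrative): scrape several URLs in one batch job and wait for the
# results. Uses only the batch_scrape_urls signature shown above; the key and URLs are
# placeholders.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _batch_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    status = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        only_main_content=True,
        poll_interval=5,  # seconds between status checks
    )
    print(f"{status.completed}/{status.total} pages scraped")
    for doc in status.data or []:
        print(doc)

# asyncio.run(_batch_example())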
3030
- async def async_batch_scrape_urls(
3031
- self,
3032
- urls: List[str],
3033
- *,
3034
- formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
3035
- headers: Optional[Dict[str, str]] = None,
3036
- include_tags: Optional[List[str]] = None,
3037
- exclude_tags: Optional[List[str]] = None,
3038
- only_main_content: Optional[bool] = None,
3039
- wait_for: Optional[int] = None,
3040
- timeout: Optional[int] = None,
3041
- location: Optional[LocationConfig] = None,
3042
- mobile: Optional[bool] = None,
3043
- skip_tls_verification: Optional[bool] = None,
3044
- remove_base64_images: Optional[bool] = None,
3045
- block_ads: Optional[bool] = None,
3046
- proxy: Optional[Literal["basic", "stealth"]] = None,
3047
- extract: Optional[JsonConfig] = None,
3048
- json_options: Optional[JsonConfig] = None,
3049
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3050
- agent: Optional[AgentOptions] = None,
3051
- idempotency_key: Optional[str] = None,
3052
- **kwargs
3053
- ) -> BatchScrapeResponse:
3054
- """
3055
- Initiate a batch scrape job asynchronously.
3056
-
3057
- Args:
3058
- urls (List[str]): URLs to scrape
3059
- formats (Optional[List[Literal]]): Content formats to retrieve
3060
- headers (Optional[Dict[str, str]]): Custom HTTP headers
3061
- include_tags (Optional[List[str]]): HTML tags to include
3062
- exclude_tags (Optional[List[str]]): HTML tags to exclude
3063
- only_main_content (Optional[bool]): Extract main content only
3064
- wait_for (Optional[int]): Wait time in milliseconds
3065
- timeout (Optional[int]): Request timeout in milliseconds
3066
- location (Optional[LocationConfig]): Location configuration
3067
- mobile (Optional[bool]): Use mobile user agent
3068
- skip_tls_verification (Optional[bool]): Skip TLS verification
3069
- remove_base64_images (Optional[bool]): Remove base64 encoded images
3070
- block_ads (Optional[bool]): Block advertisements
3071
- proxy (Optional[Literal]): Proxy type to use
3072
- extract (Optional[JsonConfig]): Content extraction config
3073
- json_options (Optional[JsonConfig]): JSON extraction config
3074
- actions (Optional[List[Union]]): Actions to perform
3075
- agent (Optional[AgentOptions]): Agent configuration
3076
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3077
- **kwargs: Additional parameters to pass to the API
3078
-
3079
- Returns:
3080
- BatchScrapeResponse with:
3081
- * success - Whether job started successfully
3082
- * id - Unique identifier for the job
3083
- * url - Status check URL
3084
- * error - Error message if start failed
3085
-
3086
- Raises:
3087
- Exception: If job initiation fails
3088
- """
3089
- # Validate any additional kwargs
3090
- self._validate_kwargs(kwargs, "async_batch_scrape_urls")
3091
-
3092
- scrape_params = {}
3093
-
3094
- # Add individual parameters
3095
- if formats is not None:
3096
- scrape_params['formats'] = formats
3097
- if headers is not None:
3098
- scrape_params['headers'] = headers
3099
- if include_tags is not None:
3100
- scrape_params['includeTags'] = include_tags
3101
- if exclude_tags is not None:
3102
- scrape_params['excludeTags'] = exclude_tags
3103
- if only_main_content is not None:
3104
- scrape_params['onlyMainContent'] = only_main_content
3105
- if wait_for is not None:
3106
- scrape_params['waitFor'] = wait_for
3107
- if timeout is not None:
3108
- scrape_params['timeout'] = timeout
3109
- if location is not None:
3110
- scrape_params['location'] = location.dict(exclude_none=True)
3111
- if mobile is not None:
3112
- scrape_params['mobile'] = mobile
3113
- if skip_tls_verification is not None:
3114
- scrape_params['skipTlsVerification'] = skip_tls_verification
3115
- if remove_base64_images is not None:
3116
- scrape_params['removeBase64Images'] = remove_base64_images
3117
- if block_ads is not None:
3118
- scrape_params['blockAds'] = block_ads
3119
- if proxy is not None:
3120
- scrape_params['proxy'] = proxy
3121
- if extract is not None:
3122
- if hasattr(extract.schema, 'schema'):
3123
- extract.schema = extract.schema.schema()
3124
- scrape_params['extract'] = extract.dict(exclude_none=True)
3125
- if json_options is not None:
3126
- if hasattr(json_options.schema, 'schema'):
3127
- json_options.schema = json_options.schema.schema()
3128
- scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
3129
- if actions is not None:
3130
- scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
3131
- if agent is not None:
3132
- scrape_params['agent'] = agent.dict(exclude_none=True)
3133
-
3134
- # Add any additional kwargs
3135
- scrape_params.update(kwargs)
3136
-
3137
- # Create final params object
3138
- final_params = ScrapeParams(**scrape_params)
3139
- params_dict = final_params.dict(exclude_none=True)
3140
- params_dict['urls'] = urls
3141
- params_dict['origin'] = f"python-sdk@{version}"
3142
-
3143
- # Make request
3144
- headers = self._prepare_headers(idempotency_key)
3145
- response = await self._async_post_request(
3146
- f'{self.api_url}/v1/batch/scrape',
3147
- params_dict,
3148
- headers
3149
- )
3150
-
3151
-        if response.get('success'):
-            try:
-                return BatchScrapeResponse(**response)
-            except:
-                raise Exception('Failed to parse Firecrawl response as JSON.')
-        else:
-            self._handle_error(response, 'start batch scrape job')
3158
-
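# Usage sketch (illustrative): start a batch job without waiting for it, then check its
# status later by id. Key and URLs are placeholders.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _async_batch_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    job = await app.async_batch_scrape_urls(
        ["https://example.com/a", "https://example.com/b"],
        formats=["markdown"],
    )
    print("started batch job:", job.id)
    status = await app.check_batch_scrape_status(job.id)
    print(status.status, f"{status.completed}/{status.total}")

# asyncio.run(_async_batch_example())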
3159
- async def crawl_url(
3160
- self,
3161
- url: str,
3162
- *,
3163
- include_paths: Optional[List[str]] = None,
3164
- exclude_paths: Optional[List[str]] = None,
3165
- max_depth: Optional[int] = None,
3166
- max_discovery_depth: Optional[int] = None,
3167
- limit: Optional[int] = None,
3168
- allow_backward_links: Optional[bool] = None,
3169
- allow_external_links: Optional[bool] = None,
3170
- ignore_sitemap: Optional[bool] = None,
3171
- scrape_options: Optional[ScrapeOptions] = None,
3172
- webhook: Optional[Union[str, WebhookConfig]] = None,
3173
- deduplicate_similar_urls: Optional[bool] = None,
3174
- ignore_query_parameters: Optional[bool] = None,
3175
- regex_on_full_url: Optional[bool] = None,
3176
- poll_interval: Optional[int] = 2,
3177
- idempotency_key: Optional[str] = None,
3178
- **kwargs
3179
- ) -> CrawlStatusResponse:
3180
- """
3181
- Crawl a website starting from a URL.
3182
-
3183
- Args:
3184
- url (str): Target URL to start crawling from
3185
- include_paths (Optional[List[str]]): Patterns of URLs to include
3186
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3187
- max_depth (Optional[int]): Maximum crawl depth
3188
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3189
- limit (Optional[int]): Maximum pages to crawl
3190
- allow_backward_links (Optional[bool]): Follow parent directory links
3191
- allow_external_links (Optional[bool]): Follow external domain links
3192
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3193
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3194
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3195
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3196
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3197
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3198
- poll_interval (Optional[int]): Seconds between status checks (default: 2)
3199
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3200
- **kwargs: Additional parameters to pass to the API
3201
-
3202
- Returns:
3203
- CrawlStatusResponse with:
3204
- * Crawling status and progress
3205
- * Crawled page contents
3206
- * Success/error information
3207
-
3208
- Raises:
3209
- Exception: If crawl fails
3210
- """
3211
- # Validate any additional kwargs
3212
- self._validate_kwargs(kwargs, "crawl_url")
3213
-
3214
- crawl_params = {}
3215
-
3216
- # Add individual parameters
3217
- if include_paths is not None:
3218
- crawl_params['includePaths'] = include_paths
3219
- if exclude_paths is not None:
3220
- crawl_params['excludePaths'] = exclude_paths
3221
- if max_depth is not None:
3222
- crawl_params['maxDepth'] = max_depth
3223
- if max_discovery_depth is not None:
3224
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3225
- if limit is not None:
3226
- crawl_params['limit'] = limit
3227
- if allow_backward_links is not None:
3228
- crawl_params['allowBackwardLinks'] = allow_backward_links
3229
- if allow_external_links is not None:
3230
- crawl_params['allowExternalLinks'] = allow_external_links
3231
- if ignore_sitemap is not None:
3232
- crawl_params['ignoreSitemap'] = ignore_sitemap
3233
- if scrape_options is not None:
3234
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3235
- if webhook is not None:
3236
- crawl_params['webhook'] = webhook
3237
- if deduplicate_similar_urls is not None:
3238
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3239
- if ignore_query_parameters is not None:
3240
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3241
- if regex_on_full_url is not None:
3242
- crawl_params['regexOnFullURL'] = regex_on_full_url
3243
-
3244
- # Add any additional kwargs
3245
- crawl_params.update(kwargs)
3246
-
3247
- # Create final params object
3248
- final_params = CrawlParams(**crawl_params)
3249
- params_dict = final_params.dict(exclude_none=True)
3250
- params_dict['url'] = url
3251
- params_dict['origin'] = f"python-sdk@{version}"
3252
- # Make request
3253
- headers = self._prepare_headers(idempotency_key)
3254
- response = await self._async_post_request(
3255
- f'{self.api_url}/v1/crawl', params_dict, headers)
3256
-
3257
-        if response.get('success') and 'id' in response:
-            status_data = await self._async_monitor_job_status(response['id'], headers, poll_interval)
-            return CrawlStatusResponse(**status_data)
-        else:
-            self._handle_error(response, 'start crawl job')
3265
-
3266
-
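# Usage sketch (illustrative): crawl a site and wait for the results in one call. The
# ScrapeOptions import path and its camelCase field names (formats, onlyMainContent) are
# assumptions based on how scrapeOptions is serialized above.
import asyncio
from firecrawl import AsyncFirecrawlApp
from firecrawl.firecrawl import ScrapeOptions  # import path assumed

async def _crawl_and_wait_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.crawl_url(
        "https://docs.example.com",
        limit=10,
        exclude_paths=["changelog/*"],
        scrape_options=ScrapeOptions(formats=["markdown"], onlyMainContent=True),
    )
    print(f"{result.completed} pages crawled, {result.creditsUsed} credits used")

# asyncio.run(_crawl_and_wait_example())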
3267
- async def async_crawl_url(
3268
- self,
3269
- url: str,
3270
- *,
3271
- include_paths: Optional[List[str]] = None,
3272
- exclude_paths: Optional[List[str]] = None,
3273
- max_depth: Optional[int] = None,
3274
- max_discovery_depth: Optional[int] = None,
3275
- limit: Optional[int] = None,
3276
- allow_backward_links: Optional[bool] = None,
3277
- allow_external_links: Optional[bool] = None,
3278
- ignore_sitemap: Optional[bool] = None,
3279
- scrape_options: Optional[ScrapeOptions] = None,
3280
- webhook: Optional[Union[str, WebhookConfig]] = None,
3281
- deduplicate_similar_urls: Optional[bool] = None,
3282
- ignore_query_parameters: Optional[bool] = None,
3283
- regex_on_full_url: Optional[bool] = None,
3284
- poll_interval: Optional[int] = 2,
3285
- idempotency_key: Optional[str] = None,
3286
- **kwargs
3287
- ) -> CrawlResponse:
3288
- """
3289
- Start an asynchronous crawl job.
3290
-
3291
- Args:
3292
- url (str): Target URL to start crawling from
3293
- include_paths (Optional[List[str]]): Patterns of URLs to include
3294
- exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
3295
- max_depth (Optional[int]): Maximum crawl depth
3296
- max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
3297
- limit (Optional[int]): Maximum pages to crawl
3298
- allow_backward_links (Optional[bool]): Follow parent directory links
3299
- allow_external_links (Optional[bool]): Follow external domain links
3300
- ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
3301
- scrape_options (Optional[ScrapeOptions]): Page scraping configuration
3302
- webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
3303
- deduplicate_similar_urls (Optional[bool]): Remove similar URLs
3304
- ignore_query_parameters (Optional[bool]): Ignore URL parameters
3305
- regex_on_full_url (Optional[bool]): Apply regex to full URLs
3306
- idempotency_key (Optional[str]): Unique key to prevent duplicate requests
3307
- **kwargs: Additional parameters to pass to the API
3308
-
3309
- Returns:
3310
- CrawlResponse with:
3311
- * success - Whether crawl started successfully
3312
- * id - Unique identifier for the crawl job
3313
- * url - Status check URL for the crawl
3314
- * error - Error message if start failed
3315
-
3316
- Raises:
3317
- Exception: If crawl initiation fails
3318
- """
3319
- crawl_params = {}
3320
-
3321
- # Add individual parameters
3322
- if include_paths is not None:
3323
- crawl_params['includePaths'] = include_paths
3324
- if exclude_paths is not None:
3325
- crawl_params['excludePaths'] = exclude_paths
3326
- if max_depth is not None:
3327
- crawl_params['maxDepth'] = max_depth
3328
- if max_discovery_depth is not None:
3329
- crawl_params['maxDiscoveryDepth'] = max_discovery_depth
3330
- if limit is not None:
3331
- crawl_params['limit'] = limit
3332
- if allow_backward_links is not None:
3333
- crawl_params['allowBackwardLinks'] = allow_backward_links
3334
- if allow_external_links is not None:
3335
- crawl_params['allowExternalLinks'] = allow_external_links
3336
- if ignore_sitemap is not None:
3337
- crawl_params['ignoreSitemap'] = ignore_sitemap
3338
- if scrape_options is not None:
3339
- crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
3340
- if webhook is not None:
3341
- crawl_params['webhook'] = webhook
3342
- if deduplicate_similar_urls is not None:
3343
- crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
3344
- if ignore_query_parameters is not None:
3345
- crawl_params['ignoreQueryParameters'] = ignore_query_parameters
3346
- if regex_on_full_url is not None:
3347
- crawl_params['regexOnFullURL'] = regex_on_full_url
3348
-
3349
- # Add any additional kwargs
3350
- crawl_params.update(kwargs)
3351
-
3352
- # Create final params object
3353
- final_params = CrawlParams(**crawl_params)
3354
- params_dict = final_params.dict(exclude_none=True)
3355
- params_dict['url'] = url
3356
- params_dict['origin'] = f"python-sdk@{version}"
3357
-
3358
- # Make request
3359
- headers = self._prepare_headers(idempotency_key)
3360
- response = await self._async_post_request(
3361
- f'{self.api_url}/v1/crawl',
3362
- params_dict,
3363
- headers
3364
- )
3365
-
3366
- if response.get('success'):
3367
- try:
3368
- return CrawlResponse(**response)
3369
- except:
3370
- raise Exception(f'Failed to parse Firecrawl response as JSON.')
3371
- else:
3372
- self._handle_error(response, 'start crawl job')
3373
-
3374
- async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
3375
- """
3376
- Check the status and results of an asynchronous crawl job.
3377
-
3378
- Args:
3379
- id (str): Unique identifier for the crawl job
3380
-
3381
- Returns:
3382
- CrawlStatusResponse containing:
3383
- Status Information:
3384
- * status - Current state (scraping/completed/failed/cancelled)
3385
- * completed - Number of pages crawled
3386
- * total - Total pages to crawl
3387
- * creditsUsed - API credits consumed
3388
- * expiresAt - Data expiration timestamp
3389
-
3390
- Results:
3391
- * data - List of crawled documents
3392
- * next - URL for next page of results (if paginated)
3393
- * success - Whether status check succeeded
3394
- * error - Error message if failed
3395
-
3396
- Raises:
3397
- Exception: If status check fails
3398
- """
3399
- headers = self._prepare_headers()
3400
- endpoint = f'/v1/crawl/{id}'
3401
-
3402
- status_data = await self._async_get_request(
3403
- f'{self.api_url}{endpoint}',
3404
- headers
3405
- )
3406
-
3407
- if status_data.get('status') == 'completed':
3408
- if 'data' in status_data:
3409
- data = status_data['data']
3410
- while 'next' in status_data:
3411
- if len(status_data['data']) == 0:
3412
- break
3413
- next_url = status_data.get('next')
3414
- if not next_url:
3415
- logger.warning("Expected 'next' URL is missing.")
3416
- break
3417
- next_data = await self._async_get_request(next_url, headers)
3418
- data.extend(next_data.get('data', []))
3419
- status_data = next_data
3420
- status_data['data'] = data
3421
- # Create CrawlStatusResponse object from status data
3422
- response = CrawlStatusResponse(
3423
- status=status_data.get('status'),
3424
- total=status_data.get('total'),
3425
- completed=status_data.get('completed'),
3426
- creditsUsed=status_data.get('creditsUsed'),
3427
- expiresAt=status_data.get('expiresAt'),
3428
- data=status_data.get('data'),
3429
- success=False if 'error' in status_data else True
3430
- )
3431
-
3432
- if 'error' in status_data:
3433
- response.error = status_data.get('error')
3434
-
3435
- if 'next' in status_data:
3436
- response.next = status_data.get('next')
3437
-
3438
- return response
3439
-
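# Usage sketch (illustrative): kick off a crawl without blocking, then poll its status
# and inspect any errors. Key and URL are placeholders.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _async_crawl_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    crawl = await app.async_crawl_url("https://example.com", limit=25, max_depth=2)
    status = await app.check_crawl_status(crawl.id)
    if status.status == "completed":
        for doc in status.data or []:
            print(doc)
    else:
        errors = await app.check_crawl_errors(crawl.id)
        print("crawl still running; errors so far:", errors)

# asyncio.run(_async_crawl_example())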
3440
- async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
3441
- """
3442
- Monitor the status of an asynchronous job until completion.
3443
-
3444
- Args:
3445
- id (str): The ID of the job to monitor
3446
- headers (Dict[str, str]): Headers to include in status check requests
3447
- poll_interval (int): Seconds between status checks (default: 2)
3448
-
3449
- Returns:
3450
- CrawlStatusResponse: The job results if completed successfully
3451
-
3452
- Raises:
3453
- Exception: If the job fails or an error occurs during status checks
3454
- """
3455
- while True:
3456
- status_data = await self._async_get_request(
3457
- f'{self.api_url}/v1/crawl/{id}',
3458
- headers
3459
- )
3460
-
3461
- if status_data.get('status') == 'completed':
3462
- if 'data' in status_data:
3463
- data = status_data['data']
3464
- while 'next' in status_data:
3465
- if len(status_data['data']) == 0:
3466
- break
3467
- next_url = status_data.get('next')
3468
- if not next_url:
3469
- logger.warning("Expected 'next' URL is missing.")
3470
- break
3471
- next_data = await self._async_get_request(next_url, headers)
3472
- data.extend(next_data.get('data', []))
3473
- status_data = next_data
3474
- status_data['data'] = data
3475
- return status_data
3476
- else:
3477
- raise Exception('Job completed but no data was returned')
3478
- elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
3479
- await asyncio.sleep(max(poll_interval, 2))
3480
- else:
3481
- raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
3482
-
3483
- async def map_url(
3484
- self,
3485
- url: str,
3486
- *,
3487
- search: Optional[str] = None,
3488
- ignore_sitemap: Optional[bool] = None,
3489
- include_subdomains: Optional[bool] = None,
3490
- sitemap_only: Optional[bool] = None,
3491
- limit: Optional[int] = None,
3492
- timeout: Optional[int] = None,
3493
- params: Optional[MapParams] = None) -> MapResponse:
3494
- """
3495
- Asynchronously map and discover links from a URL.
3496
-
3497
- Args:
3498
- url (str): Target URL to map
3499
- params (Optional[MapParams]): See MapParams model:
3500
- Discovery Options:
3501
- * search - Filter pattern for URLs
3502
- * ignoreSitemap - Skip sitemap.xml
3503
- * includeSubdomains - Include subdomain links
3504
- * sitemapOnly - Only use sitemap.xml
3505
-
3506
- Limits:
3507
- * limit - Max URLs to return
3508
- * timeout - Request timeout (ms)
3509
-
3510
- Returns:
3511
- MapResponse with:
3512
- * Discovered URLs
3513
- * Success/error status
3514
-
3515
- Raises:
3516
- Exception: If mapping fails
3517
- """
3518
- map_params = {}
3519
- if params:
3520
- map_params.update(params.dict(exclude_none=True))
3521
-
3522
- # Add individual parameters
3523
- if search is not None:
3524
- map_params['search'] = search
3525
- if ignore_sitemap is not None:
3526
- map_params['ignoreSitemap'] = ignore_sitemap
3527
- if include_subdomains is not None:
3528
- map_params['includeSubdomains'] = include_subdomains
3529
- if sitemap_only is not None:
3530
- map_params['sitemapOnly'] = sitemap_only
3531
- if limit is not None:
3532
- map_params['limit'] = limit
3533
- if timeout is not None:
3534
- map_params['timeout'] = timeout
3535
-
3536
- # Create final params object
3537
- final_params = MapParams(**map_params)
3538
- params_dict = final_params.dict(exclude_none=True)
3539
- params_dict['url'] = url
3540
- params_dict['origin'] = f"python-sdk@{version}"
3541
-
3542
- # Make request
3543
- endpoint = f'/v1/map'
3544
- response = await self._async_post_request(
3545
- f'{self.api_url}{endpoint}',
3546
- params_dict,
3547
- headers={"Authorization": f"Bearer {self.api_key}"}
3548
- )
3549
-
3550
- if response.get('success') and 'links' in response:
3551
- return MapResponse(**response)
3552
- elif 'error' in response:
3553
- raise Exception(f'Failed to map URL. Error: {response["error"]}')
3554
- else:
3555
- raise Exception(f'Failed to map URL. Error: {response}')
3556
-
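# Usage sketch (illustrative): discover a site's URLs before deciding what to crawl.
# Assumes MapResponse exposes the discovered URLs as `links`, matching the 'links' key
# checked above.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _map_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    mapped = await app.map_url("https://example.com", search="blog", limit=100)
    for link in mapped.links or []:
        print(link)

# asyncio.run(_map_example())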
3557
- async def extract(
3558
- self,
3559
- urls: Optional[List[str]] = None,
3560
- *,
3561
- prompt: Optional[str] = None,
3562
- schema: Optional[Any] = None,
3563
- system_prompt: Optional[str] = None,
3564
- allow_external_links: Optional[bool] = False,
3565
- enable_web_search: Optional[bool] = False,
3566
- show_sources: Optional[bool] = False,
3567
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3568
-
3569
- """
3570
- Asynchronously extract structured information from URLs.
3571
-
3572
- Args:
3573
- urls (Optional[List[str]]): URLs to extract from
3574
- prompt (Optional[str]): Custom extraction prompt
3575
- schema (Optional[Any]): JSON schema/Pydantic model
3576
- system_prompt (Optional[str]): System context
3577
- allow_external_links (Optional[bool]): Follow external links
3578
- enable_web_search (Optional[bool]): Enable web search
3579
- show_sources (Optional[bool]): Include source URLs
3580
- agent (Optional[Dict[str, Any]]): Agent configuration
3581
-
3582
- Returns:
3583
- ExtractResponse with:
3584
- * Structured data matching schema
3585
- * Source information if requested
3586
- * Success/error status
3587
-
3588
- Raises:
3589
- ValueError: If prompt/schema missing or extraction fails
3590
- """
3591
- headers = self._prepare_headers()
3592
-
3593
- if not prompt and not schema:
3594
- raise ValueError("Either prompt or schema is required")
3595
-
3596
- if not urls and not prompt:
3597
- raise ValueError("Either urls or prompt is required")
3598
-
3599
- if schema:
3600
- if hasattr(schema, 'model_json_schema'):
3601
- # Convert Pydantic model to JSON schema
3602
- schema = schema.model_json_schema()
3603
- # Otherwise assume it's already a JSON schema dict
3604
-
3605
- request_data = {
3606
- 'urls': urls or [],
3607
- 'allowExternalLinks': allow_external_links,
3608
- 'enableWebSearch': enable_web_search,
3609
- 'showSources': show_sources,
3610
- 'schema': schema,
3611
- 'origin': f'python-sdk@{get_version()}'
3612
- }
3613
-
3614
- # Only add prompt and systemPrompt if they exist
3615
- if prompt:
3616
- request_data['prompt'] = prompt
3617
- if system_prompt:
3618
- request_data['systemPrompt'] = system_prompt
3619
-
3620
- if agent:
3621
- request_data['agent'] = agent
3622
-
3623
- response = await self._async_post_request(
3624
- f'{self.api_url}/v1/extract',
3625
- request_data,
3626
- headers
3627
- )
3628
-
3629
- if response.get('success'):
3630
- job_id = response.get('id')
3631
- if not job_id:
3632
- raise Exception('Job ID not returned from extract request.')
3633
-
3634
- while True:
3635
- status_data = await self._async_get_request(
3636
- f'{self.api_url}/v1/extract/{job_id}',
3637
- headers
3638
- )
3639
-
3640
- if status_data['status'] == 'completed':
3641
- return ExtractResponse(**status_data)
3642
- elif status_data['status'] in ['failed', 'cancelled']:
3643
- raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
3644
-
3645
- await asyncio.sleep(2)
3646
- else:
3647
- raise Exception(f'Failed to extract. Error: {response.get("error")}')
3648
-
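# Usage sketch (illustrative): structured extraction with a Pydantic schema. The schema
# class, URL, and prompt are invented for the example; extract() converts Pydantic models
# to JSON schema via model_json_schema(), as shown above.
import asyncio
from pydantic import BaseModel
from firecrawl import AsyncFirecrawlApp

class PricingPlan(BaseModel):
    name: str
    monthly_price_usd: float

async def _extract_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.extract(
        ["https://example.com/pricing"],
        prompt="List every pricing plan with its monthly USD price.",
        schema=PricingPlan,
    )
    print(result.data)

# asyncio.run(_extract_example())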
3649
- async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
3650
- """
3651
- Check the status of an asynchronous batch scrape job.
3652
-
3653
- Args:
3654
- id (str): The ID of the batch scrape job
3655
-
3656
- Returns:
3657
- BatchScrapeStatusResponse containing:
3658
- Status Information:
3659
- * status - Current state (scraping/completed/failed/cancelled)
3660
- * completed - Number of URLs scraped
3661
- * total - Total URLs to scrape
3662
- * creditsUsed - API credits consumed
3663
- * expiresAt - Data expiration timestamp
3664
-
3665
- Results:
3666
- * data - List of scraped documents
3667
- * next - URL for next page of results (if paginated)
3668
- * success - Whether status check succeeded
3669
- * error - Error message if failed
3670
-
3671
- Raises:
3672
- Exception: If status check fails
3673
- """
3674
- headers = self._prepare_headers()
3675
- endpoint = f'/v1/batch/scrape/{id}'
3676
-
3677
- status_data = await self._async_get_request(
3678
- f'{self.api_url}{endpoint}',
3679
- headers
3680
- )
3681
-
3682
- if status_data['status'] == 'completed':
3683
- if 'data' in status_data:
3684
- data = status_data['data']
3685
- while 'next' in status_data:
3686
- if len(status_data['data']) == 0:
3687
- break
3688
- next_url = status_data.get('next')
3689
- if not next_url:
3690
- logger.warning("Expected 'next' URL is missing.")
3691
- break
3692
- next_data = await self._async_get_request(next_url, headers)
3693
- data.extend(next_data.get('data', []))
3694
- status_data = next_data
3695
- status_data['data'] = data
3696
-
3697
-        response = BatchScrapeStatusResponse(
-            status=status_data.get('status'),
-            total=status_data.get('total'),
-            completed=status_data.get('completed'),
-            creditsUsed=status_data.get('creditsUsed'),
-            expiresAt=status_data.get('expiresAt'),
-            data=status_data.get('data'),
-            success=False if 'error' in status_data else True
-        )
-
-        if 'error' in status_data:
-            response.error = status_data.get('error')
-
-        if 'next' in status_data:
-            response.next = status_data.get('next')
-
-        return response
3716
-
3717
- async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
3718
- """
3719
- Get information about errors from an asynchronous batch scrape job.
3720
-
3721
- Args:
3722
- id (str): The ID of the batch scrape job
3723
-
3724
- Returns:
3725
- CrawlErrorsResponse containing:
3726
- errors (List[Dict[str, str]]): List of errors with fields:
3727
- * id (str): Error ID
3728
- * timestamp (str): When the error occurred
3729
- * url (str): URL that caused the error
3730
- * error (str): Error message
3731
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3732
-
3733
- Raises:
3734
- Exception: If error check fails
3735
- """
3736
- headers = self._prepare_headers()
3737
- return await self._async_get_request(
3738
- f'{self.api_url}/v1/batch/scrape/{id}/errors',
3739
- headers
3740
- )
3741
-
3742
- async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
3743
- """
3744
- Get information about errors from an asynchronous crawl job.
3745
-
3746
- Args:
3747
- id (str): The ID of the crawl job
3748
-
3749
- Returns:
3750
- CrawlErrorsResponse containing:
3751
- * errors (List[Dict[str, str]]): List of errors with fields:
3752
- - id (str): Error ID
3753
- - timestamp (str): When the error occurred
3754
- - url (str): URL that caused the error
3755
- - error (str): Error message
3756
- * robotsBlocked (List[str]): List of URLs blocked by robots.txt
3757
-
3758
- Raises:
3759
- Exception: If error check fails
3760
- """
3761
- headers = self._prepare_headers()
3762
- return await self._async_get_request(
3763
- f'{self.api_url}/v1/crawl/{id}/errors',
3764
- headers
3765
- )
3766
-
3767
- async def cancel_crawl(self, id: str) -> Dict[str, Any]:
3768
- """
3769
- Cancel an asynchronous crawl job.
3770
-
3771
- Args:
3772
- id (str): The ID of the crawl job to cancel
3773
-
3774
- Returns:
3775
- Dict[str, Any] containing:
3776
- * success (bool): Whether cancellation was successful
3777
- * error (str, optional): Error message if cancellation failed
3778
-
3779
- Raises:
3780
- Exception: If cancellation fails
3781
- """
3782
- headers = self._prepare_headers()
3783
- async with aiohttp.ClientSession() as session:
3784
- async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
3785
- return await response.json()
3786
-
3787
- async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
3788
- """
3789
- Check the status of an asynchronous extraction job.
3790
-
3791
- Args:
3792
- job_id (str): The ID of the extraction job
3793
-
3794
- Returns:
3795
- ExtractResponse[Any] with:
3796
- * success (bool): Whether request succeeded
3797
- * data (Optional[Any]): Extracted data matching schema
3798
- * error (Optional[str]): Error message if any
3799
- * warning (Optional[str]): Warning message if any
3800
- * sources (Optional[List[str]]): Source URLs if requested
3801
-
3802
- Raises:
3803
- ValueError: If status check fails
3804
- """
3805
- headers = self._prepare_headers()
3806
- try:
3807
- return await self._async_get_request(
3808
- f'{self.api_url}/v1/extract/{job_id}',
3809
- headers
3810
- )
3811
- except Exception as e:
3812
- raise ValueError(str(e))
3813
-
3814
- async def async_extract(
3815
- self,
3816
- urls: Optional[List[str]] = None,
3817
- *,
3818
- prompt: Optional[str] = None,
3819
- schema: Optional[Any] = None,
3820
- system_prompt: Optional[str] = None,
3821
- allow_external_links: Optional[bool] = False,
3822
- enable_web_search: Optional[bool] = False,
3823
- show_sources: Optional[bool] = False,
3824
- agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
3825
- """
3826
- Initiate an asynchronous extraction job without waiting for completion.
3827
-
3828
- Args:
3829
- urls (Optional[List[str]]): URLs to extract from
3830
- prompt (Optional[str]): Custom extraction prompt
3831
- schema (Optional[Any]): JSON schema/Pydantic model
3832
- system_prompt (Optional[str]): System context
3833
- allow_external_links (Optional[bool]): Follow external links
3834
- enable_web_search (Optional[bool]): Enable web search
3835
- show_sources (Optional[bool]): Include source URLs
3836
- agent (Optional[Dict[str, Any]]): Agent configuration
3837
3838
-
3839
- Returns:
3840
- ExtractResponse[Any] with:
3841
- * success (bool): Whether request succeeded
3842
- * data (Optional[Any]): Extracted data matching schema
3843
- * error (Optional[str]): Error message if any
3844
-
3845
- Raises:
3846
- ValueError: If job initiation fails
3847
- """
3848
- headers = self._prepare_headers()
3849
-
3850
- if not prompt and not schema:
3851
- raise ValueError("Either prompt or schema is required")
3852
-
3853
- if not urls and not prompt:
3854
- raise ValueError("Either urls or prompt is required")
3855
-
3856
- if schema:
3857
- if hasattr(schema, 'model_json_schema'):
3858
- schema = schema.model_json_schema()
3859
-
3860
-        request_data = {
-            'urls': urls or [],
-            'allowExternalLinks': allow_external_links,
-            'enableWebSearch': enable_web_search,
-            'showSources': show_sources,
-            'schema': schema,
-            'origin': f'python-sdk@{version}'
-        }
-
-        if prompt:
-            request_data['prompt'] = prompt
-        if system_prompt:
-            request_data['systemPrompt'] = system_prompt
-        if agent:
-            request_data['agent'] = agent
3875
-
3876
- try:
3877
- return await self._async_post_request(
3878
- f'{self.api_url}/v1/extract',
3879
- request_data,
3880
- headers
3881
- )
3882
- except Exception as e:
3883
- raise ValueError(str(e))
3884
-
3885
- async def generate_llms_text(
3886
- self,
3887
- url: str,
3888
- *,
3889
- max_urls: Optional[int] = None,
3890
- show_full_text: Optional[bool] = None,
3891
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
3892
- """
3893
- Generate LLMs.txt for a given URL and monitor until completion.
3894
-
3895
- Args:
3896
- url (str): Target URL to generate LLMs.txt from
3897
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
3898
- show_full_text (Optional[bool]): Include full text in output (default: False)
3899
- experimental_stream (Optional[bool]): Enable experimental streaming
3900
-
3901
- Returns:
3902
- GenerateLLMsTextStatusResponse containing:
3903
- * success (bool): Whether generation completed successfully
3904
- * status (str): Status of generation (processing/completed/failed)
3905
- * data (Dict[str, str], optional): Generated text with fields:
3906
- - llmstxt (str): Generated LLMs.txt content
3907
- - llmsfulltxt (str, optional): Full version if requested
3908
- * error (str, optional): Error message if generation failed
3909
- * expiresAt (str): When the generated data expires
3910
-
3911
- Raises:
3912
- Exception: If generation fails
3913
- """
3914
3921
-
3922
- response = await self.async_generate_llms_text(
3923
- url,
3924
- max_urls=max_urls,
3925
- show_full_text=show_full_text,
3926
- experimental_stream=experimental_stream
3927
- )
3928
- if not response.get('success') or 'id' not in response:
3929
- return response
3930
-
3931
- job_id = response['id']
3932
- while True:
3933
- status = await self.check_generate_llms_text_status(job_id)
3934
-
3935
- if status['status'] == 'completed':
3936
- return status
3937
- elif status['status'] == 'failed':
3938
- raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
3939
- elif status['status'] != 'processing':
3940
- break
3941
-
3942
- await asyncio.sleep(2)
3943
-
3944
- return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
3945
-
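# Usage sketch (illustrative): generate an llms.txt summary for a site and wait for it.
# On success the returned status payload carries the generated text under data -> llmstxt.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _llmstxt_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    status = await app.generate_llms_text("https://example.com", max_urls=10)
    print(status)

# asyncio.run(_llmstxt_example())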
3946
- async def async_generate_llms_text(
3947
- self,
3948
- url: str,
3949
- *,
3950
- max_urls: Optional[int] = None,
3951
- show_full_text: Optional[bool] = None,
3952
- experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
3953
- """
3954
- Initiate an asynchronous LLMs.txt generation job without waiting for completion.
3955
-
3956
- Args:
3957
- url (str): Target URL to generate LLMs.txt from
3958
- max_urls (Optional[int]): Maximum URLs to process (default: 10)
3959
- show_full_text (Optional[bool]): Include full text in output (default: False)
3960
- experimental_stream (Optional[bool]): Enable experimental streaming
3961
-
3962
- Returns:
3963
- GenerateLLMsTextResponse containing:
3964
- * success (bool): Whether job started successfully
3965
- * id (str): Unique identifier for the job
3966
- * error (str, optional): Error message if start failed
3967
-
3968
- Raises:
3969
- ValueError: If job initiation fails
3970
- """
3971
-        # Build the request payload, including only values that were provided.
-        params = {}
-        if max_urls is not None:
-            params['maxUrls'] = max_urls
-        if show_full_text is not None:
-            params['showFullText'] = show_full_text
-        if experimental_stream is not None:
-            params['__experimental_stream'] = experimental_stream
-
-        headers = self._prepare_headers()
-        json_data = {'url': url, **params}
-        json_data['origin'] = f"python-sdk@{version}"
3988
-
3989
- try:
3990
- return await self._async_post_request(
3991
- f'{self.api_url}/v1/llmstxt',
3992
- json_data,
3993
- headers
3994
- )
3995
- except Exception as e:
3996
- raise ValueError(str(e))
3997
-
3998
- async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
3999
- """
4000
- Check the status of an asynchronous LLMs.txt generation job.
4001
-
4002
- Args:
4003
- id (str): The ID of the generation job
4004
-
4005
- Returns:
4006
- GenerateLLMsTextStatusResponse containing:
4007
- * success (bool): Whether generation completed successfully
4008
- * status (str): Status of generation (processing/completed/failed)
4009
- * data (Dict[str, str], optional): Generated text with fields:
4010
- - llmstxt (str): Generated LLMs.txt content
4011
- - llmsfulltxt (str, optional): Full version if requested
4012
- * error (str, optional): Error message if generation failed
4013
- * expiresAt (str): When the generated data expires
4014
-
4015
- Raises:
4016
- ValueError: If status check fails
4017
- """
4018
- headers = self._prepare_headers()
4019
- try:
4020
- return await self._async_get_request(
4021
- f'{self.api_url}/v1/llmstxt/{id}',
4022
- headers
4023
- )
4024
- except Exception as e:
4025
- raise ValueError(str(e))
4026
-
4027
- async def deep_research(
4028
- self,
4029
- query: str,
4030
- *,
4031
- max_depth: Optional[int] = None,
4032
- time_limit: Optional[int] = None,
4033
- max_urls: Optional[int] = None,
4034
- analysis_prompt: Optional[str] = None,
4035
- system_prompt: Optional[str] = None,
4036
- __experimental_stream_steps: Optional[bool] = None,
4037
- on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
4038
- on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
4039
- """
4040
- Initiates a deep research operation on a given query and polls until completion.
4041
-
4042
- Args:
4043
- query (str): Research query or topic to investigate
4044
- max_depth (Optional[int]): Maximum depth of research exploration
4045
- time_limit (Optional[int]): Time limit in seconds for research
4046
- max_urls (Optional[int]): Maximum number of URLs to process
4047
- analysis_prompt (Optional[str]): Custom prompt for analysis
4048
- system_prompt (Optional[str]): Custom system prompt
4049
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4050
- on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
4051
- on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
4052
-
4053
- Returns:
4054
- DeepResearchStatusResponse containing:
4055
- * success (bool): Whether research completed successfully
4056
- * status (str): Current state (processing/completed/failed)
4057
- * error (Optional[str]): Error message if failed
4058
- * id (str): Unique identifier for the research job
4059
- * data (Any): Research findings and analysis
4060
- * sources (List[Dict]): List of discovered sources
4061
- * activities (List[Dict]): Research progress log
4062
- * summaries (List[str]): Generated research summaries
4063
-
4064
- Raises:
4065
- Exception: If research fails
4066
- """
4067
4081
-
4082
- response = await self.async_deep_research(
4083
- query,
4084
- max_depth=max_depth,
4085
- time_limit=time_limit,
4086
- max_urls=max_urls,
4087
- analysis_prompt=analysis_prompt,
4088
- system_prompt=system_prompt
4089
- )
4090
- if not response.get('success') or 'id' not in response:
4091
- return response
4092
-
4093
- job_id = response['id']
4094
- last_activity_count = 0
4095
- last_source_count = 0
4096
-
4097
- while True:
4098
- status = await self.check_deep_research_status(job_id)
4099
-
4100
- if on_activity and 'activities' in status:
4101
- new_activities = status['activities'][last_activity_count:]
4102
- for activity in new_activities:
4103
- on_activity(activity)
4104
- last_activity_count = len(status['activities'])
4105
-
4106
- if on_source and 'sources' in status:
4107
- new_sources = status['sources'][last_source_count:]
4108
- for source in new_sources:
4109
- on_source(source)
4110
- last_source_count = len(status['sources'])
4111
-
4112
- if status['status'] == 'completed':
4113
- return status
4114
- elif status['status'] == 'failed':
4115
- raise Exception(f'Deep research failed. Error: {status.get("error")}')
4116
- elif status['status'] != 'processing':
4117
- break
4118
-
4119
- await asyncio.sleep(2)
4120
-
4121
- return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
4122
-
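# Usage sketch (illustrative): run a deep research job with a progress callback. The
# query, key, and limits are placeholders.
import asyncio
from firecrawl import AsyncFirecrawlApp

def _log_activity(activity):
    # Each activity dict carries type/status/message/timestamp/depth per the docstring.
    print(f"[depth {activity.get('depth')}] {activity.get('message')}")

async def _research_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = await app.deep_research(
        "How do the major browser engines differ in their rendering pipelines?",
        max_depth=3,
        time_limit=180,
        on_activity=_log_activity,
    )
    print(result)

# asyncio.run(_research_example())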
4123
- async def async_deep_research(
4124
- self,
4125
- query: str,
4126
- *,
4127
- max_depth: Optional[int] = None,
4128
- time_limit: Optional[int] = None,
4129
- max_urls: Optional[int] = None,
4130
- analysis_prompt: Optional[str] = None,
4131
- system_prompt: Optional[str] = None,
4132
- __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
4133
- """
4134
- Initiates an asynchronous deep research operation.
4135
-
4136
- Args:
4137
- query (str): Research query or topic to investigate
4138
- max_depth (Optional[int]): Maximum depth of research exploration
4139
- time_limit (Optional[int]): Time limit in seconds for research
4140
- max_urls (Optional[int]): Maximum number of URLs to process
4141
- analysis_prompt (Optional[str]): Custom prompt for analysis
4142
- system_prompt (Optional[str]): Custom system prompt
4143
- __experimental_stream_steps (Optional[bool]): Enable experimental streaming
4144
-
4145
- Returns:
4146
- Dict[str, Any]: A response containing:
4147
- * success (bool): Whether the research initiation was successful
4148
- * id (str): The unique identifier for the research job
4149
- * error (str, optional): Error message if initiation failed
4150
-
4151
- Raises:
4152
- Exception: If the research initiation fails.
4153
- """
4154
- research_params = {}
4155
- if max_depth is not None:
4156
- research_params['maxDepth'] = max_depth
4157
- if time_limit is not None:
4158
- research_params['timeLimit'] = time_limit
4159
- if max_urls is not None:
4160
- research_params['maxUrls'] = max_urls
4161
- if analysis_prompt is not None:
4162
- research_params['analysisPrompt'] = analysis_prompt
4163
- if system_prompt is not None:
4164
- research_params['systemPrompt'] = system_prompt
4165
- if __experimental_stream_steps is not None:
4166
- research_params['__experimental_streamSteps'] = __experimental_stream_steps
4167
- research_params = DeepResearchParams(**research_params)
4168
-
4169
- headers = self._prepare_headers()
4170
-
4171
- json_data = {'query': query, **research_params.dict(exclude_none=True)}
4172
- json_data['origin'] = f"python-sdk@{version}"
4173
-
4174
- try:
4175
- return await self._async_post_request(
4176
- f'{self.api_url}/v1/deep-research',
4177
- json_data,
4178
- headers
4179
- )
4180
- except Exception as e:
4181
- raise ValueError(str(e))
4182
-
4183
- async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
4184
- """
4185
- Check the status of a deep research operation.
4186
-
4187
- Args:
4188
- id (str): The ID of the deep research operation.
4189
-
4190
- Returns:
4191
- DeepResearchResponse containing:
4192
-
4193
- Status:
4194
- * success - Whether research completed successfully
4195
- * status - Current state (processing/completed/failed)
4196
- * error - Error message if failed
4197
-
4198
- Results:
4199
- * id - Unique identifier for the research job
4200
- * data - Research findings and analysis
4201
- * sources - List of discovered sources
4202
- * activities - Research progress log
4203
- * summaries - Generated research summaries
4204
-
4205
- Raises:
4206
- Exception: If the status check fails.
4207
- """
4208
- headers = self._prepare_headers()
4209
- try:
4210
- return await self._async_get_request(
4211
- f'{self.api_url}/v1/deep-research/{id}',
4212
- headers
4213
- )
4214
- except Exception as e:
4215
- raise ValueError(str(e))
4216
-
4217
- async def search(
4218
- self,
4219
- query: str,
4220
- *,
4221
- limit: Optional[int] = None,
4222
- tbs: Optional[str] = None,
4223
- filter: Optional[str] = None,
4224
- lang: Optional[str] = None,
4225
- country: Optional[str] = None,
4226
- location: Optional[str] = None,
4227
- timeout: Optional[int] = None,
4228
- scrape_options: Optional[ScrapeOptions] = None,
4229
- params: Optional[Union[Dict[str, Any], SearchParams]] = None,
4230
- **kwargs) -> SearchResponse:
4231
- """
4232
- Asynchronously search for content using Firecrawl.
4233
-
4234
- Args:
4235
- query (str): Search query string
4236
- limit (Optional[int]): Max results (default: 5)
4237
- tbs (Optional[str]): Time filter (e.g. "qdr:d")
4238
- filter (Optional[str]): Custom result filter
4239
- lang (Optional[str]): Language code (default: "en")
4240
- country (Optional[str]): Country code (default: "us")
4241
- location (Optional[str]): Geo-targeting
4242
- timeout (Optional[int]): Request timeout in milliseconds
4243
- scrape_options (Optional[ScrapeOptions]): Result scraping configuration
4244
- params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
4245
- **kwargs: Additional keyword arguments for future compatibility
4246
-
4247
- Returns:
4248
- SearchResponse: Response containing:
4249
- * success (bool): Whether request succeeded
4250
- * data (List[FirecrawlDocument]): Search results
4251
- * warning (Optional[str]): Warning message if any
4252
- * error (Optional[str]): Error message if any
4253
-
4254
- Raises:
4255
- Exception: If search fails or response cannot be parsed
4256
- """
4257
- # Build search parameters
4258
- search_params = {}
4259
- if params:
4260
- if isinstance(params, dict):
4261
- search_params.update(params)
4262
- else:
4263
- search_params.update(params.dict(exclude_none=True))
4264
-
4265
- # Add individual parameters
4266
- if limit is not None:
4267
- search_params['limit'] = limit
4268
- if tbs is not None:
4269
- search_params['tbs'] = tbs
4270
- if filter is not None:
4271
- search_params['filter'] = filter
4272
- if lang is not None:
4273
- search_params['lang'] = lang
4274
- if country is not None:
4275
- search_params['country'] = country
4276
- if location is not None:
4277
- search_params['location'] = location
4278
- if timeout is not None:
4279
- search_params['timeout'] = timeout
4280
- if scrape_options is not None:
4281
- search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
4282
-
4283
- # Add any additional kwargs
4284
- search_params.update(kwargs)
4285
-
4286
- # Create final params object
4287
- final_params = SearchParams(query=query, **search_params)
4288
- params_dict = final_params.dict(exclude_none=True)
4289
- params_dict['origin'] = f"python-sdk@{version}"
4290
-
4291
- return await self._async_post_request(
4292
- f"{self.api_url}/v1/search",
4293
- params_dict,
4294
- {"Authorization": f"Bearer {self.api_key}"}
4295
- )
4296
-
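# Usage sketch (illustrative): run a web search, optionally time-filtered with `tbs` as in
# the docstring example ("qdr:d" = past day). Key and query are placeholders.
import asyncio
from firecrawl import AsyncFirecrawlApp

async def _search_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    results = await app.search("firecrawl python sdk", limit=3, tbs="qdr:d")
    print(results)

# asyncio.run(_search_example())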
4297
- class AsyncCrawlWatcher(CrawlWatcher):
4298
- """
4299
- Async version of CrawlWatcher that properly handles async operations.
4300
- """
4301
- def __init__(self, id: str, app: AsyncFirecrawlApp):
4302
- super().__init__(id, app)
4303
-
4304
- async def connect(self) -> None:
4305
- """
4306
- Establishes async WebSocket connection and starts listening for messages.
4307
- """
4308
- async with websockets.connect(
4309
- self.ws_url,
4310
- additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
4311
- ) as websocket:
4312
- await self._listen(websocket)
4313
-
4314
- async def _listen(self, websocket) -> None:
4315
- """
4316
- Listens for incoming WebSocket messages and handles them asynchronously.
4317
-
4318
- Args:
4319
- websocket: The WebSocket connection object
4320
- """
4321
- async for message in websocket:
4322
- msg = json.loads(message)
4323
- await self._handle_message(msg)
4324
-
4325
- async def _handle_message(self, msg: Dict[str, Any]) -> None:
4326
- """
4327
- Handles incoming WebSocket messages based on their type asynchronously.
4328
-
4329
- Args:
4330
- msg (Dict[str, Any]): The message to handle
4331
- """
4332
- if msg['type'] == 'done':
4333
- self.status = 'completed'
4334
- self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
4335
- elif msg['type'] == 'error':
4336
- self.status = 'failed'
4337
- self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
4338
- elif msg['type'] == 'catchup':
4339
- self.status = msg['data']['status']
4340
- self.data.extend(msg['data'].get('data', []))
4341
- for doc in self.data:
4342
- self.dispatch_event('document', {'data': doc, 'id': self.id})
4343
- elif msg['type'] == 'document':
4344
- self.data.append(msg['data'])
4345
- self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
4346
-
4347
- async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
4348
- """
4349
- Handle errors from async API responses.
4350
- """
4351
- try:
4352
- error_data = await response.json()
4353
- error_message = error_data.get('error', 'No error message provided.')
4354
- error_details = error_data.get('details', 'No additional error details provided.')
4355
- except:
4356
- raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
4357
-
4358
- # Use the app's method to get the error message
4359
- message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
4360
-
4361
- raise aiohttp.ClientError(message)
4362
-
4363
- async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
4364
- """
4365
- Generate a standardized error message based on HTTP status code for async operations.
4366
-
4367
- Args:
4368
- status_code (int): The HTTP status code from the response
4369
- action (str): Description of the action that was being performed
4370
- error_message (str): The error message from the API response
4371
- error_details (str): Additional error details from the API response
4372
-
4373
- Returns:
4374
- str: A formatted error message
4375
- """
4376
- return self._get_error_message(status_code, action, error_message, error_details)
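# Usage sketch (illustrative): monitor a crawl over WebSocket with AsyncCrawlWatcher.
# Assumes AsyncCrawlWatcher is importable from this module and that the base CrawlWatcher
# builds ws_url from the job id; connect() runs until the server closes the stream.
import asyncio
from firecrawl import AsyncFirecrawlApp
from firecrawl.firecrawl import AsyncCrawlWatcher  # import path assumed

async def _watch_example():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    job = await app.async_crawl_url("https://example.com", limit=10)
    watcher = AsyncCrawlWatcher(job.id, app)
    await watcher.connect()
    print(watcher.status, f"{len(watcher.data)} documents received")

# asyncio.run(_watch_example())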