firecrawl 1.16.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl might be problematic.
- firecrawl/__init__.py +1 -1
- firecrawl/firecrawl.py +3338 -409
- {firecrawl-1.16.0.dist-info → firecrawl-2.0.0.dist-info}/METADATA +3 -2
- firecrawl-2.0.0.dist-info/RECORD +12 -0
- firecrawl-1.16.0.dist-info/RECORD +0 -12
- {firecrawl-1.16.0.dist-info → firecrawl-2.0.0.dist-info}/LICENSE +0 -0
- {firecrawl-1.16.0.dist-info → firecrawl-2.0.0.dist-info}/WHEEL +0 -0
- {firecrawl-1.16.0.dist-info → firecrawl-2.0.0.dist-info}/top_level.txt +0 -0
firecrawl/firecrawl.py CHANGED

@@ -12,15 +12,293 @@ Classes:
 import logging
 import os
 import time
-from typing import Any, Dict, Optional, List, Union, Callable
+from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
 import json
-
+from datetime import datetime
+import re
+import warnings
 import requests
 import pydantic
 import websockets
+import aiohttp
+import asyncio
+from pydantic import Field
+
+# Suppress Pydantic warnings about attribute shadowing
+warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
+warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
+warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractConfig\" shadows an attribute in parent \"BaseModel\"")
+warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
+
+
+def get_version():
+    try:
+        from pathlib import Path
+        package_path = os.path.dirname(__file__)
+        version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
+        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
+        if version_match:
+            return version_match.group(1).strip()
+    except Exception:
+        print("Failed to get version from __init__.py")
+    return None
+
+version = get_version()
 
 logger : logging.Logger = logging.getLogger("firecrawl")
 
+T = TypeVar('T')
+
+# class FirecrawlDocumentMetadata(pydantic.BaseModel):
+#     """Metadata for a Firecrawl document."""
+#     title: Optional[str] = None
+#     description: Optional[str] = None
+#     language: Optional[str] = None
+#     keywords: Optional[str] = None
+#     robots: Optional[str] = None
+#     ogTitle: Optional[str] = None
+#     ogDescription: Optional[str] = None
+#     ogUrl: Optional[str] = None
+#     ogImage: Optional[str] = None
+#     ogAudio: Optional[str] = None
+#     ogDeterminer: Optional[str] = None
+#     ogLocale: Optional[str] = None
+#     ogLocaleAlternate: Optional[List[str]] = None
+#     ogSiteName: Optional[str] = None
+#     ogVideo: Optional[str] = None
+#     dctermsCreated: Optional[str] = None
+#     dcDateCreated: Optional[str] = None
+#     dcDate: Optional[str] = None
+#     dctermsType: Optional[str] = None
+#     dcType: Optional[str] = None
+#     dctermsAudience: Optional[str] = None
+#     dctermsSubject: Optional[str] = None
+#     dcSubject: Optional[str] = None
+#     dcDescription: Optional[str] = None
+#     dctermsKeywords: Optional[str] = None
+#     modifiedTime: Optional[str] = None
+#     publishedTime: Optional[str] = None
+#     articleTag: Optional[str] = None
+#     articleSection: Optional[str] = None
+#     sourceURL: Optional[str] = None
+#     statusCode: Optional[int] = None
+#     error: Optional[str] = None
+
+class AgentOptions(pydantic.BaseModel):
+    """Configuration for the agent."""
+    model: Literal["FIRE-1"] = "FIRE-1"
+    prompt: Optional[str] = None
+
+class AgentOptionsExtract(pydantic.BaseModel):
+    """Configuration for the agent in extract operations."""
+    model: Literal["FIRE-1"] = "FIRE-1"
+
+class ActionsResult(pydantic.BaseModel):
+    """Result of actions performed during scraping."""
+    screenshots: List[str]
+
+class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
+    """Document retrieved or processed by Firecrawl."""
+    url: Optional[str] = None
+    markdown: Optional[str] = None
+    html: Optional[str] = None
+    rawHtml: Optional[str] = None
+    links: Optional[List[str]] = None
+    extract: Optional[T] = None
+    json: Optional[T] = None
+    screenshot: Optional[str] = None
+    metadata: Optional[Any] = None
+    actions: Optional[ActionsResult] = None
+    title: Optional[str] = None  # v1 search only
+    description: Optional[str] = None  # v1 search only
+
+class LocationConfig(pydantic.BaseModel):
+    """Location configuration for scraping."""
+    country: Optional[str] = None
+    languages: Optional[List[str]] = None
+
+class WebhookConfig(pydantic.BaseModel):
+    """Configuration for webhooks."""
+    url: str
+    headers: Optional[Dict[str, str]] = None
+    metadata: Optional[Dict[str, str]] = None
+    events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
+
+class CommonOptions(pydantic.BaseModel):
+    """Parameters for scraping operations."""
+    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None
+    headers: Optional[Dict[str, str]] = None
+    includeTags: Optional[List[str]] = None
+    excludeTags: Optional[List[str]] = None
+    onlyMainContent: Optional[bool] = None
+    waitFor: Optional[int] = None
+    timeout: Optional[int] = None
+    location: Optional[LocationConfig] = None
+    mobile: Optional[bool] = None
+    skipTlsVerification: Optional[bool] = None
+    removeBase64Images: Optional[bool] = None
+    blockAds: Optional[bool] = None
+    proxy: Optional[Literal["basic", "stealth"]] = None
+
+class WaitAction(pydantic.BaseModel):
+    """Wait action to perform during scraping."""
+    type: Literal["wait"]
+    milliseconds: int
+    selector: Optional[str] = None
+
+class ScreenshotAction(pydantic.BaseModel):
+    """Screenshot action to perform during scraping."""
+    type: Literal["screenshot"]
+    fullPage: Optional[bool] = None
+
+class ClickAction(pydantic.BaseModel):
+    """Click action to perform during scraping."""
+    type: Literal["click"]
+    selector: str
+
+class WriteAction(pydantic.BaseModel):
+    """Write action to perform during scraping."""
+    type: Literal["write"]
+    text: str
+
+class PressAction(pydantic.BaseModel):
+    """Press action to perform during scraping."""
+    type: Literal["press"]
+    key: str
+
+class ScrollAction(pydantic.BaseModel):
+    """Scroll action to perform during scraping."""
+    type: Literal["scroll"]
+    direction: Literal["up", "down"]
+    selector: Optional[str] = None
+
+class ScrapeAction(pydantic.BaseModel):
+    """Scrape action to perform during scraping."""
+    type: Literal["scrape"]
+
+class ExecuteJavascriptAction(pydantic.BaseModel):
+    """Execute javascript action to perform during scraping."""
+    type: Literal["executeJavascript"]
+    script: str
+
+
+class ExtractAgent(pydantic.BaseModel):
+    """Configuration for the agent in extract operations."""
+    model: Literal["FIRE-1"] = "FIRE-1"
+
+class ExtractConfig(pydantic.BaseModel):
+    """Configuration for extraction."""
+    prompt: Optional[str] = None
+    schema: Optional[Any] = None
+    systemPrompt: Optional[str] = None
+    agent: Optional[ExtractAgent] = None
+
+class ScrapeParams(CommonOptions):
+    """Parameters for scraping operations."""
+    extract: Optional[ExtractConfig] = None
+    jsonOptions: Optional[ExtractConfig] = None
+    actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
+    agent: Optional[AgentOptions] = None
+
+class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
+    """Response from scraping operations."""
+    success: bool = True
+    warning: Optional[str] = None
+    error: Optional[str] = None
+
+class BatchScrapeResponse(pydantic.BaseModel):
+    """Response from batch scrape operations."""
+    id: Optional[str] = None
+    url: Optional[str] = None
+    success: bool = True
+    error: Optional[str] = None
+    invalidURLs: Optional[List[str]] = None
+
+class BatchScrapeStatusResponse(pydantic.BaseModel):
+    """Response from batch scrape status checks."""
+    success: bool = True
+    status: Literal["scraping", "completed", "failed", "cancelled"]
+    completed: int
+    total: int
+    creditsUsed: int
+    expiresAt: datetime
+    next: Optional[str] = None
+    data: List[FirecrawlDocument]
+
+class CrawlParams(pydantic.BaseModel):
+    """Parameters for crawling operations."""
+    includePaths: Optional[List[str]] = None
+    excludePaths: Optional[List[str]] = None
+    maxDepth: Optional[int] = None
+    maxDiscoveryDepth: Optional[int] = None
+    limit: Optional[int] = None
+    allowBackwardLinks: Optional[bool] = None
+    allowExternalLinks: Optional[bool] = None
+    ignoreSitemap: Optional[bool] = None
+    scrapeOptions: Optional[CommonOptions] = None
+    webhook: Optional[Union[str, WebhookConfig]] = None
+    deduplicateSimilarURLs: Optional[bool] = None
+    ignoreQueryParameters: Optional[bool] = None
+    regexOnFullURL: Optional[bool] = None
+
+class CrawlResponse(pydantic.BaseModel):
+    """Response from crawling operations."""
+    id: Optional[str] = None
+    url: Optional[str] = None
+    success: bool = True
+    error: Optional[str] = None
+
+class CrawlStatusResponse(pydantic.BaseModel):
+    """Response from crawl status checks."""
+    success: bool = True
+    status: Literal["scraping", "completed", "failed", "cancelled"]
+    completed: int
+    total: int
+    creditsUsed: int
+    expiresAt: datetime
+    next: Optional[str] = None
+    data: List[FirecrawlDocument]
+
+class CrawlErrorsResponse(pydantic.BaseModel):
+    """Response from crawl/batch scrape error monitoring."""
+    errors: List[Dict[str, str]]  # {id: str, timestamp: str, url: str, error: str}
+    robotsBlocked: List[str]
+
+class MapParams(pydantic.BaseModel):
+    """Parameters for mapping operations."""
+    search: Optional[str] = None
+    ignoreSitemap: Optional[bool] = None
+    includeSubdomains: Optional[bool] = None
+    sitemapOnly: Optional[bool] = None
+    limit: Optional[int] = None
+    timeout: Optional[int] = None
+
+class MapResponse(pydantic.BaseModel):
+    """Response from mapping operations."""
+    success: bool = True
+    links: Optional[List[str]] = None
+    error: Optional[str] = None
+
+class ExtractParams(pydantic.BaseModel):
+    """Parameters for extracting information from URLs."""
+    prompt: Optional[str] = None
+    schema: Optional[Any] = None
+    systemPrompt: Optional[str] = None
+    allowExternalLinks: Optional[bool] = None
+    enableWebSearch: Optional[bool] = None
+    includeSubdomains: Optional[bool] = None
+    origin: Optional[str] = None
+    showSources: Optional[bool] = None
+    scrapeOptions: Optional[CommonOptions] = None
+
+class ExtractResponse(pydantic.BaseModel, Generic[T]):
+    """Response from extract operations."""
+    success: bool = True
+    data: Optional[T] = None
+    error: Optional[str] = None
+    warning: Optional[str] = None
+    sources: Optional[List[str]] = None
+
 class SearchParams(pydantic.BaseModel):
     query: str
     limit: Optional[int] = 5
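The hunk above replaces the loose dict-based options of 1.16.0 with typed Pydantic models. As a minimal usage sketch (not part of the diff itself; it assumes these classes are importable from the firecrawl.firecrawl module shown here), the new option and action models compose like this before being passed to the client methods rewritten further down:

    # Hypothetical sketch built only from the model definitions in the hunk above.
    from firecrawl.firecrawl import CommonOptions, ExtractConfig, WaitAction, ScrollAction

    scrape_options = CommonOptions(formats=["markdown", "links"], onlyMainContent=True)
    extract_config = ExtractConfig(prompt="List the product names on the page")
    actions = [
        WaitAction(type="wait", milliseconds=1500),
        ScrollAction(type="scroll", direction="down"),
    ]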
@@ -31,7 +309,14 @@ class SearchParams(pydantic.BaseModel):
     location: Optional[str] = None
     origin: Optional[str] = "api"
     timeout: Optional[int] = 60000
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[CommonOptions] = None
+
+class SearchResponse(pydantic.BaseModel):
+    """Response from search operations."""
+    success: bool = True
+    data: List[FirecrawlDocument]
+    warning: Optional[str] = None
+    error: Optional[str] = None
 
 class GenerateLLMsTextParams(pydantic.BaseModel):
     """
@@ -75,6 +360,24 @@ class DeepResearchStatusResponse(pydantic.BaseModel):
     sources: List[Dict[str, Any]]
     summaries: List[str]
 
+class GenerateLLMsTextResponse(pydantic.BaseModel):
+    """Response from LLMs.txt generation operations."""
+    success: bool = True
+    id: str
+    error: Optional[str] = None
+
+class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
+    llmstxt: str
+    llmsfulltxt: Optional[str] = None
+
+class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
+    """Status response from LLMs.txt generation operations."""
+    success: bool = True
+    data: Optional[GenerateLLMsTextStatusResponseData] = None
+    status: Literal["processing", "completed", "failed"]
+    error: Optional[str] = None
+    expiresAt: str
+
 class ChangeTrackingData(pydantic.BaseModel):
     """
     Data for the change tracking format.
@@ -84,41 +387,39 @@ class ChangeTrackingData(pydantic.BaseModel):
     visibility: str # "visible" | "hidden"
     diff: Optional[Dict[str, Any]] = None
     json: Optional[Any] = None
+
+class SearchResponse(pydantic.BaseModel):
+    """
+    Response from the search operation.
+    """
+    success: bool
+    data: List[Dict[str, Any]]
+    warning: Optional[str] = None
+    error: Optional[str] = None
 
-class
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    show_sources: Optional[bool] = False
-
-
-
-
-class ExtractResponse(pydantic.BaseModel):
-    """
-    Response from the extract operation.
-    """
-    success: bool
-    data: Optional[Any] = None
-    error: Optional[str] = None
+class ExtractParams(pydantic.BaseModel):
+    """
+    Parameters for the extract operation.
+    """
+    prompt: Optional[str] = None
+    schema: Optional[Any] = pydantic.Field(None, alias='schema')
+    system_prompt: Optional[str] = None
+    allow_external_links: Optional[bool] = False
+    enable_web_search: Optional[bool] = False
+    # Just for backwards compatibility
+    enableWebSearch: Optional[bool] = False
+    show_sources: Optional[bool] = False
+    agent: Optional[Dict[str, Any]] = None
+
+class ExtractResponse(pydantic.BaseModel, Generic[T]):
+    """
+    Response from the extract operation.
+    """
+    success: bool
+    data: Optional[T] = None
+    error: Optional[str] = None
 
+class FirecrawlApp:
     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         """
         Initialize the FirecrawlApp instance with API key, API URL.
@@ -137,196 +438,451 @@ class FirecrawlApp:
 
         logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
 
-    def scrape_url(
+    def scrape_url(
+            self,
+            url: str,
+            *,
+            formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+            include_tags: Optional[List[str]] = None,
+            exclude_tags: Optional[List[str]] = None,
+            only_main_content: Optional[bool] = None,
+            wait_for: Optional[int] = None,
+            timeout: Optional[int] = None,
+            location: Optional[LocationConfig] = None,
+            mobile: Optional[bool] = None,
+            skip_tls_verification: Optional[bool] = None,
+            remove_base64_images: Optional[bool] = None,
+            block_ads: Optional[bool] = None,
+            proxy: Optional[Literal["basic", "stealth"]] = None,
+            extract: Optional[ExtractConfig] = None,
+            json_options: Optional[ExtractConfig] = None,
+            actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+            **kwargs) -> ScrapeResponse[Any]:
         """
-        Scrape
+        Scrape and extract content from a URL.
 
         Args:
-
-
+            url (str): Target URL to scrape
+            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
+            include_tags (Optional[List[str]]): HTML tags to include
+            exclude_tags (Optional[List[str]]): HTML tags to exclude
+            only_main_content (Optional[bool]): Extract main content only
+            wait_for (Optional[int]): Wait for a specific element to appear
+            timeout (Optional[int]): Request timeout (ms)
+            location (Optional[LocationConfig]): Location configuration
+            mobile (Optional[bool]): Use mobile user agent
+            skip_tls_verification (Optional[bool]): Skip TLS verification
+            remove_base64_images (Optional[bool]): Remove base64 images
+            block_ads (Optional[bool]): Block ads
+            proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
+            extract (Optional[ExtractConfig]): Content extraction settings
+            json_options (Optional[ExtractConfig]): JSON extraction settings
+            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
+
 
         Returns:
-
+            ScrapeResponse with:
+            * Requested content formats
+            * Page metadata
+            * Extraction results
+            * Success/error status
 
         Raises:
-
+            Exception: If scraping fails
         """
-
         headers = self._prepare_headers()
 
-        #
-        scrape_params = {
-
-
-
-        # Handle extract (for v1)
-        extract = params.get('extract', {})
-        if extract:
-            if 'schema' in extract and hasattr(extract['schema'], 'schema'):
-                extract['schema'] = extract['schema'].schema()
-            scrape_params['extract'] = extract
-
-        # Include any other params directly at the top level of scrape_params
-        for key, value in params.items():
-            if key not in ['extract']:
-                scrape_params[key] = value
-
-        json = params.get("jsonOptions", {})
-        if json:
-            if 'schema' in json and hasattr(json['schema'], 'schema'):
-                json['schema'] = json['schema'].schema()
-            scrape_params['jsonOptions'] = json
-
-        change_tracking = params.get("changeTrackingOptions", {})
-        if change_tracking:
-            scrape_params['changeTrackingOptions'] = change_tracking
-
-        # Include any other params directly at the top level of scrape_params
-        for key, value in params.items():
-            if key not in ['jsonOptions', 'changeTrackingOptions']:
-                scrape_params[key] = value
-
+        # Build scrape parameters
+        scrape_params = {
+            'url': url,
+            'origin': f"python-sdk@{version}"
+        }
 
-
-
+        # Add optional parameters if provided
+        if formats:
+            scrape_params['formats'] = formats
+        if include_tags:
+            scrape_params['includeTags'] = include_tags
+        if exclude_tags:
+            scrape_params['excludeTags'] = exclude_tags
+        if only_main_content is not None:
+            scrape_params['onlyMainContent'] = only_main_content
+        if wait_for:
+            scrape_params['waitFor'] = wait_for
+        if timeout:
+            scrape_params['timeout'] = timeout
+        if location:
+            scrape_params['location'] = location.dict(exclude_none=True)
+        if mobile is not None:
+            scrape_params['mobile'] = mobile
+        if skip_tls_verification is not None:
+            scrape_params['skipTlsVerification'] = skip_tls_verification
+        if remove_base64_images is not None:
+            scrape_params['removeBase64Images'] = remove_base64_images
+        if block_ads is not None:
+            scrape_params['blockAds'] = block_ads
+        if proxy:
+            scrape_params['proxy'] = proxy
+        if extract:
+            if hasattr(extract.schema, 'schema'):
+                extract.schema = extract.schema.schema()
+            scrape_params['extract'] = extract.dict(exclude_none=True)
+        if json_options:
+            if hasattr(json_options.schema, 'schema'):
+                json_options.schema = json_options.schema.schema()
+            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+        if actions:
+            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+        scrape_params.update(kwargs)
+
+        # Make request
         response = requests.post(
-            f'{self.api_url}
+            f'{self.api_url}/v1/scrape',
             headers=headers,
             json=scrape_params,
-            timeout=(
+            timeout=(timeout + 5000 if timeout else None)
         )
+
         if response.status_code == 200:
             try:
-
-
-
-
-
-
-
-
-                raise Exception(
+                response_json = response.json()
+                if response_json.get('success') and 'data' in response_json:
+                    return ScrapeResponse(**response_json['data'])
+                elif "error" in response_json:
+                    raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
+                else:
+                    raise Exception(f'Failed to scrape URL. Error: {response_json}')
+            except ValueError:
+                raise Exception('Failed to parse Firecrawl response as JSON.')
         else:
             self._handle_error(response, 'scrape URL')
 
-    def search(
+    def search(
+            self,
+            query: str,
+            *,
+            limit: Optional[int] = None,
+            tbs: Optional[str] = None,
+            filter: Optional[str] = None,
+            lang: Optional[str] = None,
+            country: Optional[str] = None,
+            location: Optional[str] = None,
+            timeout: Optional[int] = None,
+            scrape_options: Optional[CommonOptions] = None,
+            params: Optional[Union[Dict[str, Any], SearchParams]] = None,
+            **kwargs) -> SearchResponse:
         """
-        Search for content using
+        Search for content using Firecrawl.
 
         Args:
-            query (str):
-
+            query (str): Search query string
+            limit (Optional[int]): Max results (default: 5)
+            tbs (Optional[str]): Time filter (e.g. "qdr:d")
+            filter (Optional[str]): Custom result filter
+            lang (Optional[str]): Language code (default: "en")
+            country (Optional[str]): Country code (default: "us")
+            location (Optional[str]): Geo-targeting
+            timeout (Optional[int]): Request timeout in milliseconds
+            scrape_options (Optional[CommonOptions]): Result scraping configuration
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
+            **kwargs: Additional keyword arguments for future compatibility
 
         Returns:
-
+            SearchResponse: Response containing:
+            * success (bool): Whether request succeeded
+            * data (List[FirecrawlDocument]): Search results
+            * warning (Optional[str]): Warning message if any
+            * error (Optional[str]): Error message if any
+
+        Raises:
+            Exception: If search fails or response cannot be parsed
         """
-
-
+        # Build search parameters
+        search_params = {}
+        if params:
+            if isinstance(params, dict):
+                search_params.update(params)
+            else:
+                search_params.update(params.dict(exclude_none=True))
+
+        # Add individual parameters
+        if limit is not None:
+            search_params['limit'] = limit
+        if tbs is not None:
+            search_params['tbs'] = tbs
+        if filter is not None:
+            search_params['filter'] = filter
+        if lang is not None:
+            search_params['lang'] = lang
+        if country is not None:
+            search_params['country'] = country
+        if location is not None:
+            search_params['location'] = location
+        if timeout is not None:
+            search_params['timeout'] = timeout
+        if scrape_options is not None:
+            search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+
+        # Add any additional kwargs
+        search_params.update(kwargs)
 
-
-
-
-
-        search_params.query = query
+        # Create final params object
+        final_params = SearchParams(query=query, **search_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['origin'] = f"python-sdk@{version}"
 
+        # Make request
         response = requests.post(
             f"{self.api_url}/v1/search",
             headers={"Authorization": f"Bearer {self.api_key}"},
-            json=
+            json=params_dict
         )
 
-        if response.status_code
-
-
-
-
-
-
-
-
-
-
-
+        if response.status_code == 200:
+            try:
+                response_json = response.json()
+                if response_json.get('success') and 'data' in response_json:
+                    return SearchResponse(**response_json)
+                elif "error" in response_json:
+                    raise Exception(f'Search failed. Error: {response_json["error"]}')
+                else:
+                    raise Exception(f'Search failed. Error: {response_json}')
+            except ValueError:
+                raise Exception('Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, 'search')
+
+    def crawl_url(
+            self,
+            url: str,
+            *,
+            include_paths: Optional[List[str]] = None,
+            exclude_paths: Optional[List[str]] = None,
+            max_depth: Optional[int] = None,
+            max_discovery_depth: Optional[int] = None,
+            limit: Optional[int] = None,
+            allow_backward_links: Optional[bool] = None,
+            allow_external_links: Optional[bool] = None,
+            ignore_sitemap: Optional[bool] = None,
+            scrape_options: Optional[CommonOptions] = None,
+            webhook: Optional[Union[str, WebhookConfig]] = None,
+            deduplicate_similar_urls: Optional[bool] = None,
+            ignore_query_parameters: Optional[bool] = None,
+            regex_on_full_url: Optional[bool] = None,
+            poll_interval: Optional[int] = 2,
+            idempotency_key: Optional[str] = None,
+            **kwargs
+    ) -> CrawlStatusResponse:
         """
-
+        Crawl a website starting from a URL.
 
         Args:
-            url (str):
-
-
-
+            url (str): Target URL to start crawling from
+            include_paths (Optional[List[str]]): Patterns of URLs to include
+            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
+            max_depth (Optional[int]): Maximum crawl depth
+            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
+            limit (Optional[int]): Maximum pages to crawl
+            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_external_links (Optional[bool]): Follow external domain links
+            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
+            scrape_options (Optional[CommonOptions]): Page scraping configuration
+            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
+            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
+            ignore_query_parameters (Optional[bool]): Ignore URL parameters
+            regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            poll_interval (Optional[int]): Seconds between status checks (default: 2)
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API
 
         Returns:
-
-
-
-
-                - 'total' (int): Total number of scraped pages.
-                - 'creditsUsed' (int): Estimated number of API credits used for this crawl.
-                - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the crawl data expires.
-                - 'data' (List[Dict]): List of all the scraped pages.
+            CrawlStatusResponse with:
+            * Crawling status and progress
+            * Crawled page contents
+            * Success/error information
 
         Raises:
-            Exception: If
+            Exception: If crawl fails
         """
-
+        crawl_params = {}
+
+        # Add individual parameters
+        if include_paths is not None:
+            crawl_params['includePaths'] = include_paths
+        if exclude_paths is not None:
+            crawl_params['excludePaths'] = exclude_paths
+        if max_depth is not None:
+            crawl_params['maxDepth'] = max_depth
+        if max_discovery_depth is not None:
+            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
+        if limit is not None:
+            crawl_params['limit'] = limit
+        if allow_backward_links is not None:
+            crawl_params['allowBackwardLinks'] = allow_backward_links
+        if allow_external_links is not None:
+            crawl_params['allowExternalLinks'] = allow_external_links
+        if ignore_sitemap is not None:
+            crawl_params['ignoreSitemap'] = ignore_sitemap
+        if scrape_options is not None:
+            crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+        if webhook is not None:
+            crawl_params['webhook'] = webhook
+        if deduplicate_similar_urls is not None:
+            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
+        if ignore_query_parameters is not None:
+            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
+        if regex_on_full_url is not None:
+            crawl_params['regexOnFullURL'] = regex_on_full_url
+
+        # Add any additional kwargs
+        crawl_params.update(kwargs)
+
+        # Create final params object
+        final_params = CrawlParams(**crawl_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
         headers = self._prepare_headers(idempotency_key)
-
-
-        json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
+
         if response.status_code == 200:
             try:
                 id = response.json().get('id')
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
             return self._monitor_job_status(id, headers, poll_interval)
-
         else:
             self._handle_error(response, 'start crawl job')
 
-
-
+    def async_crawl_url(
+            self,
+            url: str,
+            *,
+            include_paths: Optional[List[str]] = None,
+            exclude_paths: Optional[List[str]] = None,
+            max_depth: Optional[int] = None,
+            max_discovery_depth: Optional[int] = None,
+            limit: Optional[int] = None,
+            allow_backward_links: Optional[bool] = None,
+            allow_external_links: Optional[bool] = None,
+            ignore_sitemap: Optional[bool] = None,
+            scrape_options: Optional[CommonOptions] = None,
+            webhook: Optional[Union[str, WebhookConfig]] = None,
+            deduplicate_similar_urls: Optional[bool] = None,
+            ignore_query_parameters: Optional[bool] = None,
+            regex_on_full_url: Optional[bool] = None,
+            idempotency_key: Optional[str] = None,
+            **kwargs
+    ) -> CrawlResponse:
         """
-
+        Start an asynchronous crawl job.
 
         Args:
-            url (str):
-
-
+            url (str): Target URL to start crawling from
+            include_paths (Optional[List[str]]): Patterns of URLs to include
+            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
+            max_depth (Optional[int]): Maximum crawl depth
+            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
+            limit (Optional[int]): Maximum pages to crawl
+            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_external_links (Optional[bool]): Follow external domain links
+            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
+            scrape_options (Optional[CommonOptions]): Page scraping configuration
+            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
+            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
+            ignore_query_parameters (Optional[bool]): Ignore URL parameters
+            regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API
 
         Returns:
-
-
-
-
+            CrawlResponse with:
+            * success - Whether crawl started successfully
+            * id - Unique identifier for the crawl job
+            * url - Status check URL for the crawl
+            * error - Error message if start failed
+
+        Raises:
+            Exception: If crawl initiation fails
         """
-
+        crawl_params = {}
+
+        # Add individual parameters
+        if include_paths is not None:
+            crawl_params['includePaths'] = include_paths
+        if exclude_paths is not None:
+            crawl_params['excludePaths'] = exclude_paths
+        if max_depth is not None:
+            crawl_params['maxDepth'] = max_depth
+        if max_discovery_depth is not None:
+            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
+        if limit is not None:
+            crawl_params['limit'] = limit
+        if allow_backward_links is not None:
+            crawl_params['allowBackwardLinks'] = allow_backward_links
+        if allow_external_links is not None:
+            crawl_params['allowExternalLinks'] = allow_external_links
+        if ignore_sitemap is not None:
+            crawl_params['ignoreSitemap'] = ignore_sitemap
+        if scrape_options is not None:
+            crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+        if webhook is not None:
+            crawl_params['webhook'] = webhook
+        if deduplicate_similar_urls is not None:
+            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
+        if ignore_query_parameters is not None:
+            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
+        if regex_on_full_url is not None:
+            crawl_params['regexOnFullURL'] = regex_on_full_url
+
+        # Add any additional kwargs
+        crawl_params.update(kwargs)
+
+        # Create final params object
+        final_params = CrawlParams(**crawl_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
         headers = self._prepare_headers(idempotency_key)
-
-
-        json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
+
         if response.status_code == 200:
             try:
-                return response.json()
+                return CrawlResponse(**response.json())
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
             self._handle_error(response, 'start crawl job')
 
-    def check_crawl_status(self, id: str) ->
+    def check_crawl_status(self, id: str) -> CrawlStatusResponse:
         """
-        Check the status of a crawl job
+        Check the status and results of a crawl job.
 
         Args:
-            id
+            id: Unique identifier for the crawl job
 
         Returns:
-
+            CrawlStatusResponse containing:
+
+            Status Information:
+            * status - Current state (scraping/completed/failed/cancelled)
+            * completed - Number of pages crawled
+            * total - Total pages to crawl
+            * creditsUsed - API credits consumed
+            * expiresAt - Data expiration timestamp
+
+            Results:
+            * data - List of crawled documents
+            * next - URL for next page of results (if paginated)
+            * success - Whether status check succeeded
+            * error - Error message if failed
 
         Raises:
-            Exception: If
+            Exception: If status check fails
         """
         endpoint = f'/v1/crawl/{id}'
 
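The hunk above rewrites scrape_url, search, crawl_url and async_crawl_url from a single params dict to keyword-only arguments that are validated through the new models. A minimal hedged sketch of the resulting call style, built only from the signatures shown in the diff (the API key value is a placeholder, and a configured FirecrawlApp named app is assumed in later sketches):

    from firecrawl.firecrawl import FirecrawlApp, CommonOptions

    app = FirecrawlApp(api_key="fc-YOUR-KEY")

    # scrape_url: options are now keyword-only instead of a params dict
    doc = app.scrape_url("https://example.com", formats=["markdown", "links"], only_main_content=True)

    # search: keyword arguments are folded into a SearchParams object internally
    results = app.search("firecrawl python sdk", limit=3, scrape_options=CommonOptions(formats=["markdown"]))

    # crawl_url: starts a crawl and polls until completion, returning CrawlStatusResponse
    status = app.crawl_url("https://example.com", limit=5, max_depth=2, poll_interval=2)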
@@ -378,28 +934,37 @@ class FirecrawlApp:
             if 'next' in status_data:
                 response['next'] = status_data['next']
 
-            return
-
+            return CrawlStatusResponse(
+                success=False if 'error' in status_data else True,
                 **response
-
+            )
         else:
             self._handle_error(response, 'check crawl status')
 
-    def check_crawl_errors(self, id: str) ->
+    def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
         """
         Returns information about crawl errors.
 
         Args:
-            id (str): The ID of the crawl job
+            id (str): The ID of the crawl job
 
         Returns:
-
+            CrawlErrorsResponse containing:
+            * errors (List[Dict[str, str]]): List of errors with fields:
+                - id (str): Error ID
+                - timestamp (str): When the error occurred
+                - url (str): URL that caused the error
+                - error (str): Error message
+            * robotsBlocked (List[str]): List of URLs blocked by robots.txt
+
+        Raises:
+            Exception: If error check fails
         """
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
         if response.status_code == 200:
             try:
-                return response.json()
+                return CrawlErrorsResponse(**response.json())
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
@@ -407,13 +972,18 @@ class FirecrawlApp:
 
     def cancel_crawl(self, id: str) -> Dict[str, Any]:
         """
-        Cancel an asynchronous crawl job
+        Cancel an asynchronous crawl job.
 
         Args:
-            id (str): The ID of the crawl job to cancel
+            id (str): The ID of the crawl job to cancel
 
         Returns:
-            Dict[str, Any]:
+            Dict[str, Any] containing:
+            * success (bool): Whether cancellation was successful
+            * error (str, optional): Error message if cancellation failed
+
+        Raises:
+            Exception: If cancellation fails
         """
         headers = self._prepare_headers()
         response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
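The two hunks above switch the status helpers from raw dicts to typed responses (check_crawl_status returns CrawlStatusResponse, check_crawl_errors returns CrawlErrorsResponse). A small polling sketch under that assumption, with app again standing in for a configured FirecrawlApp:

    import time

    job = app.async_crawl_url("https://example.com", limit=10)   # CrawlResponse with an id
    while True:
        status = app.check_crawl_status(job.id)                  # CrawlStatusResponse
        if status.status in ("completed", "failed", "cancelled"):
            break
        time.sleep(2)
    print(f"{status.completed}/{status.total} pages crawled, credits used: {status.creditsUsed}")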
@@ -425,154 +995,524 @@ class FirecrawlApp:
|
|
|
425
995
|
else:
|
|
426
996
|
self._handle_error(response, "cancel crawl job")
|
|
427
997
|
|
|
428
|
-
def crawl_url_and_watch(
|
|
998
|
+
def crawl_url_and_watch(
|
|
999
|
+
self,
|
|
1000
|
+
url: str,
|
|
1001
|
+
*,
|
|
1002
|
+
include_paths: Optional[List[str]] = None,
|
|
1003
|
+
exclude_paths: Optional[List[str]] = None,
|
|
1004
|
+
max_depth: Optional[int] = None,
|
|
1005
|
+
max_discovery_depth: Optional[int] = None,
|
|
1006
|
+
limit: Optional[int] = None,
|
|
1007
|
+
allow_backward_links: Optional[bool] = None,
|
|
1008
|
+
allow_external_links: Optional[bool] = None,
|
|
1009
|
+
ignore_sitemap: Optional[bool] = None,
|
|
1010
|
+
scrape_options: Optional[CommonOptions] = None,
|
|
1011
|
+
webhook: Optional[Union[str, WebhookConfig]] = None,
|
|
1012
|
+
deduplicate_similar_urls: Optional[bool] = None,
|
|
1013
|
+
ignore_query_parameters: Optional[bool] = None,
|
|
1014
|
+
regex_on_full_url: Optional[bool] = None,
|
|
1015
|
+
idempotency_key: Optional[str] = None,
|
|
1016
|
+
**kwargs
|
|
1017
|
+
) -> 'CrawlWatcher':
|
|
429
1018
|
"""
|
|
430
1019
|
Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
|
|
431
1020
|
|
|
432
1021
|
Args:
|
|
433
|
-
url (str):
|
|
434
|
-
|
|
435
|
-
|
|
1022
|
+
url (str): Target URL to start crawling from
|
|
1023
|
+
include_paths (Optional[List[str]]): Patterns of URLs to include
|
|
1024
|
+
exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
|
1025
|
+
max_depth (Optional[int]): Maximum crawl depth
|
|
1026
|
+
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
1027
|
+
limit (Optional[int]): Maximum pages to crawl
|
|
1028
|
+
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
1029
|
+
allow_external_links (Optional[bool]): Follow external domain links
|
|
1030
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
1031
|
+
scrape_options (Optional[CommonOptions]): Page scraping configuration
|
|
1032
|
+
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
|
1033
|
+
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
1034
|
+
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
1035
|
+
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
1036
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1037
|
+
**kwargs: Additional parameters to pass to the API
|
|
436
1038
|
|
|
437
1039
|
Returns:
|
|
438
|
-
CrawlWatcher: An instance
|
|
1040
|
+
CrawlWatcher: An instance to monitor the crawl job via WebSocket
|
|
1041
|
+
|
|
1042
|
+
Raises:
|
|
1043
|
+
Exception: If crawl job fails to start
|
|
439
1044
|
"""
|
|
440
|
-
crawl_response = self.async_crawl_url(
|
|
441
|
-
|
|
442
|
-
|
|
1045
|
+
crawl_response = self.async_crawl_url(
|
|
1046
|
+
url,
|
|
1047
|
+
include_paths=include_paths,
|
|
1048
|
+
exclude_paths=exclude_paths,
|
|
1049
|
+
max_depth=max_depth,
|
|
1050
|
+
max_discovery_depth=max_discovery_depth,
|
|
1051
|
+
limit=limit,
|
|
1052
|
+
allow_backward_links=allow_backward_links,
|
|
1053
|
+
allow_external_links=allow_external_links,
|
|
1054
|
+
ignore_sitemap=ignore_sitemap,
|
|
1055
|
+
scrape_options=scrape_options,
|
|
1056
|
+
webhook=webhook,
|
|
1057
|
+
deduplicate_similar_urls=deduplicate_similar_urls,
|
|
1058
|
+
ignore_query_parameters=ignore_query_parameters,
|
|
1059
|
+
regex_on_full_url=regex_on_full_url,
|
|
1060
|
+
idempotency_key=idempotency_key,
|
|
1061
|
+
**kwargs
|
|
1062
|
+
)
|
|
1063
|
+
if crawl_response.success and crawl_response.id:
|
|
1064
|
+
return CrawlWatcher(crawl_response.id, self)
|
|
443
1065
|
else:
|
|
444
1066
|
raise Exception("Crawl job failed to start")
|
|
445
1067
|
|
|
446
|
-
def map_url(
|
|
1068
|
+
def map_url(
|
|
1069
|
+
self,
|
|
1070
|
+
url: str,
|
|
1071
|
+
*,
|
|
1072
|
+
search: Optional[str] = None,
|
|
1073
|
+
ignore_sitemap: Optional[bool] = None,
|
|
1074
|
+
include_subdomains: Optional[bool] = None,
|
|
1075
|
+
sitemap_only: Optional[bool] = None,
|
|
1076
|
+
limit: Optional[int] = None,
|
|
1077
|
+
timeout: Optional[int] = None,
|
|
1078
|
+
params: Optional[MapParams] = None) -> MapResponse:
|
|
447
1079
|
"""
|
|
448
|
-
|
|
1080
|
+
Map and discover links from a URL.
|
|
449
1081
|
|
|
450
1082
|
Args:
|
|
451
|
-
url (str):
|
|
452
|
-
|
|
1083
|
+
url (str): Target URL to map
|
|
1084
|
+
search (Optional[str]): Filter pattern for URLs
|
|
1085
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
1086
|
+
include_subdomains (Optional[bool]): Include subdomain links
|
|
1087
|
+
sitemap_only (Optional[bool]): Only use sitemap.xml
|
|
1088
|
+
limit (Optional[int]): Maximum URLs to return
|
|
1089
|
+
timeout (Optional[int]): Request timeout in milliseconds
|
|
1090
|
+
params (Optional[MapParams]): Additional mapping parameters
|
|
453
1091
|
|
|
454
1092
|
Returns:
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
1093
|
+
MapResponse: Response containing:
|
|
1094
|
+
* success (bool): Whether request succeeded
|
|
1095
|
+
* links (List[str]): Discovered URLs
|
|
1096
|
+
* error (Optional[str]): Error message if any
|
|
459
1097
|
|
|
460
|
-
|
|
461
|
-
|
|
1098
|
+
Raises:
|
|
1099
|
+
Exception: If mapping fails or response cannot be parsed
|
|
1100
|
+
"""
|
|
1101
|
+
# Build map parameters
|
|
1102
|
+
map_params = {}
|
|
462
1103
|
if params:
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
#
|
|
1104
|
+
map_params.update(params.dict(exclude_none=True))
|
|
1105
|
+
|
|
1106
|
+
# Add individual parameters
|
|
1107
|
+
if search is not None:
|
|
1108
|
+
map_params['search'] = search
|
|
1109
|
+
if ignore_sitemap is not None:
|
|
1110
|
+
map_params['ignoreSitemap'] = ignore_sitemap
|
|
1111
|
+
if include_subdomains is not None:
|
|
1112
|
+
map_params['includeSubdomains'] = include_subdomains
|
|
1113
|
+
if sitemap_only is not None:
|
|
1114
|
+
map_params['sitemapOnly'] = sitemap_only
|
|
1115
|
+
if limit is not None:
|
|
1116
|
+
map_params['limit'] = limit
|
|
1117
|
+
if timeout is not None:
|
|
1118
|
+
map_params['timeout'] = timeout
|
|
1119
|
+
|
|
1120
|
+
# Create final params object
|
|
1121
|
+
final_params = MapParams(**map_params)
|
|
1122
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
1123
|
+
params_dict['url'] = url
|
|
1124
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
1125
|
+
|
|
1126
|
+
# Make request
|
|
466
1127
|
response = requests.post(
|
|
467
|
-
f
|
|
468
|
-
headers=
|
|
469
|
-
json=
|
|
1128
|
+
f"{self.api_url}/v1/map",
|
|
1129
|
+
headers={"Authorization": f"Bearer {self.api_key}"},
|
|
1130
|
+
json=params_dict
|
|
470
1131
|
)
|
|
1132
|
+
|
|
471
1133
|
if response.status_code == 200:
|
|
472
1134
|
try:
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
raise Exception(
|
|
1135
|
+
response_json = response.json()
|
|
1136
|
+
if response_json.get('success') and 'links' in response_json:
|
|
1137
|
+
return MapResponse(**response_json)
|
|
1138
|
+
elif "error" in response_json:
|
|
1139
|
+
raise Exception(f'Map failed. Error: {response_json["error"]}')
|
|
1140
|
+
else:
|
|
1141
|
+
raise Exception(f'Map failed. Error: {response_json}')
|
|
1142
|
+
except ValueError:
|
|
1143
|
+
raise Exception('Failed to parse Firecrawl response as JSON.')
|
|
482
1144
|
else:
|
|
483
1145
|
self._handle_error(response, 'map')
|
|
484
1146
|
|
|
485
|
-
def batch_scrape_urls(
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
1147
|
+
def batch_scrape_urls(
|
|
1148
|
+
self,
|
|
1149
|
+
urls: List[str],
|
|
1150
|
+
*,
|
|
1151
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
1152
|
+
headers: Optional[Dict[str, str]] = None,
|
|
1153
|
+
include_tags: Optional[List[str]] = None,
|
|
1154
|
+
exclude_tags: Optional[List[str]] = None,
|
|
1155
|
+
only_main_content: Optional[bool] = None,
|
|
1156
|
+
wait_for: Optional[int] = None,
|
|
1157
|
+
timeout: Optional[int] = None,
|
|
1158
|
+
location: Optional[LocationConfig] = None,
|
|
1159
|
+
mobile: Optional[bool] = None,
|
|
1160
|
+
skip_tls_verification: Optional[bool] = None,
|
|
1161
|
+
remove_base64_images: Optional[bool] = None,
|
|
1162
|
+
block_ads: Optional[bool] = None,
|
|
1163
|
+
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
1164
|
+
extract: Optional[ExtractConfig] = None,
|
|
1165
|
+
json_options: Optional[ExtractConfig] = None,
|
|
1166
|
+
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        agent: Optional[AgentOptions] = None,
+        poll_interval: Optional[int] = 2,
+        idempotency_key: Optional[str] = None,
+        **kwargs
+    ) -> BatchScrapeStatusResponse:
         """
-
+        Batch scrape multiple URLs and monitor until completion.

         Args:
-            urls (List[str]):
-
-
-
+            urls (List[str]): URLs to scrape
+            formats (Optional[List[Literal]]): Content formats to retrieve
+            headers (Optional[Dict[str, str]]): Custom HTTP headers
+            include_tags (Optional[List[str]]): HTML tags to include
+            exclude_tags (Optional[List[str]]): HTML tags to exclude
+            only_main_content (Optional[bool]): Extract main content only
+            wait_for (Optional[int]): Wait time in milliseconds
+            timeout (Optional[int]): Request timeout in milliseconds
+            location (Optional[LocationConfig]): Location configuration
+            mobile (Optional[bool]): Use mobile user agent
+            skip_tls_verification (Optional[bool]): Skip TLS verification
+            remove_base64_images (Optional[bool]): Remove base64 encoded images
+            block_ads (Optional[bool]): Block advertisements
+            proxy (Optional[Literal]): Proxy type to use
+            extract (Optional[ExtractConfig]): Content extraction config
+            json_options (Optional[ExtractConfig]): JSON extraction config
+            actions (Optional[List[Union]]): Actions to perform
+            agent (Optional[AgentOptions]): Agent configuration
+            poll_interval (Optional[int]): Seconds between status checks (default: 2)
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API

         Returns:
-
-
-
-
-            - 'total' (int): Total number of scraped pages.
-            - 'creditsUsed' (int): Estimated number of API credits used for this batch scrape.
-            - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires.
-            - 'data' (List[Dict]): List of all the scraped pages.
+            BatchScrapeStatusResponse with:
+            * Scraping status and progress
+            * Scraped content for each URL
+            * Success/error information

         Raises:
-            Exception: If
+            Exception: If batch scrape fails
         """
-
+        scrape_params = {}
+
+        # Add individual parameters
+        if formats is not None:
+            scrape_params['formats'] = formats
+        if headers is not None:
+            scrape_params['headers'] = headers
+        if include_tags is not None:
+            scrape_params['includeTags'] = include_tags
+        if exclude_tags is not None:
+            scrape_params['excludeTags'] = exclude_tags
+        if only_main_content is not None:
+            scrape_params['onlyMainContent'] = only_main_content
+        if wait_for is not None:
+            scrape_params['waitFor'] = wait_for
+        if timeout is not None:
+            scrape_params['timeout'] = timeout
+        if location is not None:
+            scrape_params['location'] = location.dict(exclude_none=True)
+        if mobile is not None:
+            scrape_params['mobile'] = mobile
+        if skip_tls_verification is not None:
+            scrape_params['skipTlsVerification'] = skip_tls_verification
+        if remove_base64_images is not None:
+            scrape_params['removeBase64Images'] = remove_base64_images
+        if block_ads is not None:
+            scrape_params['blockAds'] = block_ads
+        if proxy is not None:
+            scrape_params['proxy'] = proxy
+        if extract is not None:
+            if hasattr(extract.schema, 'schema'):
+                extract.schema = extract.schema.schema()
+            scrape_params['extract'] = extract.dict(exclude_none=True)
+        if json_options is not None:
+            if hasattr(json_options.schema, 'schema'):
+                json_options.schema = json_options.schema.schema()
+            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+        if actions is not None:
+            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+        if agent is not None:
+            scrape_params['agent'] = agent.dict(exclude_none=True)
+
+        # Add any additional kwargs
+        scrape_params.update(kwargs)
+
+        # Create final params object
+        final_params = ScrapeParams(**scrape_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['urls'] = urls
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
         headers = self._prepare_headers(idempotency_key)
-
-
-        json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
+
         if response.status_code == 200:
             try:
                 id = response.json().get('id')
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
             return self._monitor_job_status(id, headers, poll_interval)
-
         else:
             self._handle_error(response, 'start batch scrape job')
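
Example (illustrative, not part of the diff). A minimal sketch of calling the new keyword-only `batch_scrape_urls` signature above; the API key and URLs are placeholders, and the result fields used are the ones this diff builds into the batch status response:

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    # Scrape two pages and block until the batch finishes (poll_interval defaults to 2 seconds).
    status = app.batch_scrape_urls(
        ["https://example.com", "https://example.com/about"],  # placeholder URLs
        formats=["markdown", "html"],
        only_main_content=True,
    )
    print(status.status, status.completed, status.total)
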

-
-
+    def async_batch_scrape_urls(
+        self,
+        urls: List[str],
+        *,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        headers: Optional[Dict[str, str]] = None,
+        include_tags: Optional[List[str]] = None,
+        exclude_tags: Optional[List[str]] = None,
+        only_main_content: Optional[bool] = None,
+        wait_for: Optional[int] = None,
+        timeout: Optional[int] = None,
+        location: Optional[LocationConfig] = None,
+        mobile: Optional[bool] = None,
+        skip_tls_verification: Optional[bool] = None,
+        remove_base64_images: Optional[bool] = None,
+        block_ads: Optional[bool] = None,
+        proxy: Optional[Literal["basic", "stealth"]] = None,
+        extract: Optional[ExtractConfig] = None,
+        json_options: Optional[ExtractConfig] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        agent: Optional[AgentOptions] = None,
+        idempotency_key: Optional[str] = None,
+        **kwargs
+    ) -> BatchScrapeResponse:
         """
-        Initiate a
+        Initiate a batch scrape job asynchronously.

         Args:
-            urls (List[str]):
-
-
+            urls (List[str]): URLs to scrape
+            formats (Optional[List[Literal]]): Content formats to retrieve
+            headers (Optional[Dict[str, str]]): Custom HTTP headers
+            include_tags (Optional[List[str]]): HTML tags to include
+            exclude_tags (Optional[List[str]]): HTML tags to exclude
+            only_main_content (Optional[bool]): Extract main content only
+            wait_for (Optional[int]): Wait time in milliseconds
+            timeout (Optional[int]): Request timeout in milliseconds
+            location (Optional[LocationConfig]): Location configuration
+            mobile (Optional[bool]): Use mobile user agent
+            skip_tls_verification (Optional[bool]): Skip TLS verification
+            remove_base64_images (Optional[bool]): Remove base64 encoded images
+            block_ads (Optional[bool]): Block advertisements
+            proxy (Optional[Literal]): Proxy type to use
+            extract (Optional[ExtractConfig]): Content extraction config
+            json_options (Optional[ExtractConfig]): JSON extraction config
+            actions (Optional[List[Union]]): Actions to perform
+            agent (Optional[AgentOptions]): Agent configuration
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API

         Returns:
-
-
-
-
+            BatchScrapeResponse with:
+            * success - Whether job started successfully
+            * id - Unique identifier for the job
+            * url - Status check URL
+            * error - Error message if start failed
+
+        Raises:
+            Exception: If job initiation fails
         """
-
+        scrape_params = {}
+
+        # Add individual parameters
+        if formats is not None:
+            scrape_params['formats'] = formats
+        if headers is not None:
+            scrape_params['headers'] = headers
+        if include_tags is not None:
+            scrape_params['includeTags'] = include_tags
+        if exclude_tags is not None:
+            scrape_params['excludeTags'] = exclude_tags
+        if only_main_content is not None:
+            scrape_params['onlyMainContent'] = only_main_content
+        if wait_for is not None:
+            scrape_params['waitFor'] = wait_for
+        if timeout is not None:
+            scrape_params['timeout'] = timeout
+        if location is not None:
+            scrape_params['location'] = location.dict(exclude_none=True)
+        if mobile is not None:
+            scrape_params['mobile'] = mobile
+        if skip_tls_verification is not None:
+            scrape_params['skipTlsVerification'] = skip_tls_verification
+        if remove_base64_images is not None:
+            scrape_params['removeBase64Images'] = remove_base64_images
+        if block_ads is not None:
+            scrape_params['blockAds'] = block_ads
+        if proxy is not None:
+            scrape_params['proxy'] = proxy
+        if extract is not None:
+            if hasattr(extract.schema, 'schema'):
+                extract.schema = extract.schema.schema()
+            scrape_params['extract'] = extract.dict(exclude_none=True)
+        if json_options is not None:
+            if hasattr(json_options.schema, 'schema'):
+                json_options.schema = json_options.schema.schema()
+            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+        if actions is not None:
+            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+        if agent is not None:
+            scrape_params['agent'] = agent.dict(exclude_none=True)
+
+        # Add any additional kwargs
+        scrape_params.update(kwargs)
+
+        # Create final params object
+        final_params = ScrapeParams(**scrape_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['urls'] = urls
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
         headers = self._prepare_headers(idempotency_key)
-
-
-        json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
+
         if response.status_code == 200:
             try:
-                return response.json()
+                return BatchScrapeResponse(**response.json())
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
            self._handle_error(response, 'start batch scrape job')
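
Example (illustrative, not part of the diff). A sketch of starting a batch asynchronously and checking it later, assuming `app` was constructed as in the earlier example; the `success`/`id` fields are the ones listed in the docstring above:

    job = app.async_batch_scrape_urls(
        ["https://example.com"],          # placeholder URL
        formats=["markdown"],
    )
    if job.success:
        # Poll the job by id instead of blocking on _monitor_job_status.
        status = app.check_batch_scrape_status(job.id)
        print(status.status, status.creditsUsed)
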

-    def batch_scrape_urls_and_watch(
+    def batch_scrape_urls_and_watch(
+        self,
+        urls: List[str],
+        *,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        headers: Optional[Dict[str, str]] = None,
+        include_tags: Optional[List[str]] = None,
+        exclude_tags: Optional[List[str]] = None,
+        only_main_content: Optional[bool] = None,
+        wait_for: Optional[int] = None,
+        timeout: Optional[int] = None,
+        location: Optional[LocationConfig] = None,
+        mobile: Optional[bool] = None,
+        skip_tls_verification: Optional[bool] = None,
+        remove_base64_images: Optional[bool] = None,
+        block_ads: Optional[bool] = None,
+        proxy: Optional[Literal["basic", "stealth"]] = None,
+        extract: Optional[ExtractConfig] = None,
+        json_options: Optional[ExtractConfig] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        agent: Optional[AgentOptions] = None,
+        idempotency_key: Optional[str] = None,
+        **kwargs
+    ) -> 'CrawlWatcher':
         """
         Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.

         Args:
-            urls (List[str]):
-
-
+            urls (List[str]): URLs to scrape
+            formats (Optional[List[Literal]]): Content formats to retrieve
+            headers (Optional[Dict[str, str]]): Custom HTTP headers
+            include_tags (Optional[List[str]]): HTML tags to include
+            exclude_tags (Optional[List[str]]): HTML tags to exclude
+            only_main_content (Optional[bool]): Extract main content only
+            wait_for (Optional[int]): Wait time in milliseconds
+            timeout (Optional[int]): Request timeout in milliseconds
+            location (Optional[LocationConfig]): Location configuration
+            mobile (Optional[bool]): Use mobile user agent
+            skip_tls_verification (Optional[bool]): Skip TLS verification
+            remove_base64_images (Optional[bool]): Remove base64 encoded images
+            block_ads (Optional[bool]): Block advertisements
+            proxy (Optional[Literal]): Proxy type to use
+            extract (Optional[ExtractConfig]): Content extraction config
+            json_options (Optional[ExtractConfig]): JSON extraction config
+            actions (Optional[List[Union]]): Actions to perform
+            agent (Optional[AgentOptions]): Agent configuration
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API

         Returns:
-            CrawlWatcher: An instance
+            CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
+
+        Raises:
+            Exception: If batch scrape job fails to start
         """
-
-
-
+        scrape_params = {}
+
+        # Add individual parameters
+        if formats is not None:
+            scrape_params['formats'] = formats
+        if headers is not None:
+            scrape_params['headers'] = headers
+        if include_tags is not None:
+            scrape_params['includeTags'] = include_tags
+        if exclude_tags is not None:
+            scrape_params['excludeTags'] = exclude_tags
+        if only_main_content is not None:
+            scrape_params['onlyMainContent'] = only_main_content
+        if wait_for is not None:
+            scrape_params['waitFor'] = wait_for
+        if timeout is not None:
+            scrape_params['timeout'] = timeout
+        if location is not None:
+            scrape_params['location'] = location.dict(exclude_none=True)
+        if mobile is not None:
+            scrape_params['mobile'] = mobile
+        if skip_tls_verification is not None:
+            scrape_params['skipTlsVerification'] = skip_tls_verification
+        if remove_base64_images is not None:
+            scrape_params['removeBase64Images'] = remove_base64_images
+        if block_ads is not None:
+            scrape_params['blockAds'] = block_ads
+        if proxy is not None:
+            scrape_params['proxy'] = proxy
+        if extract is not None:
+            if hasattr(extract.schema, 'schema'):
+                extract.schema = extract.schema.schema()
+            scrape_params['extract'] = extract.dict(exclude_none=True)
+        if json_options is not None:
+            if hasattr(json_options.schema, 'schema'):
+                json_options.schema = json_options.schema.schema()
+            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+        if actions is not None:
+            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+        if agent is not None:
+            scrape_params['agent'] = agent.dict(exclude_none=True)
+
+        # Add any additional kwargs
+        scrape_params.update(kwargs)
+
+        # Create final params object
+        final_params = ScrapeParams(**scrape_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['urls'] = urls
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
+        headers = self._prepare_headers(idempotency_key)
+        response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
+
+        if response.status_code == 200:
+            try:
+                crawl_response = BatchScrapeResponse(**response.json())
+                if crawl_response.success and crawl_response.id:
+                    return CrawlWatcher(crawl_response.id, self)
+                else:
+                    raise Exception("Batch scrape job failed to start")
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
-
+            self._handle_error(response, 'start batch scrape job')
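
Example (illustrative, not part of the diff). A minimal sketch of consuming the returned `CrawlWatcher`, using only `add_event_listener` and `connect` as defined later in this diff; the event names come from the watcher's `event_handlers` keys:

    import asyncio

    watcher = app.batch_scrape_urls_and_watch(
        ["https://example.com"],          # placeholder URL
        formats=["markdown"],
    )
    watcher.add_event_listener("document", lambda detail: print("scraped page for job", detail["id"]))
    watcher.add_event_listener("done", lambda detail: print("batch finished:", detail["status"]))
    asyncio.run(watcher.connect())  # blocks until the WebSocket reports completion
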

-    def check_batch_scrape_status(self, id: str) ->
+    def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
         """
         Check the status of a batch scrape job using the Firecrawl API.

@@ -580,7 +1520,7 @@ class FirecrawlApp:
             id (str): The ID of the batch scrape job.

         Returns:
-
+            BatchScrapeStatusResponse: The status of the batch scrape job.

         Raises:
             Exception: If the status check request fails.
@@ -620,29 +1560,21 @@ class FirecrawlApp:
                     break
            status_data['data'] = data

-
+            return BatchScrapeStatusResponse(**{
+                'success': False if 'error' in status_data else True,
                 'status': status_data.get('status'),
                 'total': status_data.get('total'),
                 'completed': status_data.get('completed'),
                 'creditsUsed': status_data.get('creditsUsed'),
                 'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data')
-
-
-
-                response['error'] = status_data['error']
-
-            if 'next' in status_data:
-                response['next'] = status_data['next']
-
-            return {
-                'success': False if 'error' in status_data else True,
-                **response
-            }
+                'data': status_data.get('data'),
+                'next': status_data.get('next'),
+                'error': status_data.get('error')
+            })
         else:
             self._handle_error(response, 'check batch scrape status')

-    def check_batch_scrape_errors(self, id: str) ->
+    def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
         """
         Returns information about batch scrape errors.

@@ -650,38 +1582,68 @@ class FirecrawlApp:
             id (str): The ID of the crawl job.

         Returns:
-
+            CrawlErrorsResponse: A response containing:
+            * errors (List[Dict[str, str]]): List of errors with fields:
+                * id (str): Error ID
+                * timestamp (str): When the error occurred
+                * url (str): URL that caused the error
+                * error (str): Error message
+            * robotsBlocked (List[str]): List of URLs blocked by robots.txt
+
+        Raises:
+            Exception: If the error check request fails
         """
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
         if response.status_code == 200:
             try:
-                return response.json()
+                return CrawlErrorsResponse(**response.json())
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
             self._handle_error(response, "check batch scrape errors")
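
Example (illustrative, not part of the diff). A short sketch of inspecting failures for a batch job, assuming the `CrawlErrorsResponse` model exposes the `errors` and `robotsBlocked` fields named in the docstring above and that `job.id` comes from an earlier `async_batch_scrape_urls` call:

    report = app.check_batch_scrape_errors(job.id)
    for err in report.errors:
        print(err["url"], "->", err["error"])     # per-URL failure reason
    print("blocked by robots.txt:", report.robotsBlocked)
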

-    def extract(
+    def extract(
+        self,
+        urls: Optional[List[str]] = None,
+        *,
+        prompt: Optional[str] = None,
+        schema: Optional[Any] = None,
+        system_prompt: Optional[str] = None,
+        allow_external_links: Optional[bool] = False,
+        enable_web_search: Optional[bool] = False,
+        show_sources: Optional[bool] = False,
+        agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
-
+        Extract structured information from URLs.

         Args:
-            urls (Optional[List[str]]):
-
+            urls (Optional[List[str]]): URLs to extract from
+            prompt (Optional[str]): Custom extraction prompt
+            schema (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration

         Returns:
-
+            ExtractResponse[Any] with:
+            * success (bool): Whether request succeeded
+            * data (Optional[Any]): Extracted data matching schema
+            * error (Optional[str]): Error message if any
+
+        Raises:
+            ValueError: If prompt/schema missing or extraction fails
         """
         headers = self._prepare_headers()

-        if not
+        if not prompt and not schema:
             raise ValueError("Either prompt or schema is required")

-        if not urls and not
+        if not urls and not prompt:
             raise ValueError("Either urls or prompt is required")

-        schema = params.get('schema')
         if schema:
             if hasattr(schema, 'model_json_schema'):
                 # Convert Pydantic model to JSON schema
@@ -689,23 +1651,22 @@ class FirecrawlApp:
             # Otherwise assume it's already a JSON schema dict

         request_data = {
-            'urls': urls,
-            'allowExternalLinks':
-            'enableWebSearch':
-            'showSources':
+            'urls': urls or [],
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
             'schema': schema,
-            'origin': '
+            'origin': f'python-sdk@{get_version()}'
         }

-        if not request_data['urls']:
-            request_data['urls'] = []
         # Only add prompt and systemPrompt if they exist
-        if
-            request_data['prompt'] =
-        if
-            request_data['systemPrompt'] =
-
-
+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt
+
+        if agent:
+            request_data['agent'] = agent

         try:
             # Send the initial extract request
@@ -736,10 +1697,7 @@ class FirecrawlApp:
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
             if status_data['status'] == 'completed':
-
-                return status_data
-            else:
-                raise Exception(f'Failed to extract. Error: {status_data["error"]}')
+                return ExtractResponse(**status_data)
             elif status_data['status'] in ['failed', 'cancelled']:
                 raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
             else:
@@ -753,9 +1711,9 @@ class FirecrawlApp:
         except Exception as e:
             raise ValueError(str(e), 500)

-        return
+        return ExtractResponse(success=False, error="Internal server error.")
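
Example (illustrative, not part of the diff). A sketch of the schema path shown above: a Pydantic model is accepted directly because `extract` converts it with `model_json_schema()`. The model, prompt, and URL are placeholders:

    from pydantic import BaseModel

    class Product(BaseModel):
        name: str
        price: str

    result = app.extract(
        ["https://example.com/product"],              # placeholder URL
        prompt="Extract the product name and price",
        schema=Product,                                # converted via model_json_schema()
    )
    if result.success:
        print(result.data)
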

-    def get_extract_status(self, job_id: str) ->
+    def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
         """
         Retrieve the status of an extract job.

@@ -763,7 +1721,7 @@ class FirecrawlApp:
             job_id (str): The ID of the extract job.

         Returns:
-
+            ExtractResponse[Any]: The status of the extract job.

         Raises:
             ValueError: If there is an error retrieving the status.
@@ -773,7 +1731,7 @@ class FirecrawlApp:
             response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
             if response.status_code == 200:
                 try:
-                    return response.json()
+                    return ExtractResponse(**response.json())
                 except:
                     raise Exception(f'Failed to parse Firecrawl response as JSON.')
             else:
@@ -781,43 +1739,71 @@ class FirecrawlApp:
         except Exception as e:
             raise ValueError(str(e), 500)

-    def async_extract(
+    def async_extract(
+        self,
+        urls: List[str],
+        *,
+        prompt: Optional[str] = None,
+        schema: Optional[Any] = None,
+        system_prompt: Optional[str] = None,
+        allow_external_links: Optional[bool] = False,
+        enable_web_search: Optional[bool] = False,
+        show_sources: Optional[bool] = False,
+        agent: Optional[Dict[str, Any]] = None,
+        idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
         """
         Initiate an asynchronous extract job.

         Args:
-            urls (List[str]):
-
-
+            urls (List[str]): URLs to extract information from
+            prompt (Optional[str]): Custom extraction prompt
+            schema (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests

         Returns:
-
+            ExtractResponse[Any] with:
+            * success (bool): Whether request succeeded
+            * data (Optional[Any]): Extracted data matching schema
+            * error (Optional[str]): Error message if any

         Raises:
-            ValueError: If
+            ValueError: If job initiation fails
         """
         headers = self._prepare_headers(idempotency_key)

-        schema =
+        schema = schema
         if schema:
             if hasattr(schema, 'model_json_schema'):
                 # Convert Pydantic model to JSON schema
                 schema = schema.model_json_schema()
             # Otherwise assume it's already a JSON schema dict

-        jsonData = {'urls': urls, **(params or {})}
         request_data = {
-
-            'allowExternalLinks':
+            'urls': urls,
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
             'schema': schema,
-            'origin': '
+            'origin': f'python-sdk@{version}'
         }

+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt
+        if agent:
+            request_data['agent'] = agent
+
         try:
             response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
             if response.status_code == 200:
                 try:
-                    return response.json()
+                    return ExtractResponse(**response.json())
                 except:
                     raise Exception(f'Failed to parse Firecrawl response as JSON.')
             else:
@@ -825,34 +1811,44 @@ class FirecrawlApp:
         except Exception as e:
             raise ValueError(str(e), 500)
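
Example (illustrative, not part of the diff). A sketch of the asynchronous extract flow; note that the docstring above only guarantees `success`, `data`, and `error` on `ExtractResponse`, so reading a job `id` off the start response is an assumption about what `/v1/extract` returns:

    job = app.async_extract(
        ["https://example.com/product"],               # placeholder URL
        prompt="Extract the product name and price",
        schema=Product,                                 # Pydantic model from the earlier sketch
    )
    # Assumption: the start response carries the job id used by get_extract_status.
    status = app.get_extract_status(job.id)
    print(status.success, status.data)
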

-    def generate_llms_text(
+    def generate_llms_text(
+        self,
+        url: str,
+        *,
+        max_urls: Optional[int] = None,
+        show_full_text: Optional[bool] = None,
+        experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
         """
         Generate LLMs.txt for a given URL and poll until completion.

         Args:
-            url (str):
-
+            url (str): Target URL to generate LLMs.txt from
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming

         Returns:
-
-
-
-
-
-            - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires.
+            GenerateLLMsTextStatusResponse with:
+            * Generated LLMs.txt content
+            * Full version if requested
+            * Generation status
+            * Success/error information

         Raises:
-            Exception: If
+            Exception: If generation fails
         """
-
-
-
-
-
-        else:
-            generation_params = params
+        params = GenerateLLMsTextParams(
+            maxUrls=max_urls,
+            showFullText=show_full_text,
+            __experimental_stream=experimental_stream
+        )

-        response = self.async_generate_llms_text(
+        response = self.async_generate_llms_text(
+            url,
+            max_urls=max_urls,
+            show_full_text=show_full_text,
+            experimental_stream=experimental_stream
+        )
         if not response.get('success') or 'id' not in response:
             return response

@@ -871,32 +1867,40 @@ class FirecrawlApp:

         return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}

-    def async_generate_llms_text(
+    def async_generate_llms_text(
+        self,
+        url: str,
+        *,
+        max_urls: Optional[int] = None,
+        show_full_text: Optional[bool] = None,
+        experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation operation.

         Args:
-            url (str): The URL to generate LLMs.txt from.
-
+            url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming

         Returns:
-
-
-
+            GenerateLLMsTextResponse: A response containing:
+            * success (bool): Whether the generation initiation was successful
+            * id (str): The unique identifier for the generation job
+            * error (str, optional): Error message if initiation failed

         Raises:
             Exception: If the generation job initiation fails.
         """
-
-
-
-
-
-        else:
-            generation_params = params
+        params = GenerateLLMsTextParams(
+            maxUrls=max_urls,
+            showFullText=show_full_text,
+            __experimental_stream=experimental_stream
+        )

         headers = self._prepare_headers()
-        json_data = {'url': url, **
+        json_data = {'url': url, **params.dict(exclude_none=True)}
+        json_data['origin'] = f"python-sdk@{version}"

         try:
             response = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
@@ -912,15 +1916,22 @@ class FirecrawlApp:

             return {'success': False, 'error': 'Internal server error'}
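
Example (illustrative, not part of the diff). A sketch of generating LLMs.txt and reading the result; the polling loop's return shape is not visible in this hunk, so treating the completed result as a mapping with `data.llmstxt` (as described in `check_generate_llms_text_status` below) is an assumption:

    result = app.generate_llms_text(
        "https://example.com",    # placeholder URL
        max_urls=5,
        show_full_text=True,
    )
    if result.get("success"):
        print(result["data"]["llmstxt"])       # assumed shape, see status docstring below
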

-    def check_generate_llms_text_status(self, id: str) ->
+    def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
         """
         Check the status of a LLMs.txt generation operation.

         Args:
-            id (str): The
+            id (str): The unique identifier of the LLMs.txt generation job to check status for.

         Returns:
-
+            GenerateLLMsTextStatusResponse: A response containing:
+            * success (bool): Whether the generation was successful
+            * status (str): Status of generation ("processing", "completed", "failed")
+            * data (Dict[str, str], optional): Generated text with fields:
+                * llmstxt (str): Generated LLMs.txt content
+                * llmsfulltxt (str, optional): Full version if requested
+            * error (str, optional): Error message if generation failed
+            * expiresAt (str): When the generated data expires

         Raises:
             Exception: If the status check fails.
@@ -942,7 +1953,9 @@ class FirecrawlApp:

             return {'success': False, 'error': 'Internal server error'}

-    def _prepare_headers(
+    def _prepare_headers(
+        self,
+        idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.

@@ -964,11 +1977,13 @@ class FirecrawlApp:
             'Authorization': f'Bearer {self.api_key}',
         }

-    def _post_request(
-
-
-
-
+    def _post_request(
+        self,
+        url: str,
+        data: Dict[str, Any],
+        headers: Dict[str, str],
+        retries: int = 3,
+        backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a POST request with retries.

@@ -993,10 +2008,12 @@ class FirecrawlApp:
                 return response
         return response

-    def _get_request(
-
-
-
+    def _get_request(
+        self,
+        url: str,
+        headers: Dict[str, str],
+        retries: int = 3,
+        backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a GET request with retries.

@@ -1020,10 +2037,12 @@ class FirecrawlApp:
                 return response
         return response

-    def _delete_request(
-
-
-
+    def _delete_request(
+        self,
+        url: str,
+        headers: Dict[str, str],
+        retries: int = 3,
+        backoff_factor: float = 0.5) -> requests.Response:
         """
         Make a DELETE request with retries.

@@ -1047,16 +2066,21 @@ class FirecrawlApp:
                 return response
         return response

-    def _monitor_job_status(
+    def _monitor_job_status(
+        self,
+        id: str,
+        headers: Dict[str, str],
+        poll_interval: int) -> CrawlStatusResponse:
         """
         Monitor the status of a crawl job until completion.

         Args:
             id (str): The ID of the crawl job.
             headers (Dict[str, str]): The headers to include in the status check requests.
-            poll_interval (int):
+            poll_interval (int): Seconds between status checks.
+
         Returns:
-
+            CrawlStatusResponse: The crawl results if the job is completed successfully.

         Raises:
             Exception: If the job fails or an error occurs during status checks.
@@ -1083,7 +2107,7 @@ class FirecrawlApp:
                         raise Exception(f'Failed to parse Firecrawl response as JSON.')
                     data.extend(status_data.get('data', []))
                     status_data['data'] = data
-                    return status_data
+                    return CrawlStatusResponse(**status_data)
                 else:
                     raise Exception('Crawl job completed but no data was returned')
             elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
@@ -1094,7 +2118,10 @@ class FirecrawlApp:
             else:
                 self._handle_error(status_response, 'check crawl status')

-    def _handle_error(
+    def _handle_error(
+        self,
+        response: requests.Response,
+        action: str) -> None:
         """
         Handle errors from API responses.

@@ -1111,49 +2138,100 @@ class FirecrawlApp:
         except:
             raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)

-
-        if response.status_code == 402:
-            message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
-        elif response.status_code == 403:
-            message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
-        elif response.status_code == 408:
-            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
-        elif response.status_code == 409:
-            message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
-        elif response.status_code == 500:
-            message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
-        else:
-            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
+        message = self._get_error_message(response.status_code, action, error_message, error_details)

         # Raise an HTTPError with the custom message and attach the response
         raise requests.exceptions.HTTPError(message, response=response)

-    def
-
-
+    def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
+        """
+        Generate a standardized error message based on HTTP status code.
+
+        Args:
+            status_code (int): The HTTP status code from the response
+            action (str): Description of the action that was being performed
+            error_message (str): The error message from the API response
+            error_details (str): Additional error details from the API response
+
+        Returns:
+            str: A formatted error message
+        """
+        if status_code == 402:
+            return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
+        elif status_code == 403:
+            return f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
+        elif status_code == 408:
+            return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
+        elif status_code == 409:
+            return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
+        elif status_code == 500:
+            return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
+        else:
+            return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
+
+    def deep_research(
+        self,
+        query: str,
+        *,
+        max_depth: Optional[int] = None,
+        time_limit: Optional[int] = None,
+        max_urls: Optional[int] = None,
+        analysis_prompt: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        __experimental_stream_steps: Optional[bool] = None,
+        on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
+        on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
         """
         Initiates a deep research operation on a given query and polls until completion.

         Args:
-            query (str):
-
-
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+            on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
+            on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}

         Returns:
-
+            DeepResearchStatusResponse containing:
+            * success (bool): Whether research completed successfully
+            * status (str): Current state (processing/completed/failed)
+            * error (Optional[str]): Error message if failed
+            * id (str): Unique identifier for the research job
+            * data (Any): Research findings and analysis
+            * sources (List[Dict]): List of discovered sources
+            * activities (List[Dict]): Research progress log
+            * summaries (List[str]): Generated research summaries

         Raises:
-            Exception: If
+            Exception: If research fails
         """
-
-
-
-        if
-            research_params =
-
-            research_params =
-
-
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)
+
+        response = self.async_deep_research(
+            query,
+            max_depth=max_depth,
+            time_limit=time_limit,
+            max_urls=max_urls,
+            analysis_prompt=analysis_prompt,
+            system_prompt=system_prompt
+        )
         if not response.get('success') or 'id' not in response:
             return response

@@ -1186,31 +2264,57 @@ class FirecrawlApp:
             time.sleep(2)  # Polling interval

         return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
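
Example (illustrative, not part of the diff). A sketch of a blocking deep research call with a progress callback; the callback payload keys follow the docstring above, and because the failure paths in this hunk return plain dicts, the result is read with `.get`:

    def on_activity(activity):
        # activity carries {type, status, message, timestamp, depth} per the docstring above
        print(f"[{activity['type']}] {activity['message']}")

    research = app.deep_research(
        "Recent changes in web scraping best practices",   # placeholder query
        max_depth=3,
        time_limit=120,
        max_urls=10,
        on_activity=on_activity,
    )
    if research.get("success"):
        print(research["data"])
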
-
+
+    def async_deep_research(
+        self,
+        query: str,
+        *,
+        max_depth: Optional[int] = None,
+        time_limit: Optional[int] = None,
+        max_urls: Optional[int] = None,
+        analysis_prompt: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
         """
         Initiates an asynchronous deep research operation.

         Args:
-            query (str):
-
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming

         Returns:
-            Dict[str, Any]:
+            Dict[str, Any]: A response containing:
+            * success (bool): Whether the research initiation was successful
+            * id (str): The unique identifier for the research job
+            * error (str, optional): Error message if initiation failed

         Raises:
             Exception: If the research initiation fails.
         """
-
-
-
-        if
-            research_params =
-
-            research_params =
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)

         headers = self._prepare_headers()

         json_data = {'query': query, **research_params.dict(exclude_none=True)}
+        json_data['origin'] = f"python-sdk@{version}"

         # Handle json options schema if present
         if 'jsonOptions' in json_data:
@@ -1232,7 +2336,7 @@ class FirecrawlApp:

             return {'success': False, 'error': 'Internal server error'}

-    def check_deep_research_status(self, id: str) ->
+    def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
         """
         Check the status of a deep research operation.

@@ -1240,7 +2344,19 @@ class FirecrawlApp:
             id (str): The ID of the deep research operation.

         Returns:
-
+            DeepResearchResponse containing:
+
+            Status:
+            * success - Whether research completed successfully
+            * status - Current state (processing/completed/failed)
+            * error - Error message if failed
+
+            Results:
+            * id - Unique identifier for the research job
+            * data - Research findings and analysis
+            * sources - List of discovered sources
+            * activities - Research progress log
+            * summaries - Generated research summaries

         Raises:
             Exception: If the status check fails.
@@ -1263,6 +2379,17 @@ class FirecrawlApp:
             return {'success': False, 'error': 'Internal server error'}
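
Example (illustrative, not part of the diff). A sketch of starting research without blocking and polling its status by id; `async_deep_research` is documented above to return a dict with `success` and `id`, and the status result is read defensively since the error paths return plain dicts:

    job = app.async_deep_research(
        "History of the Python packaging ecosystem",   # placeholder query
        max_urls=5,
    )
    if job.get("success"):
        status = app.check_deep_research_status(job["id"])
        print(status.get("status"), len(status.get("sources", [])))
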

 class CrawlWatcher:
+    """
+    A class to watch and handle crawl job events via WebSocket connection.
+
+    Attributes:
+        id (str): The ID of the crawl job to watch
+        app (FirecrawlApp): The FirecrawlApp instance
+        data (List[Dict[str, Any]]): List of crawled documents/data
+        status (str): Current status of the crawl job
+        ws_url (str): WebSocket URL for the crawl job
+        event_handlers (dict): Dictionary of event type to list of handler functions
+    """
     def __init__(self, id: str, app: FirecrawlApp):
         self.id = id
         self.app = app
@@ -1275,25 +2402,57 @@ class CrawlWatcher:
             'document': []
         }

-    async def connect(self):
-
+    async def connect(self) -> None:
+        """
+        Establishes WebSocket connection and starts listening for messages.
+        """
+        async with websockets.connect(
+            self.ws_url,
+            additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
+        ) as websocket:
             await self._listen(websocket)

-    async def _listen(self, websocket):
+    async def _listen(self, websocket) -> None:
+        """
+        Listens for incoming WebSocket messages and handles them.
+
+        Args:
+            websocket: The WebSocket connection object
+        """
         async for message in websocket:
             msg = json.loads(message)
             await self._handle_message(msg)

-    def add_event_listener(self, event_type: str, handler):
+    def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
+        """
+        Adds an event handler function for a specific event type.
+
+        Args:
+            event_type (str): Type of event to listen for ('done', 'error', or 'document')
+            handler (Callable): Function to handle the event
+        """
         if event_type in self.event_handlers:
             self.event_handlers[event_type].append(handler)

-    def dispatch_event(self, event_type: str, detail: Dict[str, Any]):
+    def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
+        """
+        Dispatches an event to all registered handlers for that event type.
+
+        Args:
+            event_type (str): Type of event to dispatch
+            detail (Dict[str, Any]): Event details/data to pass to handlers
+        """
         if event_type in self.event_handlers:
             for handler in self.event_handlers[event_type]:
                 handler(detail)

-    async def _handle_message(self, msg: Dict[str, Any]):
+    async def _handle_message(self, msg: Dict[str, Any]) -> None:
+        """
+        Handles incoming WebSocket messages based on their type.
+
+        Args:
+            msg (Dict[str, Any]): The message to handle
+        """
         if msg['type'] == 'done':
             self.status = 'completed'
             self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
@@ -1308,3 +2467,1773 @@ class CrawlWatcher:
         elif msg['type'] == 'document':
             self.data.append(msg['data'])
             self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
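
Example (illustrative, not part of the diff). A sketch of handlers built around the event payloads dispatched by `_handle_message` above ('document' carries {'data', 'id'}, 'done' carries {'status', 'data', 'id'}); `watcher` is assumed to come from one of the *_and_watch methods shown earlier:

    import asyncio

    collected = []

    def on_document(detail):
        collected.append(detail["data"])            # one scraped page per event

    def on_done(detail):
        print("finished with", len(detail["data"]), "documents, status:", detail["status"])

    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("done", on_done)
    asyncio.run(watcher.connect())
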
+
+class AsyncFirecrawlApp(FirecrawlApp):
+    """
+    Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
+    Provides non-blocking alternatives to all FirecrawlApp operations.
+    """
+
+    async def _async_request(
+        self,
+        method: str,
+        url: str,
+        headers: Dict[str, str],
+        data: Optional[Dict[str, Any]] = None,
+        retries: int = 3,
+        backoff_factor: float = 0.5) -> Dict[str, Any]:
+        """
+        Generic async request method with exponential backoff retry logic.
+
+        Args:
+            method (str): The HTTP method to use (e.g., "GET" or "POST").
+            url (str): The URL to send the request to.
+            headers (Dict[str, str]): Headers to include in the request.
+            data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
+            retries (int): Maximum number of retry attempts (default: 3).
+            backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
+                Delay will be backoff_factor * (2 ** retry_count).
+
+        Returns:
+            Dict[str, Any]: The parsed JSON response from the server.
+
+        Raises:
+            aiohttp.ClientError: If the request fails after all retries.
+            Exception: If max retries are exceeded or other errors occur.
+        """
+        async with aiohttp.ClientSession() as session:
+            for attempt in range(retries):
+                try:
+                    async with session.request(
+                        method=method, url=url, headers=headers, json=data
+                    ) as response:
+                        if response.status == 502:
+                            await asyncio.sleep(backoff_factor * (2 ** attempt))
+                            continue
+                        if response.status >= 300:
+                            await self._handle_error(response, f"make {method} request")
+                        return await response.json()
+                except aiohttp.ClientError as e:
+                    if attempt == retries - 1:
+                        raise e
+                    await asyncio.sleep(backoff_factor * (2 ** attempt))
+            raise Exception("Max retries exceeded")
+
+    async def _async_post_request(
+        self, url: str, data: Dict[str, Any], headers: Dict[str, str],
+        retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
+        """
+        Make an async POST request with exponential backoff retry logic.
+
+        Args:
+            url (str): The URL to send the POST request to.
+            data (Dict[str, Any]): The JSON data to include in the request body.
+            headers (Dict[str, str]): Headers to include in the request.
+            retries (int): Maximum number of retry attempts (default: 3).
+            backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
+                Delay will be backoff_factor * (2 ** retry_count).
+
+        Returns:
+            Dict[str, Any]: The parsed JSON response from the server.
+
+        Raises:
+            aiohttp.ClientError: If the request fails after all retries.
+            Exception: If max retries are exceeded or other errors occur.
+        """
+        return await self._async_request("POST", url, headers, data, retries, backoff_factor)
+
+    async def _async_get_request(
+        self, url: str, headers: Dict[str, str],
+        retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
+        """
+        Make an async GET request with exponential backoff retry logic.
+
+        Args:
+            url (str): The URL to send the GET request to.
+            headers (Dict[str, str]): Headers to include in the request.
+            retries (int): Maximum number of retry attempts (default: 3).
+            backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
+                Delay will be backoff_factor * (2 ** retry_count).
+
+        Returns:
+            Dict[str, Any]: The parsed JSON response from the server.
+
+        Raises:
+            aiohttp.ClientError: If the request fails after all retries.
+            Exception: If max retries are exceeded or other errors occur.
+        """
+        return await self._async_request("GET", url, headers, None, retries, backoff_factor)
+
+    async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
+        """
+        Handle errors from async API responses with detailed error messages.
+
+        Args:
+            response (aiohttp.ClientResponse): The response object from the failed request
+            action (str): Description of the action that was being attempted
+
+        Raises:
+            aiohttp.ClientError: With a detailed error message based on the response status:
+                - 402: Payment Required
+                - 408: Request Timeout
+                - 409: Conflict
+                - 500: Internal Server Error
+                - Other: Unexpected error with status code
+        """
+        try:
+            error_data = await response.json()
+            error_message = error_data.get('error', 'No error message provided.')
+            error_details = error_data.get('details', 'No additional error details provided.')
+        except:
+            raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
+
+        message = await self._get_async_error_message(response.status, action, error_message, error_details)
+
+        raise aiohttp.ClientError(message)
+
+    async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
+        """
+        Generate a standardized error message based on HTTP status code for async operations.
+
+        Args:
+            status_code (int): The HTTP status code from the response
+            action (str): Description of the action that was being performed
+            error_message (str): The error message from the API response
+            error_details (str): Additional error details from the API response
+
+        Returns:
+            str: A formatted error message
+        """
+        return self._get_error_message(status_code, action, error_message, error_details)
+
+    async def crawl_url_and_watch(
+        self,
+        url: str,
+        params: Optional[CrawlParams] = None,
+        idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
+        """
+        Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
+
+        Args:
+            url (str): Target URL to start crawling from
+            params (Optional[CrawlParams]): See CrawlParams model for configuration:
+                URL Discovery:
+                * includePaths - Patterns of URLs to include
+                * excludePaths - Patterns of URLs to exclude
+                * maxDepth - Maximum crawl depth
+                * maxDiscoveryDepth - Maximum depth for finding new URLs
+                * limit - Maximum pages to crawl
+
+                Link Following:
+                * allowBackwardLinks - Follow parent directory links
+                * allowExternalLinks - Follow external domain links
+                * ignoreSitemap - Skip sitemap.xml processing
+
+                Advanced:
+                * scrapeOptions - Page scraping configuration
+                * webhook - Notification webhook settings
+                * deduplicateSimilarURLs - Remove similar URLs
+                * ignoreQueryParameters - Ignore URL parameters
+                * regexOnFullURL - Apply regex to full URLs
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+
+        Returns:
+            AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
+
+        Raises:
+            Exception: If crawl job fails to start
+        """
+        crawl_response = await self.async_crawl_url(url, params, idempotency_key)
+        if crawl_response.get('success') and 'id' in crawl_response:
+            return AsyncCrawlWatcher(crawl_response['id'], self)
+        else:
+            raise Exception("Crawl job failed to start")
|
|
2651
|
+
|
|
2652
|
+
async def batch_scrape_urls_and_watch(
|
|
2653
|
+
self,
|
|
2654
|
+
urls: List[str],
|
|
2655
|
+
params: Optional[ScrapeParams] = None,
|
|
2656
|
+
idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
|
|
2657
|
+
"""
|
|
2658
|
+
Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
|
|
2659
|
+
|
|
2660
|
+
Args:
|
|
2661
|
+
urls (List[str]): List of URLs to scrape
|
|
2662
|
+
params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
|
|
2663
|
+
|
|
2664
|
+
Content Options:
|
|
2665
|
+
* formats - Content formats to retrieve
|
|
2666
|
+
* includeTags - HTML tags to include
|
|
2667
|
+
* excludeTags - HTML tags to exclude
|
|
2668
|
+
* onlyMainContent - Extract main content only
|
|
2669
|
+
|
|
2670
|
+
Request Options:
|
|
2671
|
+
* headers - Custom HTTP headers
|
|
2672
|
+
* timeout - Request timeout (ms)
|
|
2673
|
+
* mobile - Use mobile user agent
|
|
2674
|
+
* proxy - Proxy type
|
|
2675
|
+
|
|
2676
|
+
Extraction Options:
|
|
2677
|
+
* extract - Content extraction config
|
|
2678
|
+
* jsonOptions - JSON extraction config
|
|
2679
|
+
* actions - Actions to perform
|
|
2680
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
2681
|
+
|
|
2682
|
+
Returns:
|
|
2683
|
+
AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
|
|
2684
|
+
|
|
2685
|
+
Raises:
|
|
2686
|
+
Exception: If batch scrape job fails to start
|
|
2687
|
+
"""
|
|
2688
|
+
batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
|
|
2689
|
+
if batch_response.get('success') and 'id' in batch_response:
|
|
2690
|
+
return AsyncCrawlWatcher(batch_response['id'], self)
|
|
2691
|
+
else:
|
|
2692
|
+
raise Exception("Batch scrape job failed to start")
|
|
2693
|
+
|
|
2694
|
+
async def scrape_url(
|
|
2695
|
+
self,
|
|
2696
|
+
url: str,
|
|
2697
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
2698
|
+
include_tags: Optional[List[str]] = None,
|
|
2699
|
+
exclude_tags: Optional[List[str]] = None,
|
|
2700
|
+
only_main_content: Optional[bool] = None,
|
|
2701
|
+
wait_for: Optional[int] = None,
|
|
2702
|
+
timeout: Optional[int] = None,
|
|
2703
|
+
location: Optional[LocationConfig] = None,
|
|
2704
|
+
mobile: Optional[bool] = None,
|
|
2705
|
+
skip_tls_verification: Optional[bool] = None,
|
|
2706
|
+
remove_base64_images: Optional[bool] = None,
|
|
2707
|
+
block_ads: Optional[bool] = None,
|
|
2708
|
+
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
2709
|
+
extract: Optional[ExtractConfig] = None,
|
|
2710
|
+
json_options: Optional[ExtractConfig] = None,
|
|
2711
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]:
|
|
2712
|
+
"""
|
|
2713
|
+
Scrape and extract content from a URL asynchronously.
|
|
2714
|
+
|
|
2715
|
+
Args:
|
|
2716
|
+
url (str): Target URL to scrape
|
|
2717
|
+
formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
|
|
2718
|
+
include_tags (Optional[List[str]]): HTML tags to include
|
|
2719
|
+
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
2720
|
+
only_main_content (Optional[bool]): Extract main content only
|
|
2721
|
+
wait_for (Optional[int]): Time in milliseconds to wait for the page to load before scraping
|
|
2722
|
+
timeout (Optional[int]): Request timeout (ms)
|
|
2723
|
+
location (Optional[LocationConfig]): Location configuration
|
|
2724
|
+
mobile (Optional[bool]): Use mobile user agent
|
|
2725
|
+
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
2726
|
+
remove_base64_images (Optional[bool]): Remove base64 images
|
|
2727
|
+
block_ads (Optional[bool]): Block ads
|
|
2728
|
+
proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
|
|
2729
|
+
extract (Optional[ExtractConfig]): Content extraction settings
|
|
2730
|
+
json_options (Optional[ExtractConfig]): JSON extraction settings
|
|
2731
|
+
actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
|
|
2732
|
+
|
|
2733
|
+
Returns:
|
|
2734
|
+
ScrapeResponse with:
|
|
2735
|
+
* Requested content formats
|
|
2736
|
+
* Page metadata
|
|
2737
|
+
* Extraction results
|
|
2738
|
+
* Success/error status
|
|
2739
|
+
|
|
2740
|
+
Raises:
|
|
2741
|
+
Exception: If scraping fails
|
|
2742
|
+
"""
|
|
2743
|
+
headers = self._prepare_headers()
|
|
2744
|
+
|
|
2745
|
+
# Build scrape parameters
|
|
2746
|
+
scrape_params = {
|
|
2747
|
+
'url': url,
|
|
2748
|
+
'origin': f"python-sdk@{version}"
|
|
2749
|
+
}
|
|
2750
|
+
|
|
2751
|
+
# Add optional parameters if provided and not None
|
|
2752
|
+
if formats:
|
|
2753
|
+
scrape_params['formats'] = formats
|
|
2754
|
+
if include_tags:
|
|
2755
|
+
scrape_params['includeTags'] = include_tags
|
|
2756
|
+
if exclude_tags:
|
|
2757
|
+
scrape_params['excludeTags'] = exclude_tags
|
|
2758
|
+
if only_main_content is not None:
|
|
2759
|
+
scrape_params['onlyMainContent'] = only_main_content
|
|
2760
|
+
if wait_for:
|
|
2761
|
+
scrape_params['waitFor'] = wait_for
|
|
2762
|
+
if timeout:
|
|
2763
|
+
scrape_params['timeout'] = timeout
|
|
2764
|
+
if location:
|
|
2765
|
+
scrape_params['location'] = location.dict(exclude_none=True)
|
|
2766
|
+
if mobile is not None:
|
|
2767
|
+
scrape_params['mobile'] = mobile
|
|
2768
|
+
if skip_tls_verification is not None:
|
|
2769
|
+
scrape_params['skipTlsVerification'] = skip_tls_verification
|
|
2770
|
+
if remove_base64_images is not None:
|
|
2771
|
+
scrape_params['removeBase64Images'] = remove_base64_images
|
|
2772
|
+
if block_ads is not None:
|
|
2773
|
+
scrape_params['blockAds'] = block_ads
|
|
2774
|
+
if proxy:
|
|
2775
|
+
scrape_params['proxy'] = proxy
|
|
2776
|
+
if extract:
|
|
2777
|
+
extract_dict = extract.dict(exclude_none=True)
|
|
2778
|
+
if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
|
|
2779
|
+
extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted
|
|
2780
|
+
scrape_params['extract'] = extract_dict
|
|
2781
|
+
if json_options:
|
|
2782
|
+
json_options_dict = json_options.dict(exclude_none=True)
|
|
2783
|
+
if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
|
|
2784
|
+
json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted
|
|
2785
|
+
scrape_params['jsonOptions'] = json_options_dict
|
|
2786
|
+
if actions:
|
|
2787
|
+
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
2788
|
+
|
|
2789
|
+
# Make async request
|
|
2790
|
+
endpoint = '/v1/scrape'
|
|
2791
|
+
response = await self._async_post_request(
|
|
2792
|
+
f'{self.api_url}{endpoint}',
|
|
2793
|
+
scrape_params,
|
|
2794
|
+
headers
|
|
2795
|
+
)
|
|
2796
|
+
|
|
2797
|
+
if response.get('success') and 'data' in response:
|
|
2798
|
+
return ScrapeResponse(**response['data'])
|
|
2799
|
+
elif "error" in response:
|
|
2800
|
+
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
|
2801
|
+
else:
|
|
2802
|
+
# Use the response content directly if possible, otherwise a generic message
|
|
2803
|
+
error_content = response.get('error', str(response))
|
|
2804
|
+
raise Exception(f'Failed to scrape URL. Error: {error_content}')
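Usage sketch (illustrative, not part of the diff) for the async scrape_url defined above. The client class name AsyncFirecrawlApp, the placeholder API key and the example URL are assumptions rather than content of this hunk.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name for the async client

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder credentials
    doc = await app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
    )
    # ScrapeResponse carries the requested formats plus page metadata (see docstring above);
    # the markdown attribute is assumed to back the "markdown" format.
    print(doc.markdown)

asyncio.run(main())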
|
|
2805
|
+
|
|
2806
|
+
async def batch_scrape_urls(
|
|
2807
|
+
self,
|
|
2808
|
+
urls: List[str],
|
|
2809
|
+
*,
|
|
2810
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
2811
|
+
headers: Optional[Dict[str, str]] = None,
|
|
2812
|
+
include_tags: Optional[List[str]] = None,
|
|
2813
|
+
exclude_tags: Optional[List[str]] = None,
|
|
2814
|
+
only_main_content: Optional[bool] = None,
|
|
2815
|
+
wait_for: Optional[int] = None,
|
|
2816
|
+
timeout: Optional[int] = None,
|
|
2817
|
+
location: Optional[LocationConfig] = None,
|
|
2818
|
+
mobile: Optional[bool] = None,
|
|
2819
|
+
skip_tls_verification: Optional[bool] = None,
|
|
2820
|
+
remove_base64_images: Optional[bool] = None,
|
|
2821
|
+
block_ads: Optional[bool] = None,
|
|
2822
|
+
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
2823
|
+
extract: Optional[ExtractConfig] = None,
|
|
2824
|
+
json_options: Optional[ExtractConfig] = None,
|
|
2825
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
2826
|
+
agent: Optional[AgentOptions] = None,
|
|
2827
|
+
poll_interval: Optional[int] = 2,
|
|
2828
|
+
idempotency_key: Optional[str] = None,
|
|
2829
|
+
**kwargs
|
|
2830
|
+
) -> BatchScrapeStatusResponse:
|
|
2831
|
+
"""
|
|
2832
|
+
Asynchronously scrape multiple URLs and monitor until completion.
|
|
2833
|
+
|
|
2834
|
+
Args:
|
|
2835
|
+
urls (List[str]): URLs to scrape
|
|
2836
|
+
formats (Optional[List[Literal]]): Content formats to retrieve
|
|
2837
|
+
headers (Optional[Dict[str, str]]): Custom HTTP headers
|
|
2838
|
+
include_tags (Optional[List[str]]): HTML tags to include
|
|
2839
|
+
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
2840
|
+
only_main_content (Optional[bool]): Extract main content only
|
|
2841
|
+
wait_for (Optional[int]): Wait time in milliseconds
|
|
2842
|
+
timeout (Optional[int]): Request timeout in milliseconds
|
|
2843
|
+
location (Optional[LocationConfig]): Location configuration
|
|
2844
|
+
mobile (Optional[bool]): Use mobile user agent
|
|
2845
|
+
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
2846
|
+
remove_base64_images (Optional[bool]): Remove base64 encoded images
|
|
2847
|
+
block_ads (Optional[bool]): Block advertisements
|
|
2848
|
+
proxy (Optional[Literal]): Proxy type to use
|
|
2849
|
+
extract (Optional[ExtractConfig]): Content extraction config
|
|
2850
|
+
json_options (Optional[ExtractConfig]): JSON extraction config
|
|
2851
|
+
actions (Optional[List[Union]]): Actions to perform
|
|
2852
|
+
agent (Optional[AgentOptions]): Agent configuration
|
|
2853
|
+
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
2854
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
2855
|
+
**kwargs: Additional parameters to pass to the API
|
|
2856
|
+
|
|
2857
|
+
Returns:
|
|
2858
|
+
BatchScrapeStatusResponse with:
|
|
2859
|
+
* Scraping status and progress
|
|
2860
|
+
* Scraped content for each URL
|
|
2861
|
+
* Success/error information
|
|
2862
|
+
|
|
2863
|
+
Raises:
|
|
2864
|
+
Exception: If batch scrape fails
|
|
2865
|
+
"""
|
|
2866
|
+
scrape_params = {}
|
|
2867
|
+
|
|
2868
|
+
# Add individual parameters
|
|
2869
|
+
if formats is not None:
|
|
2870
|
+
scrape_params['formats'] = formats
|
|
2871
|
+
if headers is not None:
|
|
2872
|
+
scrape_params['headers'] = headers
|
|
2873
|
+
if include_tags is not None:
|
|
2874
|
+
scrape_params['includeTags'] = include_tags
|
|
2875
|
+
if exclude_tags is not None:
|
|
2876
|
+
scrape_params['excludeTags'] = exclude_tags
|
|
2877
|
+
if only_main_content is not None:
|
|
2878
|
+
scrape_params['onlyMainContent'] = only_main_content
|
|
2879
|
+
if wait_for is not None:
|
|
2880
|
+
scrape_params['waitFor'] = wait_for
|
|
2881
|
+
if timeout is not None:
|
|
2882
|
+
scrape_params['timeout'] = timeout
|
|
2883
|
+
if location is not None:
|
|
2884
|
+
scrape_params['location'] = location.dict(exclude_none=True)
|
|
2885
|
+
if mobile is not None:
|
|
2886
|
+
scrape_params['mobile'] = mobile
|
|
2887
|
+
if skip_tls_verification is not None:
|
|
2888
|
+
scrape_params['skipTlsVerification'] = skip_tls_verification
|
|
2889
|
+
if remove_base64_images is not None:
|
|
2890
|
+
scrape_params['removeBase64Images'] = remove_base64_images
|
|
2891
|
+
if block_ads is not None:
|
|
2892
|
+
scrape_params['blockAds'] = block_ads
|
|
2893
|
+
if proxy is not None:
|
|
2894
|
+
scrape_params['proxy'] = proxy
|
|
2895
|
+
if extract is not None:
|
|
2896
|
+
if hasattr(extract.schema, 'schema'):
|
|
2897
|
+
extract.schema = extract.schema.schema()
|
|
2898
|
+
scrape_params['extract'] = extract.dict(exclude_none=True)
|
|
2899
|
+
if json_options is not None:
|
|
2900
|
+
if hasattr(json_options.schema, 'schema'):
|
|
2901
|
+
json_options.schema = json_options.schema.schema()
|
|
2902
|
+
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
|
|
2903
|
+
if actions is not None:
|
|
2904
|
+
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
2905
|
+
if agent is not None:
|
|
2906
|
+
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
2907
|
+
|
|
2908
|
+
# Add any additional kwargs
|
|
2909
|
+
scrape_params.update(kwargs)
|
|
2910
|
+
|
|
2911
|
+
# Create final params object
|
|
2912
|
+
final_params = ScrapeParams(**scrape_params)
|
|
2913
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
2914
|
+
params_dict['urls'] = urls
|
|
2915
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
2916
|
+
|
|
2917
|
+
# Make request
|
|
2918
|
+
headers = self._prepare_headers(idempotency_key)
|
|
2919
|
+
response = await self._async_post_request(
|
|
2920
|
+
f'{self.api_url}/v1/batch/scrape',
|
|
2921
|
+
params_dict,
|
|
2922
|
+
headers
|
|
2923
|
+
)
|
|
2924
|
+
|
|
2925
|
+
if response.get('success'):
|
|
2926
|
+
try:
|
|
2927
|
+
id = response['id']
|
|
2928
|
+
except:
|
|
2929
|
+
raise Exception('Batch scrape job response did not contain a job id.')
|
|
2930
|
+
return await self._async_monitor_job_status(id, headers, poll_interval)
|
|
2931
|
+
else:
|
|
2932
|
+
raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
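Usage sketch (illustrative) for the waiting batch_scrape_urls above, which polls the job until completion and returns the monitored status payload. Client name, key and URLs are placeholders.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name for the async client

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder credentials
    results = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        poll_interval=2,
    )
    # The monitored status payload exposes 'status', 'completed', 'total' and 'data'
    print(results.get("status"), results.get("completed"), "/", results.get("total"))

asyncio.run(main())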
|
|
2933
|
+
|
|
2934
|
+
|
|
2935
|
+
async def async_batch_scrape_urls(
|
|
2936
|
+
self,
|
|
2937
|
+
urls: List[str],
|
|
2938
|
+
*,
|
|
2939
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
2940
|
+
headers: Optional[Dict[str, str]] = None,
|
|
2941
|
+
include_tags: Optional[List[str]] = None,
|
|
2942
|
+
exclude_tags: Optional[List[str]] = None,
|
|
2943
|
+
only_main_content: Optional[bool] = None,
|
|
2944
|
+
wait_for: Optional[int] = None,
|
|
2945
|
+
timeout: Optional[int] = None,
|
|
2946
|
+
location: Optional[LocationConfig] = None,
|
|
2947
|
+
mobile: Optional[bool] = None,
|
|
2948
|
+
skip_tls_verification: Optional[bool] = None,
|
|
2949
|
+
remove_base64_images: Optional[bool] = None,
|
|
2950
|
+
block_ads: Optional[bool] = None,
|
|
2951
|
+
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
2952
|
+
extract: Optional[ExtractConfig] = None,
|
|
2953
|
+
json_options: Optional[ExtractConfig] = None,
|
|
2954
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
2955
|
+
agent: Optional[AgentOptions] = None,
|
|
2956
|
+
idempotency_key: Optional[str] = None,
|
|
2957
|
+
**kwargs
|
|
2958
|
+
) -> BatchScrapeResponse:
|
|
2959
|
+
"""
|
|
2960
|
+
Initiate a batch scrape job asynchronously.
|
|
2961
|
+
|
|
2962
|
+
Args:
|
|
2963
|
+
urls (List[str]): URLs to scrape
|
|
2964
|
+
formats (Optional[List[Literal]]): Content formats to retrieve
|
|
2965
|
+
headers (Optional[Dict[str, str]]): Custom HTTP headers
|
|
2966
|
+
include_tags (Optional[List[str]]): HTML tags to include
|
|
2967
|
+
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
2968
|
+
only_main_content (Optional[bool]): Extract main content only
|
|
2969
|
+
wait_for (Optional[int]): Wait time in milliseconds
|
|
2970
|
+
timeout (Optional[int]): Request timeout in milliseconds
|
|
2971
|
+
location (Optional[LocationConfig]): Location configuration
|
|
2972
|
+
mobile (Optional[bool]): Use mobile user agent
|
|
2973
|
+
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
2974
|
+
remove_base64_images (Optional[bool]): Remove base64 encoded images
|
|
2975
|
+
block_ads (Optional[bool]): Block advertisements
|
|
2976
|
+
proxy (Optional[Literal]): Proxy type to use
|
|
2977
|
+
extract (Optional[ExtractConfig]): Content extraction config
|
|
2978
|
+
json_options (Optional[ExtractConfig]): JSON extraction config
|
|
2979
|
+
actions (Optional[List[Union]]): Actions to perform
|
|
2980
|
+
agent (Optional[AgentOptions]): Agent configuration
|
|
2981
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
2982
|
+
**kwargs: Additional parameters to pass to the API
|
|
2983
|
+
|
|
2984
|
+
Returns:
|
|
2985
|
+
BatchScrapeResponse with:
|
|
2986
|
+
* success - Whether job started successfully
|
|
2987
|
+
* id - Unique identifier for the job
|
|
2988
|
+
* url - Status check URL
|
|
2989
|
+
* error - Error message if start failed
|
|
2990
|
+
|
|
2991
|
+
Raises:
|
|
2992
|
+
Exception: If job initiation fails
|
|
2993
|
+
"""
|
|
2994
|
+
scrape_params = {}
|
|
2995
|
+
|
|
2996
|
+
# Add individual parameters
|
|
2997
|
+
if formats is not None:
|
|
2998
|
+
scrape_params['formats'] = formats
|
|
2999
|
+
if headers is not None:
|
|
3000
|
+
scrape_params['headers'] = headers
|
|
3001
|
+
if include_tags is not None:
|
|
3002
|
+
scrape_params['includeTags'] = include_tags
|
|
3003
|
+
if exclude_tags is not None:
|
|
3004
|
+
scrape_params['excludeTags'] = exclude_tags
|
|
3005
|
+
if only_main_content is not None:
|
|
3006
|
+
scrape_params['onlyMainContent'] = only_main_content
|
|
3007
|
+
if wait_for is not None:
|
|
3008
|
+
scrape_params['waitFor'] = wait_for
|
|
3009
|
+
if timeout is not None:
|
|
3010
|
+
scrape_params['timeout'] = timeout
|
|
3011
|
+
if location is not None:
|
|
3012
|
+
scrape_params['location'] = location.dict(exclude_none=True)
|
|
3013
|
+
if mobile is not None:
|
|
3014
|
+
scrape_params['mobile'] = mobile
|
|
3015
|
+
if skip_tls_verification is not None:
|
|
3016
|
+
scrape_params['skipTlsVerification'] = skip_tls_verification
|
|
3017
|
+
if remove_base64_images is not None:
|
|
3018
|
+
scrape_params['removeBase64Images'] = remove_base64_images
|
|
3019
|
+
if block_ads is not None:
|
|
3020
|
+
scrape_params['blockAds'] = block_ads
|
|
3021
|
+
if proxy is not None:
|
|
3022
|
+
scrape_params['proxy'] = proxy
|
|
3023
|
+
if extract is not None:
|
|
3024
|
+
if hasattr(extract.schema, 'schema'):
|
|
3025
|
+
extract.schema = extract.schema.schema()
|
|
3026
|
+
scrape_params['extract'] = extract.dict(exclude_none=True)
|
|
3027
|
+
if json_options is not None:
|
|
3028
|
+
if hasattr(json_options.schema, 'schema'):
|
|
3029
|
+
json_options.schema = json_options.schema.schema()
|
|
3030
|
+
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
|
|
3031
|
+
if actions is not None:
|
|
3032
|
+
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
3033
|
+
if agent is not None:
|
|
3034
|
+
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
3035
|
+
|
|
3036
|
+
# Add any additional kwargs
|
|
3037
|
+
scrape_params.update(kwargs)
|
|
3038
|
+
|
|
3039
|
+
# Create final params object
|
|
3040
|
+
final_params = ScrapeParams(**scrape_params)
|
|
3041
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
3042
|
+
params_dict['urls'] = urls
|
|
3043
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
3044
|
+
|
|
3045
|
+
# Make request
|
|
3046
|
+
headers = self._prepare_headers(idempotency_key)
|
|
3047
|
+
response = await self._async_post_request(
|
|
3048
|
+
f'{self.api_url}/v1/batch/scrape',
|
|
3049
|
+
params_dict,
|
|
3050
|
+
headers
|
|
3051
|
+
)
|
|
3052
|
+
|
|
3053
|
+
if response.get('success'):
|
|
3054
|
+
try:
|
|
3055
|
+
return BatchScrapeResponse(**response)
|
|
3056
|
+
except:
|
|
3057
|
+
raise Exception('Failed to parse Firecrawl response into BatchScrapeResponse.')
|
|
3058
|
+
else:
|
|
3059
|
+
raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
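Sketch (illustrative) of the start-then-poll pattern with async_batch_scrape_urls and check_batch_scrape_status. Names, key and URLs are placeholders; BatchScrapeResponse is assumed to expose the id field its docstring documents.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name for the async client

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder credentials
    job = await app.async_batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
    )
    status = await app.check_batch_scrape_status(job.id)  # id documented on BatchScrapeResponse
    print(status.get("status"), status.get("completed"), "/", status.get("total"))

asyncio.run(main())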
|
|
3060
|
+
|
|
3061
|
+
async def crawl_url(
|
|
3062
|
+
self,
|
|
3063
|
+
url: str,
|
|
3064
|
+
*,
|
|
3065
|
+
include_paths: Optional[List[str]] = None,
|
|
3066
|
+
exclude_paths: Optional[List[str]] = None,
|
|
3067
|
+
max_depth: Optional[int] = None,
|
|
3068
|
+
max_discovery_depth: Optional[int] = None,
|
|
3069
|
+
limit: Optional[int] = None,
|
|
3070
|
+
allow_backward_links: Optional[bool] = None,
|
|
3071
|
+
allow_external_links: Optional[bool] = None,
|
|
3072
|
+
ignore_sitemap: Optional[bool] = None,
|
|
3073
|
+
scrape_options: Optional[CommonOptions] = None,
|
|
3074
|
+
webhook: Optional[Union[str, WebhookConfig]] = None,
|
|
3075
|
+
deduplicate_similar_urls: Optional[bool] = None,
|
|
3076
|
+
ignore_query_parameters: Optional[bool] = None,
|
|
3077
|
+
regex_on_full_url: Optional[bool] = None,
|
|
3078
|
+
poll_interval: Optional[int] = 2,
|
|
3079
|
+
idempotency_key: Optional[str] = None,
|
|
3080
|
+
**kwargs
|
|
3081
|
+
) -> CrawlStatusResponse:
|
|
3082
|
+
"""
|
|
3083
|
+
Crawl a website starting from a URL.
|
|
3084
|
+
|
|
3085
|
+
Args:
|
|
3086
|
+
url (str): Target URL to start crawling from
|
|
3087
|
+
include_paths (Optional[List[str]]): Patterns of URLs to include
|
|
3088
|
+
exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
|
3089
|
+
max_depth (Optional[int]): Maximum crawl depth
|
|
3090
|
+
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
3091
|
+
limit (Optional[int]): Maximum pages to crawl
|
|
3092
|
+
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
3093
|
+
allow_external_links (Optional[bool]): Follow external domain links
|
|
3094
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
3095
|
+
scrape_options (Optional[CommonOptions]): Page scraping configuration
|
|
3096
|
+
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
|
3097
|
+
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
3098
|
+
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
3099
|
+
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
3100
|
+
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
3101
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
3102
|
+
**kwargs: Additional parameters to pass to the API
|
|
3103
|
+
|
|
3104
|
+
Returns:
|
|
3105
|
+
CrawlStatusResponse with:
|
|
3106
|
+
* Crawling status and progress
|
|
3107
|
+
* Crawled page contents
|
|
3108
|
+
* Success/error information
|
|
3109
|
+
|
|
3110
|
+
Raises:
|
|
3111
|
+
Exception: If crawl fails
|
|
3112
|
+
"""
|
|
3113
|
+
crawl_params = {}
|
|
3114
|
+
|
|
3115
|
+
# Add individual parameters
|
|
3116
|
+
if include_paths is not None:
|
|
3117
|
+
crawl_params['includePaths'] = include_paths
|
|
3118
|
+
if exclude_paths is not None:
|
|
3119
|
+
crawl_params['excludePaths'] = exclude_paths
|
|
3120
|
+
if max_depth is not None:
|
|
3121
|
+
crawl_params['maxDepth'] = max_depth
|
|
3122
|
+
if max_discovery_depth is not None:
|
|
3123
|
+
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
3124
|
+
if limit is not None:
|
|
3125
|
+
crawl_params['limit'] = limit
|
|
3126
|
+
if allow_backward_links is not None:
|
|
3127
|
+
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
3128
|
+
if allow_external_links is not None:
|
|
3129
|
+
crawl_params['allowExternalLinks'] = allow_external_links
|
|
3130
|
+
if ignore_sitemap is not None:
|
|
3131
|
+
crawl_params['ignoreSitemap'] = ignore_sitemap
|
|
3132
|
+
if scrape_options is not None:
|
|
3133
|
+
crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
|
|
3134
|
+
if webhook is not None:
|
|
3135
|
+
crawl_params['webhook'] = webhook
|
|
3136
|
+
if deduplicate_similar_urls is not None:
|
|
3137
|
+
crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
|
|
3138
|
+
if ignore_query_parameters is not None:
|
|
3139
|
+
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
|
|
3140
|
+
if regex_on_full_url is not None:
|
|
3141
|
+
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
3142
|
+
|
|
3143
|
+
# Add any additional kwargs
|
|
3144
|
+
crawl_params.update(kwargs)
|
|
3145
|
+
|
|
3146
|
+
# Create final params object
|
|
3147
|
+
final_params = CrawlParams(**crawl_params)
|
|
3148
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
3149
|
+
params_dict['url'] = url
|
|
3150
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
3151
|
+
|
|
3152
|
+
# Make request
|
|
3153
|
+
headers = self._prepare_headers(idempotency_key)
|
|
3154
|
+
response = await self._async_post_request(
|
|
3155
|
+
f'{self.api_url}/v1/crawl', params_dict, headers)
|
|
3156
|
+
|
|
3157
|
+
if response.get('success'):
|
|
3158
|
+
try:
|
|
3159
|
+
id = response['id']
|
|
3160
|
+
except:
|
|
3161
|
+
raise Exception('Crawl job response did not contain a job id.')
|
|
3162
|
+
return await self._async_monitor_job_status(id, headers, poll_interval)
|
|
3163
|
+
else:
|
|
3164
|
+
raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
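Usage sketch (illustrative) for the waiting crawl_url above; key, URL and path pattern are placeholders.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name for the async client

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder credentials
    status = await app.crawl_url(
        "https://example.com",
        include_paths=["/blog/.*"],
        limit=10,
        poll_interval=2,
    )
    print(status.get("status"), len(status.get("data") or []), "pages crawled")

asyncio.run(main())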
|
|
3165
|
+
|
|
3166
|
+
|
|
3167
|
+
async def async_crawl_url(
|
|
3168
|
+
self,
|
|
3169
|
+
url: str,
|
|
3170
|
+
*,
|
|
3171
|
+
include_paths: Optional[List[str]] = None,
|
|
3172
|
+
exclude_paths: Optional[List[str]] = None,
|
|
3173
|
+
max_depth: Optional[int] = None,
|
|
3174
|
+
max_discovery_depth: Optional[int] = None,
|
|
3175
|
+
limit: Optional[int] = None,
|
|
3176
|
+
allow_backward_links: Optional[bool] = None,
|
|
3177
|
+
allow_external_links: Optional[bool] = None,
|
|
3178
|
+
ignore_sitemap: Optional[bool] = None,
|
|
3179
|
+
scrape_options: Optional[CommonOptions] = None,
|
|
3180
|
+
webhook: Optional[Union[str, WebhookConfig]] = None,
|
|
3181
|
+
deduplicate_similar_urls: Optional[bool] = None,
|
|
3182
|
+
ignore_query_parameters: Optional[bool] = None,
|
|
3183
|
+
regex_on_full_url: Optional[bool] = None,
|
|
3184
|
+
idempotency_key: Optional[str] = None,
|
|
3185
|
+
**kwargs
|
|
3186
|
+
) -> CrawlResponse:
|
|
3187
|
+
"""
|
|
3188
|
+
Start an asynchronous crawl job.
|
|
3189
|
+
|
|
3190
|
+
Args:
|
|
3191
|
+
url (str): Target URL to start crawling from
|
|
3192
|
+
include_paths (Optional[List[str]]): Patterns of URLs to include
|
|
3193
|
+
exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
|
3194
|
+
max_depth (Optional[int]): Maximum crawl depth
|
|
3195
|
+
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
3196
|
+
limit (Optional[int]): Maximum pages to crawl
|
|
3197
|
+
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
3198
|
+
allow_external_links (Optional[bool]): Follow external domain links
|
|
3199
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
3200
|
+
scrape_options (Optional[CommonOptions]): Page scraping configuration
|
|
3201
|
+
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
|
3202
|
+
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
3203
|
+
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
3204
|
+
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
3205
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
3206
|
+
**kwargs: Additional parameters to pass to the API
|
|
3207
|
+
|
|
3208
|
+
Returns:
|
|
3209
|
+
CrawlResponse with:
|
|
3210
|
+
* success - Whether crawl started successfully
|
|
3211
|
+
* id - Unique identifier for the crawl job
|
|
3212
|
+
* url - Status check URL for the crawl
|
|
3213
|
+
* error - Error message if start failed
|
|
3214
|
+
|
|
3215
|
+
Raises:
|
|
3216
|
+
Exception: If crawl initiation fails
|
|
3217
|
+
"""
|
|
3218
|
+
crawl_params = {}
|
|
3219
|
+
|
|
3220
|
+
# Add individual parameters
|
|
3221
|
+
if include_paths is not None:
|
|
3222
|
+
crawl_params['includePaths'] = include_paths
|
|
3223
|
+
if exclude_paths is not None:
|
|
3224
|
+
crawl_params['excludePaths'] = exclude_paths
|
|
3225
|
+
if max_depth is not None:
|
|
3226
|
+
crawl_params['maxDepth'] = max_depth
|
|
3227
|
+
if max_discovery_depth is not None:
|
|
3228
|
+
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
3229
|
+
if limit is not None:
|
|
3230
|
+
crawl_params['limit'] = limit
|
|
3231
|
+
if allow_backward_links is not None:
|
|
3232
|
+
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
3233
|
+
if allow_external_links is not None:
|
|
3234
|
+
crawl_params['allowExternalLinks'] = allow_external_links
|
|
3235
|
+
if ignore_sitemap is not None:
|
|
3236
|
+
crawl_params['ignoreSitemap'] = ignore_sitemap
|
|
3237
|
+
if scrape_options is not None:
|
|
3238
|
+
crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
|
|
3239
|
+
if webhook is not None:
|
|
3240
|
+
crawl_params['webhook'] = webhook
|
|
3241
|
+
if deduplicate_similar_urls is not None:
|
|
3242
|
+
crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
|
|
3243
|
+
if ignore_query_parameters is not None:
|
|
3244
|
+
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
|
|
3245
|
+
if regex_on_full_url is not None:
|
|
3246
|
+
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
3247
|
+
|
|
3248
|
+
# Add any additional kwargs
|
|
3249
|
+
crawl_params.update(kwargs)
|
|
3250
|
+
|
|
3251
|
+
# Create final params object
|
|
3252
|
+
final_params = CrawlParams(**crawl_params)
|
|
3253
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
3254
|
+
params_dict['url'] = url
|
|
3255
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
3256
|
+
|
|
3257
|
+
# Make request
|
|
3258
|
+
headers = self._prepare_headers(idempotency_key)
|
|
3259
|
+
response = await self._async_post_request(
|
|
3260
|
+
f'{self.api_url}/v1/crawl',
|
|
3261
|
+
params_dict,
|
|
3262
|
+
headers
|
|
3263
|
+
)
|
|
3264
|
+
|
|
3265
|
+
if response.get('success'):
|
|
3266
|
+
try:
|
|
3267
|
+
return CrawlResponse(**response)
|
|
3268
|
+
except:
|
|
3269
|
+
raise Exception('Failed to parse Firecrawl response into CrawlResponse.')
|
|
3270
|
+
else:
|
|
3271
|
+
raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
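Sketch (illustrative) of starting a crawl with async_crawl_url and polling it with check_crawl_status. Placeholders throughout; CrawlResponse is assumed to expose the id field its docstring documents.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name for the async client

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder credentials
    job = await app.async_crawl_url("https://example.com", limit=25)
    while True:
        status = await app.check_crawl_status(job.id)
        if status.get("status") in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(2)
    print(status.get("status"), len(status.get("data") or []), "documents")

asyncio.run(main())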
|
|
3272
|
+
|
|
3273
|
+
async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
|
|
3274
|
+
"""
|
|
3275
|
+
Check the status and results of an asynchronous crawl job.
|
|
3276
|
+
|
|
3277
|
+
Args:
|
|
3278
|
+
id (str): Unique identifier for the crawl job
|
|
3279
|
+
|
|
3280
|
+
Returns:
|
|
3281
|
+
CrawlStatusResponse containing:
|
|
3282
|
+
Status Information:
|
|
3283
|
+
* status - Current state (scraping/completed/failed/cancelled)
|
|
3284
|
+
* completed - Number of pages crawled
|
|
3285
|
+
* total - Total pages to crawl
|
|
3286
|
+
* creditsUsed - API credits consumed
|
|
3287
|
+
* expiresAt - Data expiration timestamp
|
|
3288
|
+
|
|
3289
|
+
Results:
|
|
3290
|
+
* data - List of crawled documents
|
|
3291
|
+
* next - URL for next page of results (if paginated)
|
|
3292
|
+
* success - Whether status check succeeded
|
|
3293
|
+
* error - Error message if failed
|
|
3294
|
+
|
|
3295
|
+
Raises:
|
|
3296
|
+
Exception: If status check fails
|
|
3297
|
+
"""
|
|
3298
|
+
headers = self._prepare_headers()
|
|
3299
|
+
endpoint = f'/v1/crawl/{id}'
|
|
3300
|
+
|
|
3301
|
+
status_data = await self._async_get_request(
|
|
3302
|
+
f'{self.api_url}{endpoint}',
|
|
3303
|
+
headers
|
|
3304
|
+
)
|
|
3305
|
+
|
|
3306
|
+
if status_data['status'] == 'completed':
|
|
3307
|
+
if 'data' in status_data:
|
|
3308
|
+
data = status_data['data']
|
|
3309
|
+
while 'next' in status_data:
|
|
3310
|
+
if len(status_data['data']) == 0:
|
|
3311
|
+
break
|
|
3312
|
+
next_url = status_data.get('next')
|
|
3313
|
+
if not next_url:
|
|
3314
|
+
logger.warning("Expected 'next' URL is missing.")
|
|
3315
|
+
break
|
|
3316
|
+
next_data = await self._async_get_request(next_url, headers)
|
|
3317
|
+
data.extend(next_data.get('data', []))
|
|
3318
|
+
status_data = next_data
|
|
3319
|
+
status_data['data'] = data
|
|
3320
|
+
|
|
3321
|
+
response = {
|
|
3322
|
+
'status': status_data.get('status'),
|
|
3323
|
+
'total': status_data.get('total'),
|
|
3324
|
+
'completed': status_data.get('completed'),
|
|
3325
|
+
'creditsUsed': status_data.get('creditsUsed'),
|
|
3326
|
+
'expiresAt': status_data.get('expiresAt'),
|
|
3327
|
+
'data': status_data.get('data')
|
|
3328
|
+
}
|
|
3329
|
+
|
|
3330
|
+
if 'error' in status_data:
|
|
3331
|
+
response['error'] = status_data['error']
|
|
3332
|
+
|
|
3333
|
+
if 'next' in status_data:
|
|
3334
|
+
response['next'] = status_data['next']
|
|
3335
|
+
|
|
3336
|
+
return {
|
|
3337
|
+
'success': 'error' not in status_data,
|
|
3338
|
+
**response
|
|
3339
|
+
}
|
|
3340
|
+
|
|
3341
|
+
async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
|
|
3342
|
+
"""
|
|
3343
|
+
Monitor the status of an asynchronous job until completion.
|
|
3344
|
+
|
|
3345
|
+
Args:
|
|
3346
|
+
id (str): The ID of the job to monitor
|
|
3347
|
+
headers (Dict[str, str]): Headers to include in status check requests
|
|
3348
|
+
poll_interval (int): Seconds between status checks (default: 2)
|
|
3349
|
+
|
|
3350
|
+
Returns:
|
|
3351
|
+
CrawlStatusResponse: The job results if completed successfully
|
|
3352
|
+
|
|
3353
|
+
Raises:
|
|
3354
|
+
Exception: If the job fails or an error occurs during status checks
|
|
3355
|
+
"""
|
|
3356
|
+
while True:
|
|
3357
|
+
status_data = await self._async_get_request(
|
|
3358
|
+
f'{self.api_url}/v1/crawl/{id}',
|
|
3359
|
+
headers
|
|
3360
|
+
)
|
|
3361
|
+
|
|
3362
|
+
if status_data['status'] == 'completed':
|
|
3363
|
+
if 'data' in status_data:
|
|
3364
|
+
data = status_data['data']
|
|
3365
|
+
while 'next' in status_data:
|
|
3366
|
+
if len(status_data['data']) == 0:
|
|
3367
|
+
break
|
|
3368
|
+
next_url = status_data.get('next')
|
|
3369
|
+
if not next_url:
|
|
3370
|
+
logger.warning("Expected 'next' URL is missing.")
|
|
3371
|
+
break
|
|
3372
|
+
next_data = await self._async_get_request(next_url, headers)
|
|
3373
|
+
data.extend(next_data.get('data', []))
|
|
3374
|
+
status_data = next_data
|
|
3375
|
+
status_data['data'] = data
|
|
3376
|
+
return status_data
|
|
3377
|
+
else:
|
|
3378
|
+
raise Exception('Job completed but no data was returned')
|
|
3379
|
+
elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
|
|
3380
|
+
await asyncio.sleep(max(poll_interval, 2))
|
|
3381
|
+
else:
|
|
3382
|
+
raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
|
|
3383
|
+
|
|
3384
|
+
async def map_url(
|
|
3385
|
+
self,
|
|
3386
|
+
url: str,
|
|
3387
|
+
params: Optional[MapParams] = None) -> MapResponse:
|
|
3388
|
+
"""
|
|
3389
|
+
Asynchronously map and discover links from a URL.
|
|
3390
|
+
|
|
3391
|
+
Args:
|
|
3392
|
+
url (str): Target URL to map
|
|
3393
|
+
params (Optional[MapParams]): See MapParams model:
|
|
3394
|
+
Discovery Options:
|
|
3395
|
+
* search - Filter pattern for URLs
|
|
3396
|
+
* ignoreSitemap - Skip sitemap.xml
|
|
3397
|
+
* includeSubdomains - Include subdomain links
|
|
3398
|
+
* sitemapOnly - Only use sitemap.xml
|
|
3399
|
+
|
|
3400
|
+
Limits:
|
|
3401
|
+
* limit - Max URLs to return
|
|
3402
|
+
* timeout - Request timeout (ms)
|
|
3403
|
+
|
|
3404
|
+
Returns:
|
|
3405
|
+
MapResponse with:
|
|
3406
|
+
* Discovered URLs
|
|
3407
|
+
* Success/error status
|
|
3408
|
+
|
|
3409
|
+
Raises:
|
|
3410
|
+
Exception: If mapping fails
|
|
3411
|
+
"""
|
|
3412
|
+
headers = self._prepare_headers()
|
|
3413
|
+
json_data = {'url': url}
|
|
3414
|
+
if params:
|
|
3415
|
+
json_data.update(params)
|
|
3416
|
+
json_data['origin'] = f"python-sdk@{version}"
|
|
3417
|
+
|
|
3418
|
+
endpoint = '/v1/map'
|
|
3419
|
+
response = await self._async_post_request(
|
|
3420
|
+
f'{self.api_url}{endpoint}',
|
|
3421
|
+
json_data,
|
|
3422
|
+
headers
|
|
3423
|
+
)
|
|
3424
|
+
|
|
3425
|
+
if response.get('success') and 'links' in response:
|
|
3426
|
+
return response
|
|
3427
|
+
elif 'error' in response:
|
|
3428
|
+
raise Exception(f'Failed to map URL. Error: {response["error"]}')
|
|
3429
|
+
else:
|
|
3430
|
+
raise Exception(f'Failed to map URL. Error: {response}')
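Usage sketch (illustrative) for map_url above, passing discovery options as the params dict the method merges into its request. Key and URL are placeholders.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name for the async client

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder credentials
    mapped = await app.map_url(
        "https://example.com",
        params={"search": "docs", "limit": 50},
    )
    for link in mapped.get("links", []):
        print(link)

asyncio.run(main())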
|
|
3431
|
+
|
|
3432
|
+
async def extract(
|
|
3433
|
+
self,
|
|
3434
|
+
urls: List[str],
|
|
3435
|
+
params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
|
|
3436
|
+
"""
|
|
3437
|
+
Asynchronously extract structured information from URLs.
|
|
3438
|
+
|
|
3439
|
+
Args:
|
|
3440
|
+
urls (List[str]): URLs to extract from
|
|
3441
|
+
params (Optional[ExtractParams]): See ExtractParams model:
|
|
3442
|
+
Extraction Config:
|
|
3443
|
+
* prompt - Custom extraction prompt
|
|
3444
|
+
* schema - JSON schema/Pydantic model
|
|
3445
|
+
* systemPrompt - System context
|
|
3446
|
+
|
|
3447
|
+
Behavior Options:
|
|
3448
|
+
* allowExternalLinks - Follow external links
|
|
3449
|
+
* enableWebSearch - Enable web search
|
|
3450
|
+
* includeSubdomains - Include subdomains
|
|
3451
|
+
* showSources - Include source URLs
|
|
3452
|
+
|
|
3453
|
+
Scraping Options:
|
|
3454
|
+
* scrapeOptions - Page scraping config
|
|
3455
|
+
|
|
3456
|
+
Returns:
|
|
3457
|
+
ExtractResponse with:
|
|
3458
|
+
* Structured data matching schema
|
|
3459
|
+
* Source information if requested
|
|
3460
|
+
* Success/error status
|
|
3461
|
+
|
|
3462
|
+
Raises:
|
|
3463
|
+
ValueError: If prompt/schema missing or extraction fails
|
|
3464
|
+
"""
|
|
3465
|
+
headers = self._prepare_headers()
|
|
3466
|
+
|
|
3467
|
+
if not params or (not params.get('prompt') and not params.get('schema')):
|
|
3468
|
+
raise ValueError("Either prompt or schema is required")
|
|
3469
|
+
|
|
3470
|
+
schema = params.get('schema')
|
|
3471
|
+
if schema:
|
|
3472
|
+
if hasattr(schema, 'model_json_schema'):
|
|
3473
|
+
schema = schema.model_json_schema()
|
|
3474
|
+
|
|
3475
|
+
request_data = {
|
|
3476
|
+
'urls': urls,
|
|
3477
|
+
'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)),
|
|
3478
|
+
'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)),
|
|
3479
|
+
'showSources': params.get('show_sources', params.get('showSources', False)),
|
|
3480
|
+
'schema': schema,
|
|
3481
|
+
'origin': f'python-sdk@{version}'
|
|
3482
|
+
}
|
|
3483
|
+
|
|
3484
|
+
if params.get('prompt'):
|
|
3485
|
+
request_data['prompt'] = params['prompt']
|
|
3486
|
+
if params.get('system_prompt'):
|
|
3487
|
+
request_data['systemPrompt'] = params['system_prompt']
|
|
3488
|
+
elif params.get('systemPrompt'):
|
|
3489
|
+
request_data['systemPrompt'] = params['systemPrompt']
|
|
3490
|
+
|
|
3491
|
+
response = await self._async_post_request(
|
|
3492
|
+
f'{self.api_url}/v1/extract',
|
|
3493
|
+
request_data,
|
|
3494
|
+
headers
|
|
3495
|
+
)
|
|
3496
|
+
|
|
3497
|
+
if response.get('success'):
|
|
3498
|
+
job_id = response.get('id')
|
|
3499
|
+
if not job_id:
|
|
3500
|
+
raise Exception('Job ID not returned from extract request.')
|
|
3501
|
+
|
|
3502
|
+
while True:
|
|
3503
|
+
status_data = await self._async_get_request(
|
|
3504
|
+
f'{self.api_url}/v1/extract/{job_id}',
|
|
3505
|
+
headers
|
|
3506
|
+
)
|
|
3507
|
+
|
|
3508
|
+
if status_data['status'] == 'completed':
|
|
3509
|
+
return status_data
|
|
3510
|
+
elif status_data['status'] in ['failed', 'cancelled']:
|
|
3511
|
+
raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
|
|
3512
|
+
|
|
3513
|
+
await asyncio.sleep(2)
|
|
3514
|
+
else:
|
|
3515
|
+
raise Exception(f'Failed to extract. Error: {response.get("error")}')
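Usage sketch (illustrative) for the polling extract above, supplying a prompt and a Pydantic schema that the method converts with model_json_schema(). The model, key and URL are placeholders.

import asyncio

from pydantic import BaseModel

from firecrawl import AsyncFirecrawlApp  # assumed export name for the async client

class PageSummary(BaseModel):
    title: str
    summary: str

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder credentials
    result = await app.extract(
        ["https://example.com"],
        params={
            "prompt": "Summarise the page in one sentence.",
            "schema": PageSummary,  # converted via model_json_schema() by extract()
        },
    )
    print(result.get("data"))

asyncio.run(main())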
|
|
3516
|
+
|
|
3517
|
+
async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
|
|
3518
|
+
"""
|
|
3519
|
+
Check the status of an asynchronous batch scrape job.
|
|
3520
|
+
|
|
3521
|
+
Args:
|
|
3522
|
+
id (str): The ID of the batch scrape job
|
|
3523
|
+
|
|
3524
|
+
Returns:
|
|
3525
|
+
BatchScrapeStatusResponse containing:
|
|
3526
|
+
Status Information:
|
|
3527
|
+
* status - Current state (scraping/completed/failed/cancelled)
|
|
3528
|
+
* completed - Number of URLs scraped
|
|
3529
|
+
* total - Total URLs to scrape
|
|
3530
|
+
* creditsUsed - API credits consumed
|
|
3531
|
+
* expiresAt - Data expiration timestamp
|
|
3532
|
+
|
|
3533
|
+
Results:
|
|
3534
|
+
* data - List of scraped documents
|
|
3535
|
+
* next - URL for next page of results (if paginated)
|
|
3536
|
+
* success - Whether status check succeeded
|
|
3537
|
+
* error - Error message if failed
|
|
3538
|
+
|
|
3539
|
+
Raises:
|
|
3540
|
+
Exception: If status check fails
|
|
3541
|
+
"""
|
|
3542
|
+
headers = self._prepare_headers()
|
|
3543
|
+
endpoint = f'/v1/batch/scrape/{id}'
|
|
3544
|
+
|
|
3545
|
+
status_data = await self._async_get_request(
|
|
3546
|
+
f'{self.api_url}{endpoint}',
|
|
3547
|
+
headers
|
|
3548
|
+
)
|
|
3549
|
+
|
|
3550
|
+
if status_data['status'] == 'completed':
|
|
3551
|
+
if 'data' in status_data:
|
|
3552
|
+
data = status_data['data']
|
|
3553
|
+
while 'next' in status_data:
|
|
3554
|
+
if len(status_data['data']) == 0:
|
|
3555
|
+
break
|
|
3556
|
+
next_url = status_data.get('next')
|
|
3557
|
+
if not next_url:
|
|
3558
|
+
logger.warning("Expected 'next' URL is missing.")
|
|
3559
|
+
break
|
|
3560
|
+
next_data = await self._async_get_request(next_url, headers)
|
|
3561
|
+
data.extend(next_data.get('data', []))
|
|
3562
|
+
status_data = next_data
|
|
3563
|
+
status_data['data'] = data
|
|
3564
|
+
|
|
3565
|
+
response = {
|
|
3566
|
+
'status': status_data.get('status'),
|
|
3567
|
+
'total': status_data.get('total'),
|
|
3568
|
+
'completed': status_data.get('completed'),
|
|
3569
|
+
'creditsUsed': status_data.get('creditsUsed'),
|
|
3570
|
+
'expiresAt': status_data.get('expiresAt'),
|
|
3571
|
+
'data': status_data.get('data')
|
|
3572
|
+
}
|
|
3573
|
+
|
|
3574
|
+
if 'error' in status_data:
|
|
3575
|
+
response['error'] = status_data['error']
|
|
3576
|
+
|
|
3577
|
+
if 'next' in status_data:
|
|
3578
|
+
response['next'] = status_data['next']
|
|
3579
|
+
|
|
3580
|
+
return {
|
|
3581
|
+
'success': 'error' not in status_data,
|
|
3582
|
+
**response
|
|
3583
|
+
}
|
|
3584
|
+
|
|
3585
|
+
async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
|
|
3586
|
+
"""
|
|
3587
|
+
Get information about errors from an asynchronous batch scrape job.
|
|
3588
|
+
|
|
3589
|
+
Args:
|
|
3590
|
+
id (str): The ID of the batch scrape job
|
|
3591
|
+
|
|
3592
|
+
Returns:
|
|
3593
|
+
CrawlErrorsResponse containing:
|
|
3594
|
+
errors (List[Dict[str, str]]): List of errors with fields:
|
|
3595
|
+
* id (str): Error ID
|
|
3596
|
+
* timestamp (str): When the error occurred
|
|
3597
|
+
* url (str): URL that caused the error
|
|
3598
|
+
* error (str): Error message
|
|
3599
|
+
* robotsBlocked (List[str]): List of URLs blocked by robots.txt
|
|
3600
|
+
|
|
3601
|
+
Raises:
|
|
3602
|
+
Exception: If error check fails
|
|
3603
|
+
"""
|
|
3604
|
+
headers = self._prepare_headers()
|
|
3605
|
+
return await self._async_get_request(
|
|
3606
|
+
f'{self.api_url}/v1/batch/scrape/{id}/errors',
|
|
3607
|
+
headers
|
|
3608
|
+
)
|
|
3609
|
+
|
|
3610
|
+
async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
|
|
3611
|
+
"""
|
|
3612
|
+
Get information about errors from an asynchronous crawl job.
|
|
3613
|
+
|
|
3614
|
+
Args:
|
|
3615
|
+
id (str): The ID of the crawl job
|
|
3616
|
+
|
|
3617
|
+
Returns:
|
|
3618
|
+
CrawlErrorsResponse containing:
|
|
3619
|
+
* errors (List[Dict[str, str]]): List of errors with fields:
|
|
3620
|
+
- id (str): Error ID
|
|
3621
|
+
- timestamp (str): When the error occurred
|
|
3622
|
+
- url (str): URL that caused the error
|
|
3623
|
+
- error (str): Error message
|
|
3624
|
+
* robotsBlocked (List[str]): List of URLs blocked by robots.txt
|
|
3625
|
+
|
|
3626
|
+
Raises:
|
|
3627
|
+
Exception: If error check fails
|
|
3628
|
+
"""
|
|
3629
|
+
headers = self._prepare_headers()
|
|
3630
|
+
return await self._async_get_request(
|
|
3631
|
+
f'{self.api_url}/v1/crawl/{id}/errors',
|
|
3632
|
+
headers
|
|
3633
|
+
)
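Sketch (illustrative) of reading an error report with check_crawl_errors above; the job id and key are placeholders.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name for the async client

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder credentials
    report = await app.check_crawl_errors("your-crawl-job-id")  # placeholder job id
    for err in report.get("errors", []):
        print(err.get("url"), "->", err.get("error"))
    print("Blocked by robots.txt:", report.get("robotsBlocked", []))

asyncio.run(main())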
|
|
3634
|
+
|
|
3635
|
+
async def cancel_crawl(self, id: str) -> Dict[str, Any]:
|
|
3636
|
+
"""
|
|
3637
|
+
Cancel an asynchronous crawl job.
|
|
3638
|
+
|
|
3639
|
+
Args:
|
|
3640
|
+
id (str): The ID of the crawl job to cancel
|
|
3641
|
+
|
|
3642
|
+
Returns:
|
|
3643
|
+
Dict[str, Any] containing:
|
|
3644
|
+
* success (bool): Whether cancellation was successful
|
|
3645
|
+
* error (str, optional): Error message if cancellation failed
|
|
3646
|
+
|
|
3647
|
+
Raises:
|
|
3648
|
+
Exception: If cancellation fails
|
|
3649
|
+
"""
|
|
3650
|
+
headers = self._prepare_headers()
|
|
3651
|
+
async with aiohttp.ClientSession() as session:
|
|
3652
|
+
async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
|
|
3653
|
+
return await response.json()
|
|
3654
|
+
|
|
3655
|
+
async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
|
|
3656
|
+
"""
|
|
3657
|
+
Check the status of an asynchronous extraction job.
|
|
3658
|
+
|
|
3659
|
+
Args:
|
|
3660
|
+
job_id (str): The ID of the extraction job
|
|
3661
|
+
|
|
3662
|
+
Returns:
|
|
3663
|
+
ExtractResponse[Any] with:
|
|
3664
|
+
* success (bool): Whether request succeeded
|
|
3665
|
+
* data (Optional[Any]): Extracted data matching schema
|
|
3666
|
+
* error (Optional[str]): Error message if any
|
|
3667
|
+
* warning (Optional[str]): Warning message if any
|
|
3668
|
+
* sources (Optional[List[str]]): Source URLs if requested
|
|
3669
|
+
|
|
3670
|
+
Raises:
|
|
3671
|
+
ValueError: If status check fails
|
|
3672
|
+
"""
|
|
3673
|
+
headers = self._prepare_headers()
|
|
3674
|
+
try:
|
|
3675
|
+
return await self._async_get_request(
|
|
3676
|
+
f'{self.api_url}/v1/extract/{job_id}',
|
|
3677
|
+
headers
|
|
3678
|
+
)
|
|
3679
|
+
except Exception as e:
|
|
3680
|
+
raise ValueError(str(e))
|
|
3681
|
+
|
|
3682
|
+
async def async_extract(
|
|
3683
|
+
self,
|
|
3684
|
+
urls: Optional[List[str]] = None,
|
|
3685
|
+
*,
|
|
3686
|
+
prompt: Optional[str] = None,
|
|
3687
|
+
schema: Optional[Any] = None,
|
|
3688
|
+
system_prompt: Optional[str] = None,
|
|
3689
|
+
allow_external_links: Optional[bool] = False,
|
|
3690
|
+
enable_web_search: Optional[bool] = False,
|
|
3691
|
+
show_sources: Optional[bool] = False,
|
|
3692
|
+
agent: Optional[Dict[str, Any]] = None,
|
|
3693
|
+
idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
|
|
3694
|
+
"""
|
|
3695
|
+
Initiate an asynchronous extraction job without waiting for completion.
|
|
3696
|
+
|
|
3697
|
+
Args:
|
|
3698
|
+
urls (Optional[List[str]]): URLs to extract from
|
|
3699
|
+
prompt (Optional[str]): Custom extraction prompt
|
|
3700
|
+
schema (Optional[Any]): JSON schema/Pydantic model
|
|
3701
|
+
system_prompt (Optional[str]): System context
|
|
3702
|
+
allow_external_links (Optional[bool]): Follow external links
|
|
3703
|
+
enable_web_search (Optional[bool]): Enable web search
|
|
3704
|
+
show_sources (Optional[bool]): Include source URLs
|
|
3705
|
+
agent (Optional[Dict[str, Any]]): Agent configuration
|
|
3706
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
3707
|
+
|
|
3708
|
+
Returns:
|
|
3709
|
+
ExtractResponse[Any] with:
|
|
3710
|
+
* success (bool): Whether request succeeded
|
|
3711
|
+
* data (Optional[Any]): Extracted data matching schema
|
|
3712
|
+
* error (Optional[str]): Error message if any
|
|
3713
|
+
|
|
3714
|
+
Raises:
|
|
3715
|
+
ValueError: If job initiation fails
|
|
3716
|
+
"""
|
|
3717
|
+
headers = self._prepare_headers(idempotency_key)
|
|
3718
|
+
|
|
3719
|
+
if not prompt and not schema:
|
|
3720
|
+
raise ValueError("Either prompt or schema is required")
|
|
3721
|
+
|
|
3722
|
+
if not urls and not prompt:
|
|
3723
|
+
raise ValueError("Either urls or prompt is required")
|
|
3724
|
+
|
|
3725
|
+
if schema:
|
|
3726
|
+
if hasattr(schema, 'model_json_schema'):
|
|
3727
|
+
schema = schema.model_json_schema()
|
|
3728
|
+
|
|
3729
|
+
request_data = {
|
|
3730
|
+
'urls': urls or [],
|
|
3731
|
+
'allowExternalLinks': allow_external_links,
|
|
3732
|
+
'enableWebSearch': enable_web_search,
|
|
3733
|
+
'showSources': show_sources,
|
|
3734
|
+
'schema': schema,
|
|
3735
|
+
'origin': f'python-sdk@{version}'
|
|
3736
|
+
}
|
|
3737
|
+
|
|
3738
|
+
if prompt:
|
|
3739
|
+
request_data['prompt'] = prompt
|
|
3740
|
+
if system_prompt:
|
|
3741
|
+
request_data['systemPrompt'] = system_prompt
|
|
3742
|
+
if agent:
|
|
3743
|
+
request_data['agent'] = agent
|
|
3744
|
+
|
|
3745
|
+
try:
|
|
3746
|
+
return await self._async_post_request(
|
|
3747
|
+
f'{self.api_url}/v1/extract',
|
|
3748
|
+
request_data,
|
|
3749
|
+
headers
|
|
3750
|
+
)
|
|
3751
|
+
except Exception as e:
|
|
3752
|
+
raise ValueError(str(e))
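Sketch (illustrative) of starting an extraction with async_extract and checking it with get_extract_status. Key, URL and prompt are placeholders; the started-job payload is assumed to carry the same 'id' field that extract() above reads.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name for the async client

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder credentials
    started = await app.async_extract(
        ["https://example.com"],
        prompt="List the product names mentioned on the page.",
    )
    job_id = started.get("id")  # same field extract() uses when it polls
    status = await app.get_extract_status(job_id)
    print(status.get("status"), status.get("data"))

asyncio.run(main())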
|
|
3753
|
+
|
|
3754
|
+
async def generate_llms_text(
|
|
3755
|
+
self,
|
|
3756
|
+
url: str,
|
|
3757
|
+
*,
|
|
3758
|
+
max_urls: Optional[int] = None,
|
|
3759
|
+
show_full_text: Optional[bool] = None,
|
|
3760
|
+
experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
|
|
3761
|
+
"""
|
|
3762
|
+
Generate LLMs.txt for a given URL and monitor until completion.
|
|
3763
|
+
|
|
3764
|
+
Args:
|
|
3765
|
+
url (str): Target URL to generate LLMs.txt from
|
|
3766
|
+
max_urls (Optional[int]): Maximum URLs to process (default: 10)
|
|
3767
|
+
show_full_text (Optional[bool]): Include full text in output (default: False)
|
|
3768
|
+
experimental_stream (Optional[bool]): Enable experimental streaming
|
|
3769
|
+
|
|
3770
|
+
Returns:
|
|
3771
|
+
GenerateLLMsTextStatusResponse containing:
|
|
3772
|
+
* success (bool): Whether generation completed successfully
|
|
3773
|
+
* status (str): Status of generation (processing/completed/failed)
|
|
3774
|
+
* data (Dict[str, str], optional): Generated text with fields:
|
|
3775
|
+
- llmstxt (str): Generated LLMs.txt content
|
|
3776
|
+
- llmsfulltxt (str, optional): Full version if requested
|
|
3777
|
+
* error (str, optional): Error message if generation failed
|
|
3778
|
+
* expiresAt (str): When the generated data expires
|
|
3779
|
+
|
|
3780
|
+
Raises:
|
|
3781
|
+
Exception: If generation fails
|
|
3782
|
+
"""
|
|
3783
|
+
params = {}
|
|
3784
|
+
if max_urls is not None:
|
|
3785
|
+
params['maxUrls'] = max_urls
|
|
3786
|
+
if show_full_text is not None:
|
|
3787
|
+
params['showFullText'] = show_full_text
|
|
3788
|
+
if experimental_stream is not None:
|
|
3789
|
+
params['__experimental_stream'] = experimental_stream
|
|
3790
|
+
|
|
3791
|
+
response = await self.async_generate_llms_text(
|
|
3792
|
+
url,
|
|
3793
|
+
max_urls=max_urls,
|
|
3794
|
+
show_full_text=show_full_text,
|
|
3795
|
+
experimental_stream=experimental_stream
|
|
3796
|
+
)
|
|
3797
|
+
if not response.get('success') or 'id' not in response:
|
|
3798
|
+
return response
|
|
3799
|
+
|
|
3800
|
+
job_id = response['id']
|
|
3801
|
+
while True:
|
|
3802
|
+
status = await self.check_generate_llms_text_status(job_id)
|
|
3803
|
+
|
|
3804
|
+
if status['status'] == 'completed':
|
|
3805
|
+
return status
|
|
3806
|
+
elif status['status'] == 'failed':
|
|
3807
|
+
raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
|
|
3808
|
+
elif status['status'] != 'processing':
|
|
3809
|
+
break
|
|
3810
|
+
|
|
3811
|
+
await asyncio.sleep(2)
|
|
3812
|
+
|
|
3813
|
+
return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
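Usage sketch (illustrative) for the polling generate_llms_text above; key and URL are placeholders, and the returned payload is read with the fields its docstring documents.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name for the async client

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder credentials
    result = await app.generate_llms_text(
        "https://example.com",
        max_urls=5,
        show_full_text=False,
    )
    if result.get("success"):
        print(result.get("data", {}).get("llmstxt"))
    else:
        print("Generation failed:", result.get("error"))

asyncio.run(main())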
|
|
3814
|
+
+    async def async_generate_llms_text(
+            self,
+            url: str,
+            *,
+            max_urls: Optional[int] = None,
+            show_full_text: Optional[bool] = None,
+            experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
+        """
+        Initiate an asynchronous LLMs.txt generation job without waiting for completion.
+
+        Args:
+            url (str): Target URL to generate LLMs.txt from
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming
+
+        Returns:
+            GenerateLLMsTextResponse containing:
+            * success (bool): Whether job started successfully
+            * id (str): Unique identifier for the job
+            * error (str, optional): Error message if start failed
+
+        Raises:
+            ValueError: If job initiation fails
+        """
+        params = {}
+        if max_urls is not None:
+            params['maxUrls'] = max_urls
+        if show_full_text is not None:
+            params['showFullText'] = show_full_text
+        if experimental_stream is not None:
+            params['__experimental_stream'] = experimental_stream
+
+        headers = self._prepare_headers()
+        # Merge the optional generation parameters into the request body
+        json_data = {'url': url, **params}
+        json_data['origin'] = f"python-sdk@{version}"
+
+        try:
+            return await self._async_post_request(
+                f'{self.api_url}/v1/llmstxt',
+                json_data,
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
+    async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
+        """
+        Check the status of an asynchronous LLMs.txt generation job.
+
+        Args:
+            id (str): The ID of the generation job
+
+        Returns:
+            GenerateLLMsTextStatusResponse containing:
+            * success (bool): Whether generation completed successfully
+            * status (str): Status of generation (processing/completed/failed)
+            * data (Dict[str, str], optional): Generated text with fields:
+                - llmstxt (str): Generated LLMs.txt content
+                - llmsfulltxt (str, optional): Full version if requested
+            * error (str, optional): Error message if generation failed
+            * expiresAt (str): When the generated data expires
+
+        Raises:
+            ValueError: If status check fails
+        """
+        headers = self._prepare_headers()
+        try:
+            return await self._async_get_request(
+                f'{self.api_url}/v1/llmstxt/{id}',
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
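If you prefer not to block in the SDK's polling loop, the two methods above can be combined manually: start a job with async_generate_llms_text and poll check_generate_llms_text_status yourself. A sketch under the same assumptions as the previous example (class name, import path, and constructor argument are not part of this hunk):

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor shape assumed
        job = await app.async_generate_llms_text("https://example.com", max_urls=10)
        if not job.get("success"):
            raise RuntimeError(job.get("error"))
        while True:
            status = await app.check_generate_llms_text_status(job["id"])
            if status["status"] in ("completed", "failed"):
                break
            await asyncio.sleep(5)  # poll at your own cadence instead of the SDK's 2 s
        print(status["status"])

    asyncio.run(main())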
+    async def deep_research(
+            self,
+            query: str,
+            *,
+            max_depth: Optional[int] = None,
+            time_limit: Optional[int] = None,
+            max_urls: Optional[int] = None,
+            analysis_prompt: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            __experimental_stream_steps: Optional[bool] = None,
+            on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
+            on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
+        """
+        Initiates a deep research operation on a given query and polls until completion.
+
+        Args:
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+            on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
+            on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
+
+        Returns:
+            DeepResearchStatusResponse containing:
+            * success (bool): Whether research completed successfully
+            * status (str): Current state (processing/completed/failed)
+            * error (Optional[str]): Error message if failed
+            * id (str): Unique identifier for the research job
+            * data (Any): Research findings and analysis
+            * sources (List[Dict]): List of discovered sources
+            * activities (List[Dict]): Research progress log
+            * summaries (List[str]): Generated research summaries
+
+        Raises:
+            Exception: If research fails
+        """
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)
+
+        response = await self.async_deep_research(
+            query,
+            max_depth=max_depth,
+            time_limit=time_limit,
+            max_urls=max_urls,
+            analysis_prompt=analysis_prompt,
+            system_prompt=system_prompt
+        )
+        if not response.get('success') or 'id' not in response:
+            return response
+
+        job_id = response['id']
+        last_activity_count = 0
+        last_source_count = 0
+
+        while True:
+            status = await self.check_deep_research_status(job_id)
+
+            if on_activity and 'activities' in status:
+                new_activities = status['activities'][last_activity_count:]
+                for activity in new_activities:
+                    on_activity(activity)
+                last_activity_count = len(status['activities'])
+
+            if on_source and 'sources' in status:
+                new_sources = status['sources'][last_source_count:]
+                for source in new_sources:
+                    on_source(source)
+                last_source_count = len(status['sources'])
+
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise Exception(f'Deep research failed. Error: {status.get("error")}')
+            elif status['status'] != 'processing':
+                break
+
+            await asyncio.sleep(2)
+
+        return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
+
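A usage sketch for the blocking deep_research helper above, including the activity callback it documents. Only the method signature comes from this hunk; the client class name, import path, and constructor argument are assumptions:

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    def print_activity(activity):
        # Callback receives dicts shaped like {type, status, message, timestamp, depth}
        print(f"[depth {activity.get('depth')}] {activity.get('message')}")

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor shape assumed
        result = await app.deep_research(
            "How do solid-state batteries compare to lithium-ion?",
            max_depth=3,
            time_limit=180,
            max_urls=20,
            on_activity=print_activity,
        )
        if result.get("success"):
            print(result.get("data"))
            print(f"{len(result.get('sources', []))} sources discovered")

    asyncio.run(main())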
+    async def async_deep_research(
+            self,
+            query: str,
+            *,
+            max_depth: Optional[int] = None,
+            time_limit: Optional[int] = None,
+            max_urls: Optional[int] = None,
+            analysis_prompt: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
+        """
+        Initiates an asynchronous deep research operation.
+
+        Args:
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+
+        Returns:
+            Dict[str, Any]: A response containing:
+            * success (bool): Whether the research initiation was successful
+            * id (str): The unique identifier for the research job
+            * error (str, optional): Error message if initiation failed
+
+        Raises:
+            Exception: If the research initiation fails.
+        """
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)
+
+        headers = self._prepare_headers()
+
+        json_data = {'query': query, **research_params.dict(exclude_none=True)}
+        json_data['origin'] = f"python-sdk@{version}"
+
+        try:
+            return await self._async_post_request(
+                f'{self.api_url}/v1/deep-research',
+                json_data,
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
+    async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
+        """
+        Check the status of a deep research operation.
+
+        Args:
+            id (str): The ID of the deep research operation.
+
+        Returns:
+            DeepResearchStatusResponse containing:
+
+            Status:
+            * success - Whether research completed successfully
+            * status - Current state (processing/completed/failed)
+            * error - Error message if failed
+
+            Results:
+            * id - Unique identifier for the research job
+            * data - Research findings and analysis
+            * sources - List of discovered sources
+            * activities - Research progress log
+            * summaries - Generated research summaries
+
+        Raises:
+            Exception: If the status check fails.
+        """
+        headers = self._prepare_headers()
+        try:
+            return await self._async_get_request(
+                f'{self.api_url}/v1/deep-research/{id}',
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
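The same research flow can be driven without blocking by pairing async_deep_research with check_deep_research_status, mirroring the LLMs.txt example earlier. A short sketch, with the same assumptions about the client class, import path, and constructor:

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor shape assumed
        job = await app.async_deep_research("Adoption of RISC-V in embedded devices", max_urls=15)
        if job.get("success"):
            status = await app.check_deep_research_status(job["id"])
            print(status["status"], len(status.get("activities", [])))

    asyncio.run(main())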
+    async def search(
+            self,
+            query: str,
+            *,
+            limit: Optional[int] = None,
+            tbs: Optional[str] = None,
+            filter: Optional[str] = None,
+            lang: Optional[str] = None,
+            country: Optional[str] = None,
+            location: Optional[str] = None,
+            timeout: Optional[int] = None,
+            scrape_options: Optional[CommonOptions] = None,
+            params: Optional[Union[Dict[str, Any], SearchParams]] = None,
+            **kwargs) -> SearchResponse:
+        """
+        Asynchronously search for content using Firecrawl.
+
+        Args:
+            query (str): Search query string
+            limit (Optional[int]): Max results (default: 5)
+            tbs (Optional[str]): Time filter (e.g. "qdr:d")
+            filter (Optional[str]): Custom result filter
+            lang (Optional[str]): Language code (default: "en")
+            country (Optional[str]): Country code (default: "us")
+            location (Optional[str]): Geo-targeting
+            timeout (Optional[int]): Request timeout in milliseconds
+            scrape_options (Optional[CommonOptions]): Result scraping configuration
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
+            **kwargs: Additional keyword arguments for future compatibility
+
+        Returns:
+            SearchResponse: Response containing:
+            * success (bool): Whether request succeeded
+            * data (List[FirecrawlDocument]): Search results
+            * warning (Optional[str]): Warning message if any
+            * error (Optional[str]): Error message if any
+
+        Raises:
+            Exception: If search fails or response cannot be parsed
+        """
+        # Build search parameters
+        search_params = {}
+        if params:
+            if isinstance(params, dict):
+                search_params.update(params)
+            else:
+                search_params.update(params.dict(exclude_none=True))
+
+        # Add individual parameters
+        if limit is not None:
+            search_params['limit'] = limit
+        if tbs is not None:
+            search_params['tbs'] = tbs
+        if filter is not None:
+            search_params['filter'] = filter
+        if lang is not None:
+            search_params['lang'] = lang
+        if country is not None:
+            search_params['country'] = country
+        if location is not None:
+            search_params['location'] = location
+        if timeout is not None:
+            search_params['timeout'] = timeout
+        if scrape_options is not None:
+            search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+
+        # Add any additional kwargs
+        search_params.update(kwargs)
+
+        # Create final params object
+        final_params = SearchParams(query=query, **search_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        return await self._async_post_request(
+            f"{self.api_url}/v1/search",
+            params_dict,
+            {"Authorization": f"Bearer {self.api_key}"}
+        )
+
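A usage sketch for the async search method above. The exact field names on the returned documents are not spelled out in this hunk, so the example only prints the raw entries; class name, import path, and constructor argument are assumptions as before:

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor shape assumed
        results = await app.search("firecrawl python sdk", limit=3, lang="en", country="us")
        if results.get("success"):
            for doc in results.get("data", []):
                print(doc)  # FirecrawlDocument entries; exact fields not shown in this hunk

    asyncio.run(main())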
+class AsyncCrawlWatcher(CrawlWatcher):
+    """
+    Async version of CrawlWatcher that properly handles async operations.
+    """
+    def __init__(self, id: str, app: AsyncFirecrawlApp):
+        super().__init__(id, app)
+
+    async def connect(self) -> None:
+        """
+        Establishes async WebSocket connection and starts listening for messages.
+        """
+        async with websockets.connect(
+            self.ws_url,
+            additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
+        ) as websocket:
+            await self._listen(websocket)
+
+    async def _listen(self, websocket) -> None:
+        """
+        Listens for incoming WebSocket messages and handles them asynchronously.
+
+        Args:
+            websocket: The WebSocket connection object
+        """
+        async for message in websocket:
+            msg = json.loads(message)
+            await self._handle_message(msg)
+
+    async def _handle_message(self, msg: Dict[str, Any]) -> None:
+        """
+        Handles incoming WebSocket messages based on their type asynchronously.
+
+        Args:
+            msg (Dict[str, Any]): The message to handle
+        """
+        if msg['type'] == 'done':
+            self.status = 'completed'
+            self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
+        elif msg['type'] == 'error':
+            self.status = 'failed'
+            self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
+        elif msg['type'] == 'catchup':
+            self.status = msg['data']['status']
+            self.data.extend(msg['data'].get('data', []))
+            for doc in self.data:
+                self.dispatch_event('document', {'data': doc, 'id': self.id})
+        elif msg['type'] == 'document':
+            self.data.append(msg['data'])
+            self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
+
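A sketch of driving AsyncCrawlWatcher directly. Only the constructor and connect() appear in this hunk; how the crawl job is started (async_crawl_url below) and the client class, import paths, and constructor argument are assumptions that may differ in the released SDK:

    import asyncio
    from firecrawl import AsyncFirecrawlApp  # assumed import path
    from firecrawl.firecrawl import AsyncCrawlWatcher  # assumed import path

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # constructor shape assumed
        crawl = await app.async_crawl_url("https://example.com")  # assumed job-start helper
        watcher = AsyncCrawlWatcher(crawl["id"], app)
        await watcher.connect()  # runs until the WebSocket stream ends
        print(watcher.status, len(watcher.data))

    asyncio.run(main())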
+    async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
+        """
+        Handle errors from async API responses.
+        """
+        try:
+            error_data = await response.json()
+            error_message = error_data.get('error', 'No error message provided.')
+            error_details = error_data.get('details', 'No additional error details provided.')
+        except:
+            raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
+
+        # Use the app's method to get the error message
+        message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
+
+        raise aiohttp.ClientError(message)
+
+    async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
+        """
+        Generate a standardized error message based on HTTP status code for async operations.
+
+        Args:
+            status_code (int): The HTTP status code from the response
+            action (str): Description of the action that was being performed
+            error_message (str): The error message from the API response
+            error_details (str): Additional error details from the API response
+
+        Returns:
+            str: A formatted error message
+        """
+        return self._get_error_message(status_code, action, error_message, error_details)