firecrawl 1.17.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic. Click here for more details.
- firecrawl/__init__.py +1 -1
- firecrawl/firecrawl.py +3337 -416
- {firecrawl-1.17.0.dist-info → firecrawl-2.0.0.dist-info}/METADATA +3 -2
- firecrawl-2.0.0.dist-info/RECORD +12 -0
- firecrawl-1.17.0.dist-info/RECORD +0 -12
- {firecrawl-1.17.0.dist-info → firecrawl-2.0.0.dist-info}/LICENSE +0 -0
- {firecrawl-1.17.0.dist-info → firecrawl-2.0.0.dist-info}/WHEEL +0 -0
- {firecrawl-1.17.0.dist-info → firecrawl-2.0.0.dist-info}/top_level.txt +0 -0
firecrawl/firecrawl.py
CHANGED
|
@@ -12,15 +12,293 @@ Classes:
|
|
|
12
12
|
import logging
|
|
13
13
|
import os
|
|
14
14
|
import time
|
|
15
|
-
from typing import Any, Dict, Optional, List, Union, Callable
|
|
15
|
+
from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
|
|
16
16
|
import json
|
|
17
|
-
|
|
17
|
+
from datetime import datetime
|
|
18
|
+
import re
|
|
19
|
+
import warnings
|
|
18
20
|
import requests
|
|
19
21
|
import pydantic
|
|
20
22
|
import websockets
|
|
23
|
+
import aiohttp
|
|
24
|
+
import asyncio
|
|
25
|
+
from pydantic import Field
|
|
26
|
+
|
|
27
|
+
# Suppress Pydantic warnings about attribute shadowing
|
|
28
|
+
warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
|
|
29
|
+
warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
|
|
30
|
+
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractConfig\" shadows an attribute in parent \"BaseModel\"")
|
|
31
|
+
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_version():
|
|
35
|
+
try:
|
|
36
|
+
from pathlib import Path
|
|
37
|
+
package_path = os.path.dirname(__file__)
|
|
38
|
+
version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
|
|
39
|
+
version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
|
|
40
|
+
if version_match:
|
|
41
|
+
return version_match.group(1).strip()
|
|
42
|
+
except Exception:
|
|
43
|
+
print("Failed to get version from __init__.py")
|
|
44
|
+
return None
|
|
45
|
+
|
|
46
|
+
version = get_version()
|
|
21
47
|
|
|
22
48
|
logger : logging.Logger = logging.getLogger("firecrawl")
|
|
23
49
|
|
|
50
|
+
T = TypeVar('T')
|
|
51
|
+
|
|
52
|
+
# class FirecrawlDocumentMetadata(pydantic.BaseModel):
|
|
53
|
+
# """Metadata for a Firecrawl document."""
|
|
54
|
+
# title: Optional[str] = None
|
|
55
|
+
# description: Optional[str] = None
|
|
56
|
+
# language: Optional[str] = None
|
|
57
|
+
# keywords: Optional[str] = None
|
|
58
|
+
# robots: Optional[str] = None
|
|
59
|
+
# ogTitle: Optional[str] = None
|
|
60
|
+
# ogDescription: Optional[str] = None
|
|
61
|
+
# ogUrl: Optional[str] = None
|
|
62
|
+
# ogImage: Optional[str] = None
|
|
63
|
+
# ogAudio: Optional[str] = None
|
|
64
|
+
# ogDeterminer: Optional[str] = None
|
|
65
|
+
# ogLocale: Optional[str] = None
|
|
66
|
+
# ogLocaleAlternate: Optional[List[str]] = None
|
|
67
|
+
# ogSiteName: Optional[str] = None
|
|
68
|
+
# ogVideo: Optional[str] = None
|
|
69
|
+
# dctermsCreated: Optional[str] = None
|
|
70
|
+
# dcDateCreated: Optional[str] = None
|
|
71
|
+
# dcDate: Optional[str] = None
|
|
72
|
+
# dctermsType: Optional[str] = None
|
|
73
|
+
# dcType: Optional[str] = None
|
|
74
|
+
# dctermsAudience: Optional[str] = None
|
|
75
|
+
# dctermsSubject: Optional[str] = None
|
|
76
|
+
# dcSubject: Optional[str] = None
|
|
77
|
+
# dcDescription: Optional[str] = None
|
|
78
|
+
# dctermsKeywords: Optional[str] = None
|
|
79
|
+
# modifiedTime: Optional[str] = None
|
|
80
|
+
# publishedTime: Optional[str] = None
|
|
81
|
+
# articleTag: Optional[str] = None
|
|
82
|
+
# articleSection: Optional[str] = None
|
|
83
|
+
# sourceURL: Optional[str] = None
|
|
84
|
+
# statusCode: Optional[int] = None
|
|
85
|
+
# error: Optional[str] = None
|
|
86
|
+
|
|
87
|
+
class AgentOptions(pydantic.BaseModel):
|
|
88
|
+
"""Configuration for the agent."""
|
|
89
|
+
model: Literal["FIRE-1"] = "FIRE-1"
|
|
90
|
+
prompt: Optional[str] = None
|
|
91
|
+
|
|
92
|
+
class AgentOptionsExtract(pydantic.BaseModel):
|
|
93
|
+
"""Configuration for the agent in extract operations."""
|
|
94
|
+
model: Literal["FIRE-1"] = "FIRE-1"
|
|
95
|
+
|
|
96
|
+
class ActionsResult(pydantic.BaseModel):
|
|
97
|
+
"""Result of actions performed during scraping."""
|
|
98
|
+
screenshots: List[str]
|
|
99
|
+
|
|
100
|
+
class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
|
|
101
|
+
"""Document retrieved or processed by Firecrawl."""
|
|
102
|
+
url: Optional[str] = None
|
|
103
|
+
markdown: Optional[str] = None
|
|
104
|
+
html: Optional[str] = None
|
|
105
|
+
rawHtml: Optional[str] = None
|
|
106
|
+
links: Optional[List[str]] = None
|
|
107
|
+
extract: Optional[T] = None
|
|
108
|
+
json: Optional[T] = None
|
|
109
|
+
screenshot: Optional[str] = None
|
|
110
|
+
metadata: Optional[Any] = None
|
|
111
|
+
actions: Optional[ActionsResult] = None
|
|
112
|
+
title: Optional[str] = None # v1 search only
|
|
113
|
+
description: Optional[str] = None # v1 search only
|
|
114
|
+
|
|
115
|
+
class LocationConfig(pydantic.BaseModel):
|
|
116
|
+
"""Location configuration for scraping."""
|
|
117
|
+
country: Optional[str] = None
|
|
118
|
+
languages: Optional[List[str]] = None
|
|
119
|
+
|
|
120
|
+
class WebhookConfig(pydantic.BaseModel):
|
|
121
|
+
"""Configuration for webhooks."""
|
|
122
|
+
url: str
|
|
123
|
+
headers: Optional[Dict[str, str]] = None
|
|
124
|
+
metadata: Optional[Dict[str, str]] = None
|
|
125
|
+
events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
|
|
126
|
+
|
|
127
|
+
class CommonOptions(pydantic.BaseModel):
|
|
128
|
+
"""Parameters for scraping operations."""
|
|
129
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None
|
|
130
|
+
headers: Optional[Dict[str, str]] = None
|
|
131
|
+
includeTags: Optional[List[str]] = None
|
|
132
|
+
excludeTags: Optional[List[str]] = None
|
|
133
|
+
onlyMainContent: Optional[bool] = None
|
|
134
|
+
waitFor: Optional[int] = None
|
|
135
|
+
timeout: Optional[int] = None
|
|
136
|
+
location: Optional[LocationConfig] = None
|
|
137
|
+
mobile: Optional[bool] = None
|
|
138
|
+
skipTlsVerification: Optional[bool] = None
|
|
139
|
+
removeBase64Images: Optional[bool] = None
|
|
140
|
+
blockAds: Optional[bool] = None
|
|
141
|
+
proxy: Optional[Literal["basic", "stealth"]] = None
|
|
142
|
+
|
|
143
|
+
class WaitAction(pydantic.BaseModel):
|
|
144
|
+
"""Wait action to perform during scraping."""
|
|
145
|
+
type: Literal["wait"]
|
|
146
|
+
milliseconds: int
|
|
147
|
+
selector: Optional[str] = None
|
|
148
|
+
|
|
149
|
+
class ScreenshotAction(pydantic.BaseModel):
|
|
150
|
+
"""Screenshot action to perform during scraping."""
|
|
151
|
+
type: Literal["screenshot"]
|
|
152
|
+
fullPage: Optional[bool] = None
|
|
153
|
+
|
|
154
|
+
class ClickAction(pydantic.BaseModel):
|
|
155
|
+
"""Click action to perform during scraping."""
|
|
156
|
+
type: Literal["click"]
|
|
157
|
+
selector: str
|
|
158
|
+
|
|
159
|
+
class WriteAction(pydantic.BaseModel):
|
|
160
|
+
"""Write action to perform during scraping."""
|
|
161
|
+
type: Literal["write"]
|
|
162
|
+
text: str
|
|
163
|
+
|
|
164
|
+
class PressAction(pydantic.BaseModel):
|
|
165
|
+
"""Press action to perform during scraping."""
|
|
166
|
+
type: Literal["press"]
|
|
167
|
+
key: str
|
|
168
|
+
|
|
169
|
+
class ScrollAction(pydantic.BaseModel):
|
|
170
|
+
"""Scroll action to perform during scraping."""
|
|
171
|
+
type: Literal["scroll"]
|
|
172
|
+
direction: Literal["up", "down"]
|
|
173
|
+
selector: Optional[str] = None
|
|
174
|
+
|
|
175
|
+
class ScrapeAction(pydantic.BaseModel):
|
|
176
|
+
"""Scrape action to perform during scraping."""
|
|
177
|
+
type: Literal["scrape"]
|
|
178
|
+
|
|
179
|
+
class ExecuteJavascriptAction(pydantic.BaseModel):
|
|
180
|
+
"""Execute javascript action to perform during scraping."""
|
|
181
|
+
type: Literal["executeJavascript"]
|
|
182
|
+
script: str
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class ExtractAgent(pydantic.BaseModel):
|
|
186
|
+
"""Configuration for the agent in extract operations."""
|
|
187
|
+
model: Literal["FIRE-1"] = "FIRE-1"
|
|
188
|
+
|
|
189
|
+
class ExtractConfig(pydantic.BaseModel):
|
|
190
|
+
"""Configuration for extraction."""
|
|
191
|
+
prompt: Optional[str] = None
|
|
192
|
+
schema: Optional[Any] = None
|
|
193
|
+
systemPrompt: Optional[str] = None
|
|
194
|
+
agent: Optional[ExtractAgent] = None
|
|
195
|
+
|
|
196
|
+
class ScrapeParams(CommonOptions):
|
|
197
|
+
"""Parameters for scraping operations."""
|
|
198
|
+
extract: Optional[ExtractConfig] = None
|
|
199
|
+
jsonOptions: Optional[ExtractConfig] = None
|
|
200
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
|
|
201
|
+
agent: Optional[AgentOptions] = None
|
|
202
|
+
|
|
203
|
+
class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
|
|
204
|
+
"""Response from scraping operations."""
|
|
205
|
+
success: bool = True
|
|
206
|
+
warning: Optional[str] = None
|
|
207
|
+
error: Optional[str] = None
|
|
208
|
+
|
|
209
|
+
class BatchScrapeResponse(pydantic.BaseModel):
|
|
210
|
+
"""Response from batch scrape operations."""
|
|
211
|
+
id: Optional[str] = None
|
|
212
|
+
url: Optional[str] = None
|
|
213
|
+
success: bool = True
|
|
214
|
+
error: Optional[str] = None
|
|
215
|
+
invalidURLs: Optional[List[str]] = None
|
|
216
|
+
|
|
217
|
+
class BatchScrapeStatusResponse(pydantic.BaseModel):
|
|
218
|
+
"""Response from batch scrape status checks."""
|
|
219
|
+
success: bool = True
|
|
220
|
+
status: Literal["scraping", "completed", "failed", "cancelled"]
|
|
221
|
+
completed: int
|
|
222
|
+
total: int
|
|
223
|
+
creditsUsed: int
|
|
224
|
+
expiresAt: datetime
|
|
225
|
+
next: Optional[str] = None
|
|
226
|
+
data: List[FirecrawlDocument]
|
|
227
|
+
|
|
228
|
+
class CrawlParams(pydantic.BaseModel):
|
|
229
|
+
"""Parameters for crawling operations."""
|
|
230
|
+
includePaths: Optional[List[str]] = None
|
|
231
|
+
excludePaths: Optional[List[str]] = None
|
|
232
|
+
maxDepth: Optional[int] = None
|
|
233
|
+
maxDiscoveryDepth: Optional[int] = None
|
|
234
|
+
limit: Optional[int] = None
|
|
235
|
+
allowBackwardLinks: Optional[bool] = None
|
|
236
|
+
allowExternalLinks: Optional[bool] = None
|
|
237
|
+
ignoreSitemap: Optional[bool] = None
|
|
238
|
+
scrapeOptions: Optional[CommonOptions] = None
|
|
239
|
+
webhook: Optional[Union[str, WebhookConfig]] = None
|
|
240
|
+
deduplicateSimilarURLs: Optional[bool] = None
|
|
241
|
+
ignoreQueryParameters: Optional[bool] = None
|
|
242
|
+
regexOnFullURL: Optional[bool] = None
|
|
243
|
+
|
|
244
|
+
class CrawlResponse(pydantic.BaseModel):
|
|
245
|
+
"""Response from crawling operations."""
|
|
246
|
+
id: Optional[str] = None
|
|
247
|
+
url: Optional[str] = None
|
|
248
|
+
success: bool = True
|
|
249
|
+
error: Optional[str] = None
|
|
250
|
+
|
|
251
|
+
class CrawlStatusResponse(pydantic.BaseModel):
|
|
252
|
+
"""Response from crawl status checks."""
|
|
253
|
+
success: bool = True
|
|
254
|
+
status: Literal["scraping", "completed", "failed", "cancelled"]
|
|
255
|
+
completed: int
|
|
256
|
+
total: int
|
|
257
|
+
creditsUsed: int
|
|
258
|
+
expiresAt: datetime
|
|
259
|
+
next: Optional[str] = None
|
|
260
|
+
data: List[FirecrawlDocument]
|
|
261
|
+
|
|
262
|
+
class CrawlErrorsResponse(pydantic.BaseModel):
|
|
263
|
+
"""Response from crawl/batch scrape error monitoring."""
|
|
264
|
+
errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
|
|
265
|
+
robotsBlocked: List[str]
|
|
266
|
+
|
|
267
|
+
class MapParams(pydantic.BaseModel):
|
|
268
|
+
"""Parameters for mapping operations."""
|
|
269
|
+
search: Optional[str] = None
|
|
270
|
+
ignoreSitemap: Optional[bool] = None
|
|
271
|
+
includeSubdomains: Optional[bool] = None
|
|
272
|
+
sitemapOnly: Optional[bool] = None
|
|
273
|
+
limit: Optional[int] = None
|
|
274
|
+
timeout: Optional[int] = None
|
|
275
|
+
|
|
276
|
+
class MapResponse(pydantic.BaseModel):
|
|
277
|
+
"""Response from mapping operations."""
|
|
278
|
+
success: bool = True
|
|
279
|
+
links: Optional[List[str]] = None
|
|
280
|
+
error: Optional[str] = None
|
|
281
|
+
|
|
282
|
+
class ExtractParams(pydantic.BaseModel):
|
|
283
|
+
"""Parameters for extracting information from URLs."""
|
|
284
|
+
prompt: Optional[str] = None
|
|
285
|
+
schema: Optional[Any] = None
|
|
286
|
+
systemPrompt: Optional[str] = None
|
|
287
|
+
allowExternalLinks: Optional[bool] = None
|
|
288
|
+
enableWebSearch: Optional[bool] = None
|
|
289
|
+
includeSubdomains: Optional[bool] = None
|
|
290
|
+
origin: Optional[str] = None
|
|
291
|
+
showSources: Optional[bool] = None
|
|
292
|
+
scrapeOptions: Optional[CommonOptions] = None
|
|
293
|
+
|
|
294
|
+
class ExtractResponse(pydantic.BaseModel, Generic[T]):
|
|
295
|
+
"""Response from extract operations."""
|
|
296
|
+
success: bool = True
|
|
297
|
+
data: Optional[T] = None
|
|
298
|
+
error: Optional[str] = None
|
|
299
|
+
warning: Optional[str] = None
|
|
300
|
+
sources: Optional[List[str]] = None
|
|
301
|
+
|
|
24
302
|
class SearchParams(pydantic.BaseModel):
|
|
25
303
|
query: str
|
|
26
304
|
limit: Optional[int] = 5
|
|
@@ -31,7 +309,14 @@ class SearchParams(pydantic.BaseModel):
|
|
|
31
309
|
location: Optional[str] = None
|
|
32
310
|
origin: Optional[str] = "api"
|
|
33
311
|
timeout: Optional[int] = 60000
|
|
34
|
-
scrapeOptions: Optional[
|
|
312
|
+
scrapeOptions: Optional[CommonOptions] = None
|
|
313
|
+
|
|
314
|
+
class SearchResponse(pydantic.BaseModel):
|
|
315
|
+
"""Response from search operations."""
|
|
316
|
+
success: bool = True
|
|
317
|
+
data: List[FirecrawlDocument]
|
|
318
|
+
warning: Optional[str] = None
|
|
319
|
+
error: Optional[str] = None
|
|
35
320
|
|
|
36
321
|
class GenerateLLMsTextParams(pydantic.BaseModel):
|
|
37
322
|
"""
|
|
@@ -75,6 +360,24 @@ class DeepResearchStatusResponse(pydantic.BaseModel):
|
|
|
75
360
|
sources: List[Dict[str, Any]]
|
|
76
361
|
summaries: List[str]
|
|
77
362
|
|
|
363
|
+
class GenerateLLMsTextResponse(pydantic.BaseModel):
|
|
364
|
+
"""Response from LLMs.txt generation operations."""
|
|
365
|
+
success: bool = True
|
|
366
|
+
id: str
|
|
367
|
+
error: Optional[str] = None
|
|
368
|
+
|
|
369
|
+
class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
|
|
370
|
+
llmstxt: str
|
|
371
|
+
llmsfulltxt: Optional[str] = None
|
|
372
|
+
|
|
373
|
+
class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
|
|
374
|
+
"""Status response from LLMs.txt generation operations."""
|
|
375
|
+
success: bool = True
|
|
376
|
+
data: Optional[GenerateLLMsTextStatusResponseData] = None
|
|
377
|
+
status: Literal["processing", "completed", "failed"]
|
|
378
|
+
error: Optional[str] = None
|
|
379
|
+
expiresAt: str
|
|
380
|
+
|
|
78
381
|
class ChangeTrackingData(pydantic.BaseModel):
|
|
79
382
|
"""
|
|
80
383
|
Data for the change tracking format.
|
|
@@ -84,42 +387,39 @@ class ChangeTrackingData(pydantic.BaseModel):
|
|
|
84
387
|
visibility: str # "visible" | "hidden"
|
|
85
388
|
diff: Optional[Dict[str, Any]] = None
|
|
86
389
|
json: Optional[Any] = None
|
|
390
|
+
|
|
391
|
+
class SearchResponse(pydantic.BaseModel):
|
|
392
|
+
"""
|
|
393
|
+
Response from the search operation.
|
|
394
|
+
"""
|
|
395
|
+
success: bool
|
|
396
|
+
data: List[Dict[str, Any]]
|
|
397
|
+
warning: Optional[str] = None
|
|
398
|
+
error: Optional[str] = None
|
|
87
399
|
|
|
88
|
-
class
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
show_sources: Optional[bool] = False
|
|
110
|
-
agent: Optional[Dict[str, Any]] = None
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
class ExtractResponse(pydantic.BaseModel):
|
|
116
|
-
"""
|
|
117
|
-
Response from the extract operation.
|
|
118
|
-
"""
|
|
119
|
-
success: bool
|
|
120
|
-
data: Optional[Any] = None
|
|
121
|
-
error: Optional[str] = None
|
|
400
|
+
class ExtractParams(pydantic.BaseModel):
|
|
401
|
+
"""
|
|
402
|
+
Parameters for the extract operation.
|
|
403
|
+
"""
|
|
404
|
+
prompt: Optional[str] = None
|
|
405
|
+
schema: Optional[Any] = pydantic.Field(None, alias='schema')
|
|
406
|
+
system_prompt: Optional[str] = None
|
|
407
|
+
allow_external_links: Optional[bool] = False
|
|
408
|
+
enable_web_search: Optional[bool] = False
|
|
409
|
+
# Just for backwards compatibility
|
|
410
|
+
enableWebSearch: Optional[bool] = False
|
|
411
|
+
show_sources: Optional[bool] = False
|
|
412
|
+
agent: Optional[Dict[str, Any]] = None
|
|
413
|
+
|
|
414
|
+
class ExtractResponse(pydantic.BaseModel, Generic[T]):
|
|
415
|
+
"""
|
|
416
|
+
Response from the extract operation.
|
|
417
|
+
"""
|
|
418
|
+
success: bool
|
|
419
|
+
data: Optional[T] = None
|
|
420
|
+
error: Optional[str] = None
|
|
122
421
|
|
|
422
|
+
class FirecrawlApp:
|
|
123
423
|
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
|
124
424
|
"""
|
|
125
425
|
Initialize the FirecrawlApp instance with API key, API URL.
|
|
@@ -138,200 +438,451 @@ class FirecrawlApp:
|
|
|
138
438
|
|
|
139
439
|
logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
|
|
140
440
|
|
|
141
|
-
def scrape_url(
|
|
441
|
+
def scrape_url(
|
|
442
|
+
self,
|
|
443
|
+
url: str,
|
|
444
|
+
*,
|
|
445
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
446
|
+
include_tags: Optional[List[str]] = None,
|
|
447
|
+
exclude_tags: Optional[List[str]] = None,
|
|
448
|
+
only_main_content: Optional[bool] = None,
|
|
449
|
+
wait_for: Optional[int] = None,
|
|
450
|
+
timeout: Optional[int] = None,
|
|
451
|
+
location: Optional[LocationConfig] = None,
|
|
452
|
+
mobile: Optional[bool] = None,
|
|
453
|
+
skip_tls_verification: Optional[bool] = None,
|
|
454
|
+
remove_base64_images: Optional[bool] = None,
|
|
455
|
+
block_ads: Optional[bool] = None,
|
|
456
|
+
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
457
|
+
extract: Optional[ExtractConfig] = None,
|
|
458
|
+
json_options: Optional[ExtractConfig] = None,
|
|
459
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
460
|
+
**kwargs) -> ScrapeResponse[Any]:
|
|
142
461
|
"""
|
|
143
|
-
Scrape
|
|
462
|
+
Scrape and extract content from a URL.
|
|
144
463
|
|
|
145
464
|
Args:
|
|
146
|
-
|
|
147
|
-
|
|
465
|
+
url (str): Target URL to scrape
|
|
466
|
+
formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
|
|
467
|
+
include_tags (Optional[List[str]]): HTML tags to include
|
|
468
|
+
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
469
|
+
only_main_content (Optional[bool]): Extract main content only
|
|
470
|
+
wait_for (Optional[int]): Wait for a specific element to appear
|
|
471
|
+
timeout (Optional[int]): Request timeout (ms)
|
|
472
|
+
location (Optional[LocationConfig]): Location configuration
|
|
473
|
+
mobile (Optional[bool]): Use mobile user agent
|
|
474
|
+
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
475
|
+
remove_base64_images (Optional[bool]): Remove base64 images
|
|
476
|
+
block_ads (Optional[bool]): Block ads
|
|
477
|
+
proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
|
|
478
|
+
extract (Optional[ExtractConfig]): Content extraction settings
|
|
479
|
+
json_options (Optional[ExtractConfig]): JSON extraction settings
|
|
480
|
+
actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
|
|
481
|
+
|
|
148
482
|
|
|
149
483
|
Returns:
|
|
150
|
-
|
|
484
|
+
ScrapeResponse with:
|
|
485
|
+
* Requested content formats
|
|
486
|
+
* Page metadata
|
|
487
|
+
* Extraction results
|
|
488
|
+
* Success/error status
|
|
151
489
|
|
|
152
490
|
Raises:
|
|
153
|
-
|
|
491
|
+
Exception: If scraping fails
|
|
154
492
|
"""
|
|
155
|
-
|
|
156
493
|
headers = self._prepare_headers()
|
|
157
494
|
|
|
158
|
-
#
|
|
159
|
-
scrape_params = {
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
# Handle extract (for v1)
|
|
164
|
-
extract = params.get('extract', {})
|
|
165
|
-
if extract:
|
|
166
|
-
if 'schema' in extract and hasattr(extract['schema'], 'schema'):
|
|
167
|
-
extract['schema'] = extract['schema'].schema()
|
|
168
|
-
scrape_params['extract'] = extract
|
|
169
|
-
|
|
170
|
-
# Include any other params directly at the top level of scrape_params
|
|
171
|
-
for key, value in params.items():
|
|
172
|
-
if key not in ['extract']:
|
|
173
|
-
scrape_params[key] = value
|
|
174
|
-
|
|
175
|
-
json = params.get("jsonOptions", {})
|
|
176
|
-
if json:
|
|
177
|
-
if 'schema' in json and hasattr(json['schema'], 'schema'):
|
|
178
|
-
json['schema'] = json['schema'].schema()
|
|
179
|
-
scrape_params['jsonOptions'] = json
|
|
180
|
-
|
|
181
|
-
change_tracking = params.get("changeTrackingOptions", {})
|
|
182
|
-
if change_tracking:
|
|
183
|
-
scrape_params['changeTrackingOptions'] = change_tracking
|
|
184
|
-
|
|
185
|
-
# Include any other params directly at the top level of scrape_params
|
|
186
|
-
for key, value in params.items():
|
|
187
|
-
if key not in ['jsonOptions', 'changeTrackingOptions', 'agent']:
|
|
188
|
-
scrape_params[key] = value
|
|
189
|
-
|
|
190
|
-
agent = params.get('agent')
|
|
191
|
-
if agent:
|
|
192
|
-
scrape_params['agent'] = agent
|
|
193
|
-
|
|
495
|
+
# Build scrape parameters
|
|
496
|
+
scrape_params = {
|
|
497
|
+
'url': url,
|
|
498
|
+
'origin': f"python-sdk@{version}"
|
|
499
|
+
}
|
|
194
500
|
|
|
195
|
-
|
|
196
|
-
|
|
501
|
+
# Add optional parameters if provided
|
|
502
|
+
if formats:
|
|
503
|
+
scrape_params['formats'] = formats
|
|
504
|
+
if include_tags:
|
|
505
|
+
scrape_params['includeTags'] = include_tags
|
|
506
|
+
if exclude_tags:
|
|
507
|
+
scrape_params['excludeTags'] = exclude_tags
|
|
508
|
+
if only_main_content is not None:
|
|
509
|
+
scrape_params['onlyMainContent'] = only_main_content
|
|
510
|
+
if wait_for:
|
|
511
|
+
scrape_params['waitFor'] = wait_for
|
|
512
|
+
if timeout:
|
|
513
|
+
scrape_params['timeout'] = timeout
|
|
514
|
+
if location:
|
|
515
|
+
scrape_params['location'] = location.dict(exclude_none=True)
|
|
516
|
+
if mobile is not None:
|
|
517
|
+
scrape_params['mobile'] = mobile
|
|
518
|
+
if skip_tls_verification is not None:
|
|
519
|
+
scrape_params['skipTlsVerification'] = skip_tls_verification
|
|
520
|
+
if remove_base64_images is not None:
|
|
521
|
+
scrape_params['removeBase64Images'] = remove_base64_images
|
|
522
|
+
if block_ads is not None:
|
|
523
|
+
scrape_params['blockAds'] = block_ads
|
|
524
|
+
if proxy:
|
|
525
|
+
scrape_params['proxy'] = proxy
|
|
526
|
+
if extract:
|
|
527
|
+
if hasattr(extract.schema, 'schema'):
|
|
528
|
+
extract.schema = extract.schema.schema()
|
|
529
|
+
scrape_params['extract'] = extract.dict(exclude_none=True)
|
|
530
|
+
if json_options:
|
|
531
|
+
if hasattr(json_options.schema, 'schema'):
|
|
532
|
+
json_options.schema = json_options.schema.schema()
|
|
533
|
+
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
|
|
534
|
+
if actions:
|
|
535
|
+
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
536
|
+
scrape_params.update(kwargs)
|
|
537
|
+
|
|
538
|
+
# Make request
|
|
197
539
|
response = requests.post(
|
|
198
|
-
f'{self.api_url}
|
|
540
|
+
f'{self.api_url}/v1/scrape',
|
|
199
541
|
headers=headers,
|
|
200
542
|
json=scrape_params,
|
|
201
|
-
timeout=(
|
|
543
|
+
timeout=(timeout + 5000 if timeout else None)
|
|
202
544
|
)
|
|
545
|
+
|
|
203
546
|
if response.status_code == 200:
|
|
204
547
|
try:
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
raise Exception(
|
|
548
|
+
response_json = response.json()
|
|
549
|
+
if response_json.get('success') and 'data' in response_json:
|
|
550
|
+
return ScrapeResponse(**response_json['data'])
|
|
551
|
+
elif "error" in response_json:
|
|
552
|
+
raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
|
|
553
|
+
else:
|
|
554
|
+
raise Exception(f'Failed to scrape URL. Error: {response_json}')
|
|
555
|
+
except ValueError:
|
|
556
|
+
raise Exception('Failed to parse Firecrawl response as JSON.')
|
|
214
557
|
else:
|
|
215
558
|
self._handle_error(response, 'scrape URL')
|
|
216
559
|
|
|
217
|
-
def search(
|
|
560
|
+
def search(
|
|
561
|
+
self,
|
|
562
|
+
query: str,
|
|
563
|
+
*,
|
|
564
|
+
limit: Optional[int] = None,
|
|
565
|
+
tbs: Optional[str] = None,
|
|
566
|
+
filter: Optional[str] = None,
|
|
567
|
+
lang: Optional[str] = None,
|
|
568
|
+
country: Optional[str] = None,
|
|
569
|
+
location: Optional[str] = None,
|
|
570
|
+
timeout: Optional[int] = None,
|
|
571
|
+
scrape_options: Optional[CommonOptions] = None,
|
|
572
|
+
params: Optional[Union[Dict[str, Any], SearchParams]] = None,
|
|
573
|
+
**kwargs) -> SearchResponse:
|
|
218
574
|
"""
|
|
219
|
-
Search for content using
|
|
575
|
+
Search for content using Firecrawl.
|
|
220
576
|
|
|
221
577
|
Args:
|
|
222
|
-
query (str):
|
|
223
|
-
|
|
578
|
+
query (str): Search query string
|
|
579
|
+
limit (Optional[int]): Max results (default: 5)
|
|
580
|
+
tbs (Optional[str]): Time filter (e.g. "qdr:d")
|
|
581
|
+
filter (Optional[str]): Custom result filter
|
|
582
|
+
lang (Optional[str]): Language code (default: "en")
|
|
583
|
+
country (Optional[str]): Country code (default: "us")
|
|
584
|
+
location (Optional[str]): Geo-targeting
|
|
585
|
+
timeout (Optional[int]): Request timeout in milliseconds
|
|
586
|
+
scrape_options (Optional[CommonOptions]): Result scraping configuration
|
|
587
|
+
params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
|
|
588
|
+
**kwargs: Additional keyword arguments for future compatibility
|
|
224
589
|
|
|
225
590
|
Returns:
|
|
226
|
-
|
|
591
|
+
SearchResponse: Response containing:
|
|
592
|
+
* success (bool): Whether request succeeded
|
|
593
|
+
* data (List[FirecrawlDocument]): Search results
|
|
594
|
+
* warning (Optional[str]): Warning message if any
|
|
595
|
+
* error (Optional[str]): Error message if any
|
|
596
|
+
|
|
597
|
+
Raises:
|
|
598
|
+
Exception: If search fails or response cannot be parsed
|
|
227
599
|
"""
|
|
228
|
-
|
|
229
|
-
|
|
600
|
+
# Build search parameters
|
|
601
|
+
search_params = {}
|
|
602
|
+
if params:
|
|
603
|
+
if isinstance(params, dict):
|
|
604
|
+
search_params.update(params)
|
|
605
|
+
else:
|
|
606
|
+
search_params.update(params.dict(exclude_none=True))
|
|
607
|
+
|
|
608
|
+
# Add individual parameters
|
|
609
|
+
if limit is not None:
|
|
610
|
+
search_params['limit'] = limit
|
|
611
|
+
if tbs is not None:
|
|
612
|
+
search_params['tbs'] = tbs
|
|
613
|
+
if filter is not None:
|
|
614
|
+
search_params['filter'] = filter
|
|
615
|
+
if lang is not None:
|
|
616
|
+
search_params['lang'] = lang
|
|
617
|
+
if country is not None:
|
|
618
|
+
search_params['country'] = country
|
|
619
|
+
if location is not None:
|
|
620
|
+
search_params['location'] = location
|
|
621
|
+
if timeout is not None:
|
|
622
|
+
search_params['timeout'] = timeout
|
|
623
|
+
if scrape_options is not None:
|
|
624
|
+
search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
|
|
625
|
+
|
|
626
|
+
# Add any additional kwargs
|
|
627
|
+
search_params.update(kwargs)
|
|
230
628
|
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
search_params.query = query
|
|
629
|
+
# Create final params object
|
|
630
|
+
final_params = SearchParams(query=query, **search_params)
|
|
631
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
632
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
236
633
|
|
|
634
|
+
# Make request
|
|
237
635
|
response = requests.post(
|
|
238
636
|
f"{self.api_url}/v1/search",
|
|
239
637
|
headers={"Authorization": f"Bearer {self.api_key}"},
|
|
240
|
-
json=
|
|
638
|
+
json=params_dict
|
|
241
639
|
)
|
|
242
640
|
|
|
243
|
-
if response.status_code
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
641
|
+
if response.status_code == 200:
|
|
642
|
+
try:
|
|
643
|
+
response_json = response.json()
|
|
644
|
+
if response_json.get('success') and 'data' in response_json:
|
|
645
|
+
return SearchResponse(**response_json)
|
|
646
|
+
elif "error" in response_json:
|
|
647
|
+
raise Exception(f'Search failed. Error: {response_json["error"]}')
|
|
648
|
+
else:
|
|
649
|
+
raise Exception(f'Search failed. Error: {response_json}')
|
|
650
|
+
except ValueError:
|
|
651
|
+
raise Exception('Failed to parse Firecrawl response as JSON.')
|
|
652
|
+
else:
|
|
653
|
+
self._handle_error(response, 'search')
|
|
654
|
+
|
|
655
|
+
def crawl_url(
|
|
656
|
+
self,
|
|
657
|
+
url: str,
|
|
658
|
+
*,
|
|
659
|
+
include_paths: Optional[List[str]] = None,
|
|
660
|
+
exclude_paths: Optional[List[str]] = None,
|
|
661
|
+
max_depth: Optional[int] = None,
|
|
662
|
+
max_discovery_depth: Optional[int] = None,
|
|
663
|
+
limit: Optional[int] = None,
|
|
664
|
+
allow_backward_links: Optional[bool] = None,
|
|
665
|
+
allow_external_links: Optional[bool] = None,
|
|
666
|
+
ignore_sitemap: Optional[bool] = None,
|
|
667
|
+
scrape_options: Optional[CommonOptions] = None,
|
|
668
|
+
webhook: Optional[Union[str, WebhookConfig]] = None,
|
|
669
|
+
deduplicate_similar_urls: Optional[bool] = None,
|
|
670
|
+
ignore_query_parameters: Optional[bool] = None,
|
|
671
|
+
regex_on_full_url: Optional[bool] = None,
|
|
672
|
+
poll_interval: Optional[int] = 2,
|
|
673
|
+
idempotency_key: Optional[str] = None,
|
|
674
|
+
**kwargs
|
|
675
|
+
) -> CrawlStatusResponse:
|
|
255
676
|
"""
|
|
256
|
-
|
|
677
|
+
Crawl a website starting from a URL.
|
|
257
678
|
|
|
258
679
|
Args:
|
|
259
|
-
url (str):
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
680
|
+
url (str): Target URL to start crawling from
|
|
681
|
+
include_paths (Optional[List[str]]): Patterns of URLs to include
|
|
682
|
+
exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
|
683
|
+
max_depth (Optional[int]): Maximum crawl depth
|
|
684
|
+
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
685
|
+
limit (Optional[int]): Maximum pages to crawl
|
|
686
|
+
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
687
|
+
allow_external_links (Optional[bool]): Follow external domain links
|
|
688
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
689
|
+
scrape_options (Optional[CommonOptions]): Page scraping configuration
|
|
690
|
+
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
|
691
|
+
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
692
|
+
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
693
|
+
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
694
|
+
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
695
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
696
|
+
**kwargs: Additional parameters to pass to the API
|
|
263
697
|
|
|
264
698
|
Returns:
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
- 'total' (int): Total number of scraped pages.
|
|
270
|
-
- 'creditsUsed' (int): Estimated number of API credits used for this crawl.
|
|
271
|
-
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the crawl data expires.
|
|
272
|
-
- 'data' (List[Dict]): List of all the scraped pages.
|
|
699
|
+
CrawlStatusResponse with:
|
|
700
|
+
* Crawling status and progress
|
|
701
|
+
* Crawled page contents
|
|
702
|
+
* Success/error information
|
|
273
703
|
|
|
274
704
|
Raises:
|
|
275
|
-
Exception: If
|
|
705
|
+
Exception: If crawl fails
|
|
276
706
|
"""
|
|
277
|
-
|
|
707
|
+
crawl_params = {}
|
|
708
|
+
|
|
709
|
+
# Add individual parameters
|
|
710
|
+
if include_paths is not None:
|
|
711
|
+
crawl_params['includePaths'] = include_paths
|
|
712
|
+
if exclude_paths is not None:
|
|
713
|
+
crawl_params['excludePaths'] = exclude_paths
|
|
714
|
+
if max_depth is not None:
|
|
715
|
+
crawl_params['maxDepth'] = max_depth
|
|
716
|
+
if max_discovery_depth is not None:
|
|
717
|
+
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
718
|
+
if limit is not None:
|
|
719
|
+
crawl_params['limit'] = limit
|
|
720
|
+
if allow_backward_links is not None:
|
|
721
|
+
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
722
|
+
if allow_external_links is not None:
|
|
723
|
+
crawl_params['allowExternalLinks'] = allow_external_links
|
|
724
|
+
if ignore_sitemap is not None:
|
|
725
|
+
crawl_params['ignoreSitemap'] = ignore_sitemap
|
|
726
|
+
if scrape_options is not None:
|
|
727
|
+
crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
|
|
728
|
+
if webhook is not None:
|
|
729
|
+
crawl_params['webhook'] = webhook
|
|
730
|
+
if deduplicate_similar_urls is not None:
|
|
731
|
+
crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
|
|
732
|
+
if ignore_query_parameters is not None:
|
|
733
|
+
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
|
|
734
|
+
if regex_on_full_url is not None:
|
|
735
|
+
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
736
|
+
|
|
737
|
+
# Add any additional kwargs
|
|
738
|
+
crawl_params.update(kwargs)
|
|
739
|
+
|
|
740
|
+
# Create final params object
|
|
741
|
+
final_params = CrawlParams(**crawl_params)
|
|
742
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
743
|
+
params_dict['url'] = url
|
|
744
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
745
|
+
|
|
746
|
+
# Make request
|
|
278
747
|
headers = self._prepare_headers(idempotency_key)
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
json_data.update(params)
|
|
282
|
-
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
|
|
748
|
+
response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
|
|
749
|
+
|
|
283
750
|
if response.status_code == 200:
|
|
284
751
|
try:
|
|
285
752
|
id = response.json().get('id')
|
|
286
753
|
except:
|
|
287
754
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
288
755
|
return self._monitor_job_status(id, headers, poll_interval)
|
|
289
|
-
|
|
290
756
|
else:
|
|
291
757
|
self._handle_error(response, 'start crawl job')
|
|
292
758
|
|
|
293
|
-
|
|
294
|
-
|
|
759
|
+
def async_crawl_url(
|
|
760
|
+
self,
|
|
761
|
+
url: str,
|
|
762
|
+
*,
|
|
763
|
+
include_paths: Optional[List[str]] = None,
|
|
764
|
+
exclude_paths: Optional[List[str]] = None,
|
|
765
|
+
max_depth: Optional[int] = None,
|
|
766
|
+
max_discovery_depth: Optional[int] = None,
|
|
767
|
+
limit: Optional[int] = None,
|
|
768
|
+
allow_backward_links: Optional[bool] = None,
|
|
769
|
+
allow_external_links: Optional[bool] = None,
|
|
770
|
+
ignore_sitemap: Optional[bool] = None,
|
|
771
|
+
scrape_options: Optional[CommonOptions] = None,
|
|
772
|
+
webhook: Optional[Union[str, WebhookConfig]] = None,
|
|
773
|
+
deduplicate_similar_urls: Optional[bool] = None,
|
|
774
|
+
ignore_query_parameters: Optional[bool] = None,
|
|
775
|
+
regex_on_full_url: Optional[bool] = None,
|
|
776
|
+
idempotency_key: Optional[str] = None,
|
|
777
|
+
**kwargs
|
|
778
|
+
) -> CrawlResponse:
|
|
295
779
|
"""
|
|
296
|
-
|
|
780
|
+
Start an asynchronous crawl job.
|
|
297
781
|
|
|
298
782
|
Args:
|
|
299
|
-
url (str):
|
|
300
|
-
|
|
301
|
-
|
|
783
|
+
url (str): Target URL to start crawling from
|
|
784
|
+
include_paths (Optional[List[str]]): Patterns of URLs to include
|
|
785
|
+
exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
|
786
|
+
max_depth (Optional[int]): Maximum crawl depth
|
|
787
|
+
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
788
|
+
limit (Optional[int]): Maximum pages to crawl
|
|
789
|
+
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
790
|
+
allow_external_links (Optional[bool]): Follow external domain links
|
|
791
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
792
|
+
scrape_options (Optional[CommonOptions]): Page scraping configuration
|
|
793
|
+
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
|
794
|
+
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
795
|
+
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
796
|
+
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
797
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
798
|
+
**kwargs: Additional parameters to pass to the API
|
|
302
799
|
|
|
303
800
|
Returns:
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
801
|
+
CrawlResponse with:
|
|
802
|
+
* success - Whether crawl started successfully
|
|
803
|
+
* id - Unique identifier for the crawl job
|
|
804
|
+
* url - Status check URL for the crawl
|
|
805
|
+
* error - Error message if start failed
|
|
806
|
+
|
|
807
|
+
Raises:
|
|
808
|
+
Exception: If crawl initiation fails
|
|
308
809
|
"""
|
|
309
|
-
|
|
810
|
+
crawl_params = {}
|
|
811
|
+
|
|
812
|
+
# Add individual parameters
|
|
813
|
+
if include_paths is not None:
|
|
814
|
+
crawl_params['includePaths'] = include_paths
|
|
815
|
+
if exclude_paths is not None:
|
|
816
|
+
crawl_params['excludePaths'] = exclude_paths
|
|
817
|
+
if max_depth is not None:
|
|
818
|
+
crawl_params['maxDepth'] = max_depth
|
|
819
|
+
if max_discovery_depth is not None:
|
|
820
|
+
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
821
|
+
if limit is not None:
|
|
822
|
+
crawl_params['limit'] = limit
|
|
823
|
+
if allow_backward_links is not None:
|
|
824
|
+
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
825
|
+
if allow_external_links is not None:
|
|
826
|
+
crawl_params['allowExternalLinks'] = allow_external_links
|
|
827
|
+
if ignore_sitemap is not None:
|
|
828
|
+
crawl_params['ignoreSitemap'] = ignore_sitemap
|
|
829
|
+
if scrape_options is not None:
|
|
830
|
+
crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
|
|
831
|
+
if webhook is not None:
|
|
832
|
+
crawl_params['webhook'] = webhook
|
|
833
|
+
if deduplicate_similar_urls is not None:
|
|
834
|
+
crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
|
|
835
|
+
if ignore_query_parameters is not None:
|
|
836
|
+
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
|
|
837
|
+
if regex_on_full_url is not None:
|
|
838
|
+
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
839
|
+
|
|
840
|
+
# Add any additional kwargs
|
|
841
|
+
crawl_params.update(kwargs)
|
|
842
|
+
|
|
843
|
+
# Create final params object
|
|
844
|
+
final_params = CrawlParams(**crawl_params)
|
|
845
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
846
|
+
params_dict['url'] = url
|
|
847
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
848
|
+
|
|
849
|
+
# Make request
|
|
310
850
|
headers = self._prepare_headers(idempotency_key)
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
json_data.update(params)
|
|
314
|
-
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
|
|
851
|
+
response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
|
|
852
|
+
|
|
315
853
|
if response.status_code == 200:
|
|
316
854
|
try:
|
|
317
|
-
return response.json()
|
|
855
|
+
return CrawlResponse(**response.json())
|
|
318
856
|
except:
|
|
319
857
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
320
858
|
else:
|
|
321
859
|
self._handle_error(response, 'start crawl job')
|
|
322
860
|
|
|
323
|
-
def check_crawl_status(self, id: str) ->
|
|
861
|
+
def check_crawl_status(self, id: str) -> CrawlStatusResponse:
|
|
324
862
|
"""
|
|
325
|
-
Check the status of a crawl job
|
|
863
|
+
Check the status and results of a crawl job.
|
|
326
864
|
|
|
327
865
|
Args:
|
|
328
|
-
id
|
|
866
|
+
id: Unique identifier for the crawl job
|
|
329
867
|
|
|
330
868
|
Returns:
|
|
331
|
-
|
|
869
|
+
CrawlStatusResponse containing:
|
|
870
|
+
|
|
871
|
+
Status Information:
|
|
872
|
+
* status - Current state (scraping/completed/failed/cancelled)
|
|
873
|
+
* completed - Number of pages crawled
|
|
874
|
+
* total - Total pages to crawl
|
|
875
|
+
* creditsUsed - API credits consumed
|
|
876
|
+
* expiresAt - Data expiration timestamp
|
|
877
|
+
|
|
878
|
+
Results:
|
|
879
|
+
* data - List of crawled documents
|
|
880
|
+
* next - URL for next page of results (if paginated)
|
|
881
|
+
* success - Whether status check succeeded
|
|
882
|
+
* error - Error message if failed
|
|
332
883
|
|
|
333
884
|
Raises:
|
|
334
|
-
Exception: If
|
|
885
|
+
Exception: If status check fails
|
|
335
886
|
"""
|
|
336
887
|
endpoint = f'/v1/crawl/{id}'
|
|
337
888
|
|
|
@@ -383,28 +934,37 @@ class FirecrawlApp:
|
|
|
383
934
|
if 'next' in status_data:
|
|
384
935
|
response['next'] = status_data['next']
|
|
385
936
|
|
|
386
|
-
return
|
|
387
|
-
|
|
937
|
+
return CrawlStatusResponse(
|
|
938
|
+
success=False if 'error' in status_data else True,
|
|
388
939
|
**response
|
|
389
|
-
|
|
940
|
+
)
|
|
390
941
|
else:
|
|
391
942
|
self._handle_error(response, 'check crawl status')
|
|
392
943
|
|
|
393
|
-
def check_crawl_errors(self, id: str) ->
|
|
944
|
+
def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
|
|
394
945
|
"""
|
|
395
946
|
Returns information about crawl errors.
|
|
396
947
|
|
|
397
948
|
Args:
|
|
398
|
-
id (str): The ID of the crawl job
|
|
949
|
+
id (str): The ID of the crawl job
|
|
399
950
|
|
|
400
951
|
Returns:
|
|
401
|
-
|
|
952
|
+
CrawlErrorsResponse containing:
|
|
953
|
+
* errors (List[Dict[str, str]]): List of errors with fields:
|
|
954
|
+
- id (str): Error ID
|
|
955
|
+
- timestamp (str): When the error occurred
|
|
956
|
+
- url (str): URL that caused the error
|
|
957
|
+
- error (str): Error message
|
|
958
|
+
* robotsBlocked (List[str]): List of URLs blocked by robots.txt
|
|
959
|
+
|
|
960
|
+
Raises:
|
|
961
|
+
Exception: If error check fails
|
|
402
962
|
"""
|
|
403
963
|
headers = self._prepare_headers()
|
|
404
964
|
response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
|
|
405
965
|
if response.status_code == 200:
|
|
406
966
|
try:
|
|
407
|
-
return response.json()
|
|
967
|
+
return CrawlErrorsResponse(**response.json())
|
|
408
968
|
except:
|
|
409
969
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
410
970
|
else:
|
|
@@ -412,13 +972,18 @@ class FirecrawlApp:
|
|
|
412
972
|
|
|
413
973
|
def cancel_crawl(self, id: str) -> Dict[str, Any]:
|
|
414
974
|
"""
|
|
415
|
-
Cancel an asynchronous crawl job
|
|
975
|
+
Cancel an asynchronous crawl job.
|
|
416
976
|
|
|
417
977
|
Args:
|
|
418
|
-
id (str): The ID of the crawl job to cancel
|
|
978
|
+
id (str): The ID of the crawl job to cancel
|
|
419
979
|
|
|
420
980
|
Returns:
|
|
421
|
-
Dict[str, Any]:
|
|
981
|
+
Dict[str, Any] containing:
|
|
982
|
+
* success (bool): Whether cancellation was successful
|
|
983
|
+
* error (str, optional): Error message if cancellation failed
|
|
984
|
+
|
|
985
|
+
Raises:
|
|
986
|
+
Exception: If cancellation fails
|
|
422
987
|
"""
|
|
423
988
|
headers = self._prepare_headers()
|
|
424
989
|
response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
|
|
@@ -430,154 +995,524 @@ class FirecrawlApp:
|
|
|
430
995
|
else:
|
|
431
996
|
self._handle_error(response, "cancel crawl job")
|
|
432
997
|
|
|
433
|
-
def crawl_url_and_watch(
|
|
998
|
+
def crawl_url_and_watch(
|
|
999
|
+
self,
|
|
1000
|
+
url: str,
|
|
1001
|
+
*,
|
|
1002
|
+
include_paths: Optional[List[str]] = None,
|
|
1003
|
+
exclude_paths: Optional[List[str]] = None,
|
|
1004
|
+
max_depth: Optional[int] = None,
|
|
1005
|
+
max_discovery_depth: Optional[int] = None,
|
|
1006
|
+
limit: Optional[int] = None,
|
|
1007
|
+
allow_backward_links: Optional[bool] = None,
|
|
1008
|
+
allow_external_links: Optional[bool] = None,
|
|
1009
|
+
ignore_sitemap: Optional[bool] = None,
|
|
1010
|
+
scrape_options: Optional[CommonOptions] = None,
|
|
1011
|
+
webhook: Optional[Union[str, WebhookConfig]] = None,
|
|
1012
|
+
deduplicate_similar_urls: Optional[bool] = None,
|
|
1013
|
+
ignore_query_parameters: Optional[bool] = None,
|
|
1014
|
+
regex_on_full_url: Optional[bool] = None,
|
|
1015
|
+
idempotency_key: Optional[str] = None,
|
|
1016
|
+
**kwargs
|
|
1017
|
+
) -> 'CrawlWatcher':
|
|
434
1018
|
"""
|
|
435
1019
|
Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
|
|
436
1020
|
|
|
437
1021
|
Args:
|
|
438
|
-
url (str):
|
|
439
|
-
|
|
440
|
-
|
|
1022
|
+
url (str): Target URL to start crawling from
|
|
1023
|
+
include_paths (Optional[List[str]]): Patterns of URLs to include
|
|
1024
|
+
exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
|
1025
|
+
max_depth (Optional[int]): Maximum crawl depth
|
|
1026
|
+
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
1027
|
+
limit (Optional[int]): Maximum pages to crawl
|
|
1028
|
+
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
1029
|
+
allow_external_links (Optional[bool]): Follow external domain links
|
|
1030
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
1031
|
+
scrape_options (Optional[CommonOptions]): Page scraping configuration
|
|
1032
|
+
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
|
1033
|
+
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
1034
|
+
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
1035
|
+
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
1036
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1037
|
+
**kwargs: Additional parameters to pass to the API
|
|
441
1038
|
|
|
442
1039
|
Returns:
|
|
443
|
-
CrawlWatcher: An instance
|
|
1040
|
+
CrawlWatcher: An instance to monitor the crawl job via WebSocket
|
|
1041
|
+
|
|
1042
|
+
Raises:
|
|
1043
|
+
Exception: If crawl job fails to start
|
|
444
1044
|
"""
|
|
445
|
-
crawl_response = self.async_crawl_url(
|
|
446
|
-
|
|
447
|
-
|
|
1045
|
+
crawl_response = self.async_crawl_url(
|
|
1046
|
+
url,
|
|
1047
|
+
include_paths=include_paths,
|
|
1048
|
+
exclude_paths=exclude_paths,
|
|
1049
|
+
max_depth=max_depth,
|
|
1050
|
+
max_discovery_depth=max_discovery_depth,
|
|
1051
|
+
limit=limit,
|
|
1052
|
+
allow_backward_links=allow_backward_links,
|
|
1053
|
+
allow_external_links=allow_external_links,
|
|
1054
|
+
ignore_sitemap=ignore_sitemap,
|
|
1055
|
+
scrape_options=scrape_options,
|
|
1056
|
+
webhook=webhook,
|
|
1057
|
+
deduplicate_similar_urls=deduplicate_similar_urls,
|
|
1058
|
+
ignore_query_parameters=ignore_query_parameters,
|
|
1059
|
+
regex_on_full_url=regex_on_full_url,
|
|
1060
|
+
idempotency_key=idempotency_key,
|
|
1061
|
+
**kwargs
|
|
1062
|
+
)
|
|
1063
|
+
if crawl_response.success and crawl_response.id:
|
|
1064
|
+
return CrawlWatcher(crawl_response.id, self)
|
|
448
1065
|
else:
|
|
449
1066
|
raise Exception("Crawl job failed to start")
|
|
450
1067
|
|
|
451
|
-
def map_url(
|
|
1068
|
+
def map_url(
|
|
1069
|
+
self,
|
|
1070
|
+
url: str,
|
|
1071
|
+
*,
|
|
1072
|
+
search: Optional[str] = None,
|
|
1073
|
+
ignore_sitemap: Optional[bool] = None,
|
|
1074
|
+
include_subdomains: Optional[bool] = None,
|
|
1075
|
+
sitemap_only: Optional[bool] = None,
|
|
1076
|
+
limit: Optional[int] = None,
|
|
1077
|
+
timeout: Optional[int] = None,
|
|
1078
|
+
params: Optional[MapParams] = None) -> MapResponse:
|
|
452
1079
|
"""
|
|
453
|
-
|
|
1080
|
+
Map and discover links from a URL.
|
|
454
1081
|
|
|
455
1082
|
Args:
|
|
456
|
-
url (str):
|
|
457
|
-
|
|
1083
|
+
url (str): Target URL to map
|
|
1084
|
+
search (Optional[str]): Filter pattern for URLs
|
|
1085
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
1086
|
+
include_subdomains (Optional[bool]): Include subdomain links
|
|
1087
|
+
sitemap_only (Optional[bool]): Only use sitemap.xml
|
|
1088
|
+
limit (Optional[int]): Maximum URLs to return
|
|
1089
|
+
timeout (Optional[int]): Request timeout in milliseconds
|
|
1090
|
+
params (Optional[MapParams]): Additional mapping parameters
|
|
458
1091
|
|
|
459
1092
|
Returns:
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
1093
|
+
MapResponse: Response containing:
|
|
1094
|
+
* success (bool): Whether request succeeded
|
|
1095
|
+
* links (List[str]): Discovered URLs
|
|
1096
|
+
* error (Optional[str]): Error message if any
|
|
464
1097
|
|
|
465
|
-
|
|
466
|
-
|
|
1098
|
+
Raises:
|
|
1099
|
+
Exception: If mapping fails or response cannot be parsed
|
|
1100
|
+
"""
|
|
1101
|
+
# Build map parameters
|
|
1102
|
+
map_params = {}
|
|
467
1103
|
if params:
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
#
|
|
1104
|
+
map_params.update(params.dict(exclude_none=True))
|
|
1105
|
+
|
|
1106
|
+
# Add individual parameters
|
|
1107
|
+
if search is not None:
|
|
1108
|
+
map_params['search'] = search
|
|
1109
|
+
if ignore_sitemap is not None:
|
|
1110
|
+
map_params['ignoreSitemap'] = ignore_sitemap
|
|
1111
|
+
if include_subdomains is not None:
|
|
1112
|
+
map_params['includeSubdomains'] = include_subdomains
|
|
1113
|
+
if sitemap_only is not None:
|
|
1114
|
+
map_params['sitemapOnly'] = sitemap_only
|
|
1115
|
+
if limit is not None:
|
|
1116
|
+
map_params['limit'] = limit
|
|
1117
|
+
if timeout is not None:
|
|
1118
|
+
map_params['timeout'] = timeout
|
|
1119
|
+
|
|
1120
|
+
# Create final params object
|
|
1121
|
+
final_params = MapParams(**map_params)
|
|
1122
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
1123
|
+
params_dict['url'] = url
|
|
1124
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
1125
|
+
|
|
1126
|
+
# Make request
|
|
471
1127
|
response = requests.post(
|
|
472
|
-
f
|
|
473
|
-
headers=
|
|
474
|
-
json=
|
|
1128
|
+
f"{self.api_url}/v1/map",
|
|
1129
|
+
headers={"Authorization": f"Bearer {self.api_key}"},
|
|
1130
|
+
json=params_dict
|
|
475
1131
|
)
|
|
1132
|
+
|
|
476
1133
|
if response.status_code == 200:
|
|
477
1134
|
try:
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
raise Exception(
|
|
1135
|
+
response_json = response.json()
|
|
1136
|
+
if response_json.get('success') and 'links' in response_json:
|
|
1137
|
+
return MapResponse(**response_json)
|
|
1138
|
+
elif "error" in response_json:
|
|
1139
|
+
raise Exception(f'Map failed. Error: {response_json["error"]}')
|
|
1140
|
+
else:
|
|
1141
|
+
raise Exception(f'Map failed. Error: {response_json}')
|
|
1142
|
+
except ValueError:
|
|
1143
|
+
raise Exception('Failed to parse Firecrawl response as JSON.')
|
|
487
1144
|
else:
|
|
488
1145
|
self._handle_error(response, 'map')
|
|
489
1146
|
|
|
490
|
-
def batch_scrape_urls(
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
1147
|
+
def batch_scrape_urls(
|
|
1148
|
+
self,
|
|
1149
|
+
urls: List[str],
|
|
1150
|
+
*,
|
|
1151
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
1152
|
+
headers: Optional[Dict[str, str]] = None,
|
|
1153
|
+
include_tags: Optional[List[str]] = None,
|
|
1154
|
+
exclude_tags: Optional[List[str]] = None,
|
|
1155
|
+
only_main_content: Optional[bool] = None,
|
|
1156
|
+
wait_for: Optional[int] = None,
|
|
1157
|
+
timeout: Optional[int] = None,
|
|
1158
|
+
location: Optional[LocationConfig] = None,
|
|
1159
|
+
mobile: Optional[bool] = None,
|
|
1160
|
+
skip_tls_verification: Optional[bool] = None,
|
|
1161
|
+
remove_base64_images: Optional[bool] = None,
|
|
1162
|
+
block_ads: Optional[bool] = None,
|
|
1163
|
+
+ proxy: Optional[Literal["basic", "stealth"]] = None,
+ extract: Optional[ExtractConfig] = None,
+ json_options: Optional[ExtractConfig] = None,
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+ agent: Optional[AgentOptions] = None,
+ poll_interval: Optional[int] = 2,
+ idempotency_key: Optional[str] = None,
+ **kwargs
+ ) -> BatchScrapeStatusResponse:
  """
-
+ Batch scrape multiple URLs and monitor until completion.

  Args:
- urls (List[str]):
-
-
-
+ urls (List[str]): URLs to scrape
+ formats (Optional[List[Literal]]): Content formats to retrieve
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
+ include_tags (Optional[List[str]]): HTML tags to include
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
+ only_main_content (Optional[bool]): Extract main content only
+ wait_for (Optional[int]): Wait time in milliseconds
+ timeout (Optional[int]): Request timeout in milliseconds
+ location (Optional[LocationConfig]): Location configuration
+ mobile (Optional[bool]): Use mobile user agent
+ skip_tls_verification (Optional[bool]): Skip TLS verification
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
+ block_ads (Optional[bool]): Block advertisements
+ proxy (Optional[Literal]): Proxy type to use
+ extract (Optional[ExtractConfig]): Content extraction config
+ json_options (Optional[ExtractConfig]): JSON extraction config
+ actions (Optional[List[Union]]): Actions to perform
+ agent (Optional[AgentOptions]): Agent configuration
+ poll_interval (Optional[int]): Seconds between status checks (default: 2)
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+ **kwargs: Additional parameters to pass to the API

  Returns:
-
-
-
-
- - 'total' (int): Total number of scraped pages.
- - 'creditsUsed' (int): Estimated number of API credits used for this batch scrape.
- - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires.
- - 'data' (List[Dict]): List of all the scraped pages.
+ BatchScrapeStatusResponse with:
+ * Scraping status and progress
+ * Scraped content for each URL
+ * Success/error information

  Raises:
- Exception: If
+ Exception: If batch scrape fails
  """
-
+ scrape_params = {}
+
+ # Add individual parameters
+ if formats is not None:
+ scrape_params['formats'] = formats
+ if headers is not None:
+ scrape_params['headers'] = headers
+ if include_tags is not None:
+ scrape_params['includeTags'] = include_tags
+ if exclude_tags is not None:
+ scrape_params['excludeTags'] = exclude_tags
+ if only_main_content is not None:
+ scrape_params['onlyMainContent'] = only_main_content
+ if wait_for is not None:
+ scrape_params['waitFor'] = wait_for
+ if timeout is not None:
+ scrape_params['timeout'] = timeout
+ if location is not None:
+ scrape_params['location'] = location.dict(exclude_none=True)
+ if mobile is not None:
+ scrape_params['mobile'] = mobile
+ if skip_tls_verification is not None:
+ scrape_params['skipTlsVerification'] = skip_tls_verification
+ if remove_base64_images is not None:
+ scrape_params['removeBase64Images'] = remove_base64_images
+ if block_ads is not None:
+ scrape_params['blockAds'] = block_ads
+ if proxy is not None:
+ scrape_params['proxy'] = proxy
+ if extract is not None:
+ if hasattr(extract.schema, 'schema'):
+ extract.schema = extract.schema.schema()
+ scrape_params['extract'] = extract.dict(exclude_none=True)
+ if json_options is not None:
+ if hasattr(json_options.schema, 'schema'):
+ json_options.schema = json_options.schema.schema()
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+ if actions is not None:
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+ if agent is not None:
+ scrape_params['agent'] = agent.dict(exclude_none=True)
+
+ # Add any additional kwargs
+ scrape_params.update(kwargs)
+
+ # Create final params object
+ final_params = ScrapeParams(**scrape_params)
+ params_dict = final_params.dict(exclude_none=True)
+ params_dict['urls'] = urls
+ params_dict['origin'] = f"python-sdk@{version}"
+
+ # Make request
  headers = self._prepare_headers(idempotency_key)
-
-
- json_data.update(params)
- response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
+
  if response.status_code == 200:
  try:
  id = response.json().get('id')
  except:
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
  return self._monitor_job_status(id, headers, poll_interval)
-
  else:
  self._handle_error(response, 'start batch scrape job')

-
-
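The rewritten `batch_scrape_urls` above takes keyword-only options instead of a params dict and blocks until the job completes. A minimal usage sketch under that signature; the API key and URLs are placeholders, and the top-level `from firecrawl import FirecrawlApp` import is assumed:

    from firecrawl import FirecrawlApp

    # Placeholder credentials and URLs; the call polls until the batch job finishes.
    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
    job = app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown", "html"],
        only_main_content=True,
        poll_interval=2,
    )
    # BatchScrapeStatusResponse fields follow the keys built in the diff above.
    print(job.status, job.completed, job.total)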
+ def async_batch_scrape_urls(
+ self,
+ urls: List[str],
+ *,
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+ headers: Optional[Dict[str, str]] = None,
+ include_tags: Optional[List[str]] = None,
+ exclude_tags: Optional[List[str]] = None,
+ only_main_content: Optional[bool] = None,
+ wait_for: Optional[int] = None,
+ timeout: Optional[int] = None,
+ location: Optional[LocationConfig] = None,
+ mobile: Optional[bool] = None,
+ skip_tls_verification: Optional[bool] = None,
+ remove_base64_images: Optional[bool] = None,
+ block_ads: Optional[bool] = None,
+ proxy: Optional[Literal["basic", "stealth"]] = None,
+ extract: Optional[ExtractConfig] = None,
+ json_options: Optional[ExtractConfig] = None,
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+ agent: Optional[AgentOptions] = None,
+ idempotency_key: Optional[str] = None,
+ **kwargs
+ ) -> BatchScrapeResponse:
  """
- Initiate a
+ Initiate a batch scrape job asynchronously.

  Args:
- urls (List[str]):
-
-
+ urls (List[str]): URLs to scrape
+ formats (Optional[List[Literal]]): Content formats to retrieve
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
+ include_tags (Optional[List[str]]): HTML tags to include
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
+ only_main_content (Optional[bool]): Extract main content only
+ wait_for (Optional[int]): Wait time in milliseconds
+ timeout (Optional[int]): Request timeout in milliseconds
+ location (Optional[LocationConfig]): Location configuration
+ mobile (Optional[bool]): Use mobile user agent
+ skip_tls_verification (Optional[bool]): Skip TLS verification
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
+ block_ads (Optional[bool]): Block advertisements
+ proxy (Optional[Literal]): Proxy type to use
+ extract (Optional[ExtractConfig]): Content extraction config
+ json_options (Optional[ExtractConfig]): JSON extraction config
+ actions (Optional[List[Union]]): Actions to perform
+ agent (Optional[AgentOptions]): Agent configuration
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+ **kwargs: Additional parameters to pass to the API

  Returns:
-
-
-
-
+ BatchScrapeResponse with:
+ * success - Whether job started successfully
+ * id - Unique identifier for the job
+ * url - Status check URL
+ * error - Error message if start failed
+
+ Raises:
+ Exception: If job initiation fails
  """
-
+ scrape_params = {}
+
+ # Add individual parameters
+ if formats is not None:
+ scrape_params['formats'] = formats
+ if headers is not None:
+ scrape_params['headers'] = headers
+ if include_tags is not None:
+ scrape_params['includeTags'] = include_tags
+ if exclude_tags is not None:
+ scrape_params['excludeTags'] = exclude_tags
+ if only_main_content is not None:
+ scrape_params['onlyMainContent'] = only_main_content
+ if wait_for is not None:
+ scrape_params['waitFor'] = wait_for
+ if timeout is not None:
+ scrape_params['timeout'] = timeout
+ if location is not None:
+ scrape_params['location'] = location.dict(exclude_none=True)
+ if mobile is not None:
+ scrape_params['mobile'] = mobile
+ if skip_tls_verification is not None:
+ scrape_params['skipTlsVerification'] = skip_tls_verification
+ if remove_base64_images is not None:
+ scrape_params['removeBase64Images'] = remove_base64_images
+ if block_ads is not None:
+ scrape_params['blockAds'] = block_ads
+ if proxy is not None:
+ scrape_params['proxy'] = proxy
+ if extract is not None:
+ if hasattr(extract.schema, 'schema'):
+ extract.schema = extract.schema.schema()
+ scrape_params['extract'] = extract.dict(exclude_none=True)
+ if json_options is not None:
+ if hasattr(json_options.schema, 'schema'):
+ json_options.schema = json_options.schema.schema()
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+ if actions is not None:
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+ if agent is not None:
+ scrape_params['agent'] = agent.dict(exclude_none=True)
+
+ # Add any additional kwargs
+ scrape_params.update(kwargs)
+
+ # Create final params object
+ final_params = ScrapeParams(**scrape_params)
+ params_dict = final_params.dict(exclude_none=True)
+ params_dict['urls'] = urls
+ params_dict['origin'] = f"python-sdk@{version}"
+
+ # Make request
  headers = self._prepare_headers(idempotency_key)
-
-
- json_data.update(params)
- response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
+
  if response.status_code == 200:
  try:
- return response.json()
+ return BatchScrapeResponse(**response.json())
  except:
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
  else:
  self._handle_error(response, 'start batch scrape job')

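`async_batch_scrape_urls` only submits the job and returns a `BatchScrapeResponse` with the job id and status URL rather than polling. A sketch of starting a job and keeping the id for later status checks (same placeholder key and URLs as above):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
    # Submits the batch job and returns immediately.
    started = app.async_batch_scrape_urls(
        ["https://example.com/a", "https://example.com/b"],
        formats=["markdown"],
    )
    if started.success:
        print("job id:", started.id, "status url:", started.url)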
- def batch_scrape_urls_and_watch(
+ def batch_scrape_urls_and_watch(
+ self,
+ urls: List[str],
+ *,
+ formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+ headers: Optional[Dict[str, str]] = None,
+ include_tags: Optional[List[str]] = None,
+ exclude_tags: Optional[List[str]] = None,
+ only_main_content: Optional[bool] = None,
+ wait_for: Optional[int] = None,
+ timeout: Optional[int] = None,
+ location: Optional[LocationConfig] = None,
+ mobile: Optional[bool] = None,
+ skip_tls_verification: Optional[bool] = None,
+ remove_base64_images: Optional[bool] = None,
+ block_ads: Optional[bool] = None,
+ proxy: Optional[Literal["basic", "stealth"]] = None,
+ extract: Optional[ExtractConfig] = None,
+ json_options: Optional[ExtractConfig] = None,
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+ agent: Optional[AgentOptions] = None,
+ idempotency_key: Optional[str] = None,
+ **kwargs
+ ) -> 'CrawlWatcher':
  """
  Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.

  Args:
- urls (List[str]):
-
-
+ urls (List[str]): URLs to scrape
+ formats (Optional[List[Literal]]): Content formats to retrieve
+ headers (Optional[Dict[str, str]]): Custom HTTP headers
+ include_tags (Optional[List[str]]): HTML tags to include
+ exclude_tags (Optional[List[str]]): HTML tags to exclude
+ only_main_content (Optional[bool]): Extract main content only
+ wait_for (Optional[int]): Wait time in milliseconds
+ timeout (Optional[int]): Request timeout in milliseconds
+ location (Optional[LocationConfig]): Location configuration
+ mobile (Optional[bool]): Use mobile user agent
+ skip_tls_verification (Optional[bool]): Skip TLS verification
+ remove_base64_images (Optional[bool]): Remove base64 encoded images
+ block_ads (Optional[bool]): Block advertisements
+ proxy (Optional[Literal]): Proxy type to use
+ extract (Optional[ExtractConfig]): Content extraction config
+ json_options (Optional[ExtractConfig]): JSON extraction config
+ actions (Optional[List[Union]]): Actions to perform
+ agent (Optional[AgentOptions]): Agent configuration
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+ **kwargs: Additional parameters to pass to the API

  Returns:
- CrawlWatcher: An instance
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
+
+ Raises:
+ Exception: If batch scrape job fails to start
  """
-
-
-
+ scrape_params = {}
+
+ # Add individual parameters
+ if formats is not None:
+ scrape_params['formats'] = formats
+ if headers is not None:
+ scrape_params['headers'] = headers
+ if include_tags is not None:
+ scrape_params['includeTags'] = include_tags
+ if exclude_tags is not None:
+ scrape_params['excludeTags'] = exclude_tags
+ if only_main_content is not None:
+ scrape_params['onlyMainContent'] = only_main_content
+ if wait_for is not None:
+ scrape_params['waitFor'] = wait_for
+ if timeout is not None:
+ scrape_params['timeout'] = timeout
+ if location is not None:
+ scrape_params['location'] = location.dict(exclude_none=True)
+ if mobile is not None:
+ scrape_params['mobile'] = mobile
+ if skip_tls_verification is not None:
+ scrape_params['skipTlsVerification'] = skip_tls_verification
+ if remove_base64_images is not None:
+ scrape_params['removeBase64Images'] = remove_base64_images
+ if block_ads is not None:
+ scrape_params['blockAds'] = block_ads
+ if proxy is not None:
+ scrape_params['proxy'] = proxy
+ if extract is not None:
+ if hasattr(extract.schema, 'schema'):
+ extract.schema = extract.schema.schema()
+ scrape_params['extract'] = extract.dict(exclude_none=True)
+ if json_options is not None:
+ if hasattr(json_options.schema, 'schema'):
+ json_options.schema = json_options.schema.schema()
+ scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+ if actions is not None:
+ scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+ if agent is not None:
+ scrape_params['agent'] = agent.dict(exclude_none=True)
+
+ # Add any additional kwargs
+ scrape_params.update(kwargs)
+
+ # Create final params object
+ final_params = ScrapeParams(**scrape_params)
+ params_dict = final_params.dict(exclude_none=True)
+ params_dict['urls'] = urls
+ params_dict['origin'] = f"python-sdk@{version}"
+
+ # Make request
+ headers = self._prepare_headers(idempotency_key)
+ response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
+
+ if response.status_code == 200:
+ try:
+ crawl_response = BatchScrapeResponse(**response.json())
+ if crawl_response.success and crawl_response.id:
+ return CrawlWatcher(crawl_response.id, self)
+ else:
+ raise Exception("Batch scrape job failed to start")
+ except:
+ raise Exception(f'Failed to parse Firecrawl response as JSON.')
  else:
-
+ self._handle_error(response, 'start batch scrape job')

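`batch_scrape_urls_and_watch` starts the job and hands back a `CrawlWatcher`; its `connect()` coroutine then streams events over WebSocket. A sketch, assuming an asyncio event loop drives the watcher (placeholder key and URL):

    import asyncio
    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
    watcher = app.batch_scrape_urls_and_watch(
        ["https://example.com"],
        formats=["markdown"],
    )
    # 'document' fires once per scraped page, 'done' once when the job completes.
    watcher.add_event_listener("document", lambda detail: print("scraped:", detail["id"]))
    watcher.add_event_listener("done", lambda detail: print("documents:", len(detail["data"])))
    asyncio.run(watcher.connect())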
- def check_batch_scrape_status(self, id: str) ->
+ def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
  """
  Check the status of a batch scrape job using the Firecrawl API.

@@ -585,7 +1520,7 @@ class FirecrawlApp:
  id (str): The ID of the batch scrape job.

  Returns:
-
+ BatchScrapeStatusResponse: The status of the batch scrape job.

  Raises:
  Exception: If the status check request fails.
@@ -625,29 +1560,21 @@ class FirecrawlApp:
  break
  status_data['data'] = data

-
+ return BatchScrapeStatusResponse(**{
+ 'success': False if 'error' in status_data else True,
  'status': status_data.get('status'),
  'total': status_data.get('total'),
  'completed': status_data.get('completed'),
  'creditsUsed': status_data.get('creditsUsed'),
  'expiresAt': status_data.get('expiresAt'),
- 'data': status_data.get('data')
-
-
-
- response['error'] = status_data['error']
-
- if 'next' in status_data:
- response['next'] = status_data['next']
-
- return {
- 'success': False if 'error' in status_data else True,
- **response
- }
+ 'data': status_data.get('data'),
+ 'next': status_data.get('next'),
+ 'error': status_data.get('error')
+ })
  else:
  self._handle_error(response, 'check batch scrape status')

- def check_batch_scrape_errors(self, id: str) ->
+ def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
  """
  Returns information about batch scrape errors.

@@ -655,38 +1582,68 @@ class FirecrawlApp:
  id (str): The ID of the crawl job.

  Returns:
-
+ CrawlErrorsResponse: A response containing:
+ * errors (List[Dict[str, str]]): List of errors with fields:
+ * id (str): Error ID
+ * timestamp (str): When the error occurred
+ * url (str): URL that caused the error
+ * error (str): Error message
+ * robotsBlocked (List[str]): List of URLs blocked by robots.txt
+
+ Raises:
+ Exception: If the error check request fails
  """
  headers = self._prepare_headers()
  response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
  if response.status_code == 200:
  try:
- return response.json()
+ return CrawlErrorsResponse(**response.json())
  except:
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
  else:
  self._handle_error(response, "check batch scrape errors")

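Both lookups now return typed models (`BatchScrapeStatusResponse`, `CrawlErrorsResponse`) instead of raw dicts. A polling sketch against a job id obtained from `async_batch_scrape_urls`; the attribute names mirror the keys used in the constructors above and should be treated as assumptions about the model fields:

    import time
    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
    started = app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])

    # Poll the typed status model until the job leaves the 'scraping' state.
    status = app.check_batch_scrape_status(started.id)
    while status.status == "scraping":
        time.sleep(2)
        status = app.check_batch_scrape_status(started.id)

    errors = app.check_batch_scrape_errors(started.id)
    print(status.status, "errors:", len(errors.errors))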
- def extract(
+ def extract(
+ self,
+ urls: Optional[List[str]] = None,
+ *,
+ prompt: Optional[str] = None,
+ schema: Optional[Any] = None,
+ system_prompt: Optional[str] = None,
+ allow_external_links: Optional[bool] = False,
+ enable_web_search: Optional[bool] = False,
+ show_sources: Optional[bool] = False,
+ agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
  """
-
+ Extract structured information from URLs.

  Args:
- urls (Optional[List[str]]):
-
+ urls (Optional[List[str]]): URLs to extract from
+ prompt (Optional[str]): Custom extraction prompt
+ schema (Optional[Any]): JSON schema/Pydantic model
+ system_prompt (Optional[str]): System context
+ allow_external_links (Optional[bool]): Follow external links
+ enable_web_search (Optional[bool]): Enable web search
+ show_sources (Optional[bool]): Include source URLs
+ agent (Optional[Dict[str, Any]]): Agent configuration

  Returns:
-
+ ExtractResponse[Any] with:
+ * success (bool): Whether request succeeded
+ * data (Optional[Any]): Extracted data matching schema
+ * error (Optional[str]): Error message if any
+
+ Raises:
+ ValueError: If prompt/schema missing or extraction fails
  """
  headers = self._prepare_headers()

- if not
+ if not prompt and not schema:
  raise ValueError("Either prompt or schema is required")

- if not urls and not
+ if not urls and not prompt:
  raise ValueError("Either urls or prompt is required")

- schema = params.get('schema')
  if schema:
  if hasattr(schema, 'model_json_schema'):
  # Convert Pydantic model to JSON schema
@@ -694,26 +1651,22 @@ class FirecrawlApp:
  # Otherwise assume it's already a JSON schema dict

  request_data = {
- 'urls': urls,
- 'allowExternalLinks':
- 'enableWebSearch':
- 'showSources':
+ 'urls': urls or [],
+ 'allowExternalLinks': allow_external_links,
+ 'enableWebSearch': enable_web_search,
+ 'showSources': show_sources,
  'schema': schema,
- 'origin': '
+ 'origin': f'python-sdk@{get_version()}'
  }

- if not request_data['urls']:
- request_data['urls'] = []
  # Only add prompt and systemPrompt if they exist
- if
- request_data['prompt'] =
- if
- request_data['systemPrompt'] =
- elif params.get('systemPrompt'): # Check legacy field name
- request_data['systemPrompt'] = params['systemPrompt']
+ if prompt:
+ request_data['prompt'] = prompt
+ if system_prompt:
+ request_data['systemPrompt'] = system_prompt

- if
- request_data['agent'] =
+ if agent:
+ request_data['agent'] = agent

  try:
  # Send the initial extract request
@@ -744,10 +1697,7 @@ class FirecrawlApp:
  except:
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
  if status_data['status'] == 'completed':
-
- return status_data
- else:
- raise Exception(f'Failed to extract. Error: {status_data["error"]}')
+ return ExtractResponse(**status_data)
  elif status_data['status'] in ['failed', 'cancelled']:
  raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
  else:
@@ -761,9 +1711,9 @@ class FirecrawlApp:
  except Exception as e:
  raise ValueError(str(e), 500)

- return
+ return ExtractResponse(success=False, error="Internal server error.")

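`extract` now takes the prompt, schema, and flags as keyword arguments, and accepts either a JSON-schema dict or a Pydantic model class (the code above calls `model_json_schema()` when it is available). A sketch with a hypothetical `Article` model and placeholder key and URL:

    from pydantic import BaseModel
    from firecrawl import FirecrawlApp

    class Article(BaseModel):
        # Hypothetical target shape for the extracted data.
        title: str
        author: str

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
    result = app.extract(
        ["https://example.com/blog/post"],
        prompt="Extract the article title and author.",
        schema=Article,
    )
    if result.success:
        print(result.data)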
- def get_extract_status(self, job_id: str) ->
+ def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
  """
  Retrieve the status of an extract job.

@@ -771,7 +1721,7 @@ class FirecrawlApp:
  job_id (str): The ID of the extract job.

  Returns:
-
+ ExtractResponse[Any]: The status of the extract job.

  Raises:
  ValueError: If there is an error retrieving the status.
@@ -781,7 +1731,7 @@ class FirecrawlApp:
  response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
  if response.status_code == 200:
  try:
- return response.json()
+ return ExtractResponse(**response.json())
  except:
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
  else:
@@ -789,43 +1739,71 @@ class FirecrawlApp:
  except Exception as e:
  raise ValueError(str(e), 500)

- def async_extract(
+ def async_extract(
+ self,
+ urls: List[str],
+ *,
+ prompt: Optional[str] = None,
+ schema: Optional[Any] = None,
+ system_prompt: Optional[str] = None,
+ allow_external_links: Optional[bool] = False,
+ enable_web_search: Optional[bool] = False,
+ show_sources: Optional[bool] = False,
+ agent: Optional[Dict[str, Any]] = None,
+ idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
  """
  Initiate an asynchronous extract job.

  Args:
- urls (List[str]):
-
-
+ urls (List[str]): URLs to extract information from
+ prompt (Optional[str]): Custom extraction prompt
+ schema (Optional[Any]): JSON schema/Pydantic model
+ system_prompt (Optional[str]): System context
+ allow_external_links (Optional[bool]): Follow external links
+ enable_web_search (Optional[bool]): Enable web search
+ show_sources (Optional[bool]): Include source URLs
+ agent (Optional[Dict[str, Any]]): Agent configuration
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests

  Returns:
-
+ ExtractResponse[Any] with:
+ * success (bool): Whether request succeeded
+ * data (Optional[Any]): Extracted data matching schema
+ * error (Optional[str]): Error message if any

  Raises:
- ValueError: If
+ ValueError: If job initiation fails
  """
  headers = self._prepare_headers(idempotency_key)

- schema =
+ schema = schema
  if schema:
  if hasattr(schema, 'model_json_schema'):
  # Convert Pydantic model to JSON schema
  schema = schema.model_json_schema()
  # Otherwise assume it's already a JSON schema dict

- jsonData = {'urls': urls, **(params or {})}
  request_data = {
-
- 'allowExternalLinks':
+ 'urls': urls,
+ 'allowExternalLinks': allow_external_links,
+ 'enableWebSearch': enable_web_search,
+ 'showSources': show_sources,
  'schema': schema,
- 'origin': '
+ 'origin': f'python-sdk@{version}'
  }

+ if prompt:
+ request_data['prompt'] = prompt
+ if system_prompt:
+ request_data['systemPrompt'] = system_prompt
+ if agent:
+ request_data['agent'] = agent
+
  try:
  response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
  if response.status_code == 200:
  try:
- return response.json()
+ return ExtractResponse(**response.json())
  except:
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
  else:
@@ -833,34 +1811,44 @@ class FirecrawlApp:
  except Exception as e:
  raise ValueError(str(e), 500)

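`async_extract` submits the job and `get_extract_status` fetches it later, both returning `ExtractResponse` models. A polling sketch; the `id` field on the start response and the "processing" status value are not shown in this hunk, so both are assumptions:

    import time
    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
    job = app.async_extract(
        ["https://example.com"],
        prompt="Summarise the page in one sentence.",
    )

    # Assumption: the start response exposes the job id as `id`.
    status = app.get_extract_status(job.id)
    while getattr(status, "status", None) == "processing":  # assumed interim status value
        time.sleep(2)
        status = app.get_extract_status(job.id)
    print(status.data)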
- def generate_llms_text(
+ def generate_llms_text(
+ self,
+ url: str,
+ *,
+ max_urls: Optional[int] = None,
+ show_full_text: Optional[bool] = None,
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
  """
  Generate LLMs.txt for a given URL and poll until completion.

  Args:
- url (str):
-
+ url (str): Target URL to generate LLMs.txt from
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
+ show_full_text (Optional[bool]): Include full text in output (default: False)
+ experimental_stream (Optional[bool]): Enable experimental streaming

  Returns:
-
-
-
-
-
- - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires.
+ GenerateLLMsTextStatusResponse with:
+ * Generated LLMs.txt content
+ * Full version if requested
+ * Generation status
+ * Success/error information

  Raises:
- Exception: If
+ Exception: If generation fails
  """
-
-
-
-
-
- else:
- generation_params = params
+ params = GenerateLLMsTextParams(
+ maxUrls=max_urls,
+ showFullText=show_full_text,
+ __experimental_stream=experimental_stream
+ )

- response = self.async_generate_llms_text(
+ response = self.async_generate_llms_text(
+ url,
+ max_urls=max_urls,
+ show_full_text=show_full_text,
+ experimental_stream=experimental_stream
+ )
  if not response.get('success') or 'id' not in response:
  return response

@@ -879,32 +1867,40 @@ class FirecrawlApp:

  return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}

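`generate_llms_text` wraps the `/v1/llmstxt` job and polls it to completion, so a single call is enough. A sketch with a placeholder key and URL:

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
    # Polls the LLMs.txt generation job until it completes or fails.
    result = app.generate_llms_text(
        "https://example.com",
        max_urls=10,
        show_full_text=True,
    )
    print(result)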
- def async_generate_llms_text(
+ def async_generate_llms_text(
+ self,
+ url: str,
+ *,
+ max_urls: Optional[int] = None,
+ show_full_text: Optional[bool] = None,
+ experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
  """
  Initiate an asynchronous LLMs.txt generation operation.

  Args:
- url (str): The URL to generate LLMs.txt from.
-
+ url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
+ max_urls (Optional[int]): Maximum URLs to process (default: 10)
+ show_full_text (Optional[bool]): Include full text in output (default: False)
+ experimental_stream (Optional[bool]): Enable experimental streaming

  Returns:
-
-
-
+ GenerateLLMsTextResponse: A response containing:
+ * success (bool): Whether the generation initiation was successful
+ * id (str): The unique identifier for the generation job
+ * error (str, optional): Error message if initiation failed

  Raises:
  Exception: If the generation job initiation fails.
  """
-
-
-
-
-
- else:
- generation_params = params
+ params = GenerateLLMsTextParams(
+ maxUrls=max_urls,
+ showFullText=show_full_text,
+ __experimental_stream=experimental_stream
+ )

  headers = self._prepare_headers()
- json_data = {'url': url, **
+ json_data = {'url': url, **params.dict(exclude_none=True)}
+ json_data['origin'] = f"python-sdk@{version}"

  try:
  response = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
@@ -920,15 +1916,22 @@ class FirecrawlApp:

  return {'success': False, 'error': 'Internal server error'}

- def check_generate_llms_text_status(self, id: str) ->
+ def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
  """
  Check the status of a LLMs.txt generation operation.

  Args:
- id (str): The
+ id (str): The unique identifier of the LLMs.txt generation job to check status for.

  Returns:
-
+ GenerateLLMsTextStatusResponse: A response containing:
+ * success (bool): Whether the generation was successful
+ * status (str): Status of generation ("processing", "completed", "failed")
+ * data (Dict[str, str], optional): Generated text with fields:
+ * llmstxt (str): Generated LLMs.txt content
+ * llmsfulltxt (str, optional): Full version if requested
+ * error (str, optional): Error message if generation failed
+ * expiresAt (str): When the generated data expires

  Raises:
  Exception: If the status check fails.
@@ -950,7 +1953,9 @@ class FirecrawlApp:

  return {'success': False, 'error': 'Internal server error'}

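The async variant returns the job id, which `check_generate_llms_text_status` then resolves to the generated text. The wrapper itself checks the start response with dict-style access (`response.get('success')`), so the sketch below does the same; treating the status result as dict-like is likewise an assumption, since only the docstring and error paths are shown in this hunk:

    import time
    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
    started = app.async_generate_llms_text("https://example.com", max_urls=5)

    if started.get("success") and "id" in started:
        status = app.check_generate_llms_text_status(started["id"])
        while status.get("status") == "processing":
            time.sleep(2)
            status = app.check_generate_llms_text_status(started["id"])
        # 'llmstxt' / 'llmsfulltxt' field names follow the docstring above.
        print(status.get("data", {}).get("llmstxt", ""))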
- def _prepare_headers(
+ def _prepare_headers(
+ self,
+ idempotency_key: Optional[str] = None) -> Dict[str, str]:
  """
  Prepare the headers for API requests.

@@ -972,11 +1977,13 @@ class FirecrawlApp:
  'Authorization': f'Bearer {self.api_key}',
  }

- def _post_request(
-
-
-
-
+ def _post_request(
+ self,
+ url: str,
+ data: Dict[str, Any],
+ headers: Dict[str, str],
+ retries: int = 3,
+ backoff_factor: float = 0.5) -> requests.Response:
  """
  Make a POST request with retries.

@@ -1001,10 +2008,12 @@ class FirecrawlApp:
  return response
  return response

- def _get_request(
-
-
-
+ def _get_request(
+ self,
+ url: str,
+ headers: Dict[str, str],
+ retries: int = 3,
+ backoff_factor: float = 0.5) -> requests.Response:
  """
  Make a GET request with retries.

@@ -1028,10 +2037,12 @@ class FirecrawlApp:
  return response
  return response

- def _delete_request(
-
-
-
+ def _delete_request(
+ self,
+ url: str,
+ headers: Dict[str, str],
+ retries: int = 3,
+ backoff_factor: float = 0.5) -> requests.Response:
  """
  Make a DELETE request with retries.

@@ -1055,16 +2066,21 @@ class FirecrawlApp:
  return response
  return response

- def _monitor_job_status(
+ def _monitor_job_status(
+ self,
+ id: str,
+ headers: Dict[str, str],
+ poll_interval: int) -> CrawlStatusResponse:
  """
  Monitor the status of a crawl job until completion.

  Args:
  id (str): The ID of the crawl job.
  headers (Dict[str, str]): The headers to include in the status check requests.
- poll_interval (int):
+ poll_interval (int): Seconds between status checks.
+
  Returns:
-
+ CrawlStatusResponse: The crawl results if the job is completed successfully.

  Raises:
  Exception: If the job fails or an error occurs during status checks.
@@ -1091,7 +2107,7 @@ class FirecrawlApp:
  raise Exception(f'Failed to parse Firecrawl response as JSON.')
  data.extend(status_data.get('data', []))
  status_data['data'] = data
- return status_data
+ return CrawlStatusResponse(**status_data)
  else:
  raise Exception('Crawl job completed but no data was returned')
  elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
@@ -1102,7 +2118,10 @@ class FirecrawlApp:
  else:
  self._handle_error(status_response, 'check crawl status')

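The request helpers above all take `retries` and `backoff_factor` parameters; the async client later in this diff spells the delay out as `backoff_factor * (2 ** attempt)` on a 502 response, and the sync helpers presumably follow the same schedule. A quick illustration of the default delays:

    # Default schedule for retries=3, backoff_factor=0.5 (assumed to match the async client).
    delays = [0.5 * (2 ** attempt) for attempt in range(3)]
    print(delays)  # [0.5, 1.0, 2.0] seconds between attempts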
- def _handle_error(
+ def _handle_error(
+ self,
+ response: requests.Response,
+ action: str) -> None:
  """
  Handle errors from API responses.

@@ -1119,49 +2138,100 @@ class FirecrawlApp:
  except:
  raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)

-
- if response.status_code == 402:
- message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
- elif response.status_code == 403:
- message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
- elif response.status_code == 408:
- message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
- elif response.status_code == 409:
- message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
- elif response.status_code == 500:
- message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
- else:
- message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
+ message = self._get_error_message(response.status_code, action, error_message, error_details)

  # Raise an HTTPError with the custom message and attach the response
  raise requests.exceptions.HTTPError(message, response=response)

- def
-
-
+ def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
+ """
+ Generate a standardized error message based on HTTP status code.
+
+ Args:
+ status_code (int): The HTTP status code from the response
+ action (str): Description of the action that was being performed
+ error_message (str): The error message from the API response
+ error_details (str): Additional error details from the API response
+
+ Returns:
+ str: A formatted error message
+ """
+ if status_code == 402:
+ return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
+ elif status_code == 403:
+ message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
+ elif status_code == 408:
+ return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
+ elif status_code == 409:
+ return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
+ elif status_code == 500:
+ return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
+ else:
+ return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
+
+ def deep_research(
+ self,
+ query: str,
+ *,
+ max_depth: Optional[int] = None,
+ time_limit: Optional[int] = None,
+ max_urls: Optional[int] = None,
+ analysis_prompt: Optional[str] = None,
+ system_prompt: Optional[str] = None,
+ __experimental_stream_steps: Optional[bool] = None,
+ on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
+ on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
  """
  Initiates a deep research operation on a given query and polls until completion.

  Args:
- query (str):
-
-
+ query (str): Research query or topic to investigate
+ max_depth (Optional[int]): Maximum depth of research exploration
+ time_limit (Optional[int]): Time limit in seconds for research
+ max_urls (Optional[int]): Maximum number of URLs to process
+ analysis_prompt (Optional[str]): Custom prompt for analysis
+ system_prompt (Optional[str]): Custom system prompt
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+ on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
+ on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}

  Returns:
-
+ DeepResearchStatusResponse containing:
+ * success (bool): Whether research completed successfully
+ * status (str): Current state (processing/completed/failed)
+ * error (Optional[str]): Error message if failed
+ * id (str): Unique identifier for the research job
+ * data (Any): Research findings and analysis
+ * sources (List[Dict]): List of discovered sources
+ * activities (List[Dict]): Research progress log
+ * summaries (List[str]): Generated research summaries

  Raises:
- Exception: If
+ Exception: If research fails
  """
-
-
-
- if
- research_params =
-
- research_params =
-
-
+ research_params = {}
+ if max_depth is not None:
+ research_params['maxDepth'] = max_depth
+ if time_limit is not None:
+ research_params['timeLimit'] = time_limit
+ if max_urls is not None:
+ research_params['maxUrls'] = max_urls
+ if analysis_prompt is not None:
+ research_params['analysisPrompt'] = analysis_prompt
+ if system_prompt is not None:
+ research_params['systemPrompt'] = system_prompt
+ if __experimental_stream_steps is not None:
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
+ research_params = DeepResearchParams(**research_params)
+
+ response = self.async_deep_research(
+ query,
+ max_depth=max_depth,
+ time_limit=time_limit,
+ max_urls=max_urls,
+ analysis_prompt=analysis_prompt,
+ system_prompt=system_prompt
+ )
  if not response.get('success') or 'id' not in response:
  return response

@@ -1194,31 +2264,57 @@ class FirecrawlApp:
  time.sleep(2) # Polling interval

  return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
-
+
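`deep_research` polls until the job finishes and can stream progress through the two callbacks; the payload keys below come from the docstring above. A sketch with placeholder callbacks and key:

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

    def on_activity(activity):
        # Receives {type, status, message, timestamp, depth} per the docstring.
        print(f"[{activity['type']}] {activity['message']}")

    def on_source(source):
        # Receives {url, title, description}.
        print("found:", source["url"])

    research = app.deep_research(
        "What are the latest advances in web scraping?",
        max_depth=3,
        time_limit=120,
        on_activity=on_activity,
        on_source=on_source,
    )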
+ def async_deep_research(
+ self,
+ query: str,
+ *,
+ max_depth: Optional[int] = None,
+ time_limit: Optional[int] = None,
+ max_urls: Optional[int] = None,
+ analysis_prompt: Optional[str] = None,
+ system_prompt: Optional[str] = None,
+ __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
  """
  Initiates an asynchronous deep research operation.

  Args:
- query (str):
-
+ query (str): Research query or topic to investigate
+ max_depth (Optional[int]): Maximum depth of research exploration
+ time_limit (Optional[int]): Time limit in seconds for research
+ max_urls (Optional[int]): Maximum number of URLs to process
+ analysis_prompt (Optional[str]): Custom prompt for analysis
+ system_prompt (Optional[str]): Custom system prompt
+ __experimental_stream_steps (Optional[bool]): Enable experimental streaming

  Returns:
- Dict[str, Any]:
+ Dict[str, Any]: A response containing:
+ * success (bool): Whether the research initiation was successful
+ * id (str): The unique identifier for the research job
+ * error (str, optional): Error message if initiation failed

  Raises:
  Exception: If the research initiation fails.
  """
-
-
-
- if
- research_params =
-
- research_params =
+ research_params = {}
+ if max_depth is not None:
+ research_params['maxDepth'] = max_depth
+ if time_limit is not None:
+ research_params['timeLimit'] = time_limit
+ if max_urls is not None:
+ research_params['maxUrls'] = max_urls
+ if analysis_prompt is not None:
+ research_params['analysisPrompt'] = analysis_prompt
+ if system_prompt is not None:
+ research_params['systemPrompt'] = system_prompt
+ if __experimental_stream_steps is not None:
+ research_params['__experimental_streamSteps'] = __experimental_stream_steps
+ research_params = DeepResearchParams(**research_params)

  headers = self._prepare_headers()

  json_data = {'query': query, **research_params.dict(exclude_none=True)}
+ json_data['origin'] = f"python-sdk@{version}"

  # Handle json options schema if present
  if 'jsonOptions' in json_data:
@@ -1240,7 +2336,7 @@ class FirecrawlApp:

  return {'success': False, 'error': 'Internal server error'}

- def check_deep_research_status(self, id: str) ->
+ def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
  """
  Check the status of a deep research operation.

@@ -1248,7 +2344,19 @@ class FirecrawlApp:
  id (str): The ID of the deep research operation.

  Returns:
-
+ DeepResearchResponse containing:
+
+ Status:
+ * success - Whether research completed successfully
+ * status - Current state (processing/completed/failed)
+ * error - Error message if failed
+
+ Results:
+ * id - Unique identifier for the research job
+ * data - Research findings and analysis
+ * sources - List of discovered sources
+ * activities - Research progress log
+ * summaries - Generated research summaries

  Raises:
  Exception: If the status check fails.
@@ -1271,6 +2379,17 @@ class FirecrawlApp:
  return {'success': False, 'error': 'Internal server error'}

  class CrawlWatcher:
+ """
+ A class to watch and handle crawl job events via WebSocket connection.
+
+ Attributes:
+ id (str): The ID of the crawl job to watch
+ app (FirecrawlApp): The FirecrawlApp instance
+ data (List[Dict[str, Any]]): List of crawled documents/data
+ status (str): Current status of the crawl job
+ ws_url (str): WebSocket URL for the crawl job
+ event_handlers (dict): Dictionary of event type to list of handler functions
+ """
  def __init__(self, id: str, app: FirecrawlApp):
  self.id = id
  self.app = app
@@ -1283,25 +2402,57 @@ class CrawlWatcher:
  'document': []
  }

- async def connect(self):
-
+ async def connect(self) -> None:
+ """
+ Establishes WebSocket connection and starts listening for messages.
+ """
+ async with websockets.connect(
+ self.ws_url,
+ additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
+ ) as websocket:
  await self._listen(websocket)

- async def _listen(self, websocket):
+ async def _listen(self, websocket) -> None:
+ """
+ Listens for incoming WebSocket messages and handles them.
+
+ Args:
+ websocket: The WebSocket connection object
+ """
  async for message in websocket:
  msg = json.loads(message)
  await self._handle_message(msg)

- def add_event_listener(self, event_type: str, handler):
+ def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
+ """
+ Adds an event handler function for a specific event type.
+
+ Args:
+ event_type (str): Type of event to listen for ('done', 'error', or 'document')
+ handler (Callable): Function to handle the event
+ """
  if event_type in self.event_handlers:
  self.event_handlers[event_type].append(handler)

- def dispatch_event(self, event_type: str, detail: Dict[str, Any]):
+ def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
+ """
+ Dispatches an event to all registered handlers for that event type.
+
+ Args:
+ event_type (str): Type of event to dispatch
+ detail (Dict[str, Any]): Event details/data to pass to handlers
+ """
  if event_type in self.event_handlers:
  for handler in self.event_handlers[event_type]:
  handler(detail)

- async def _handle_message(self, msg: Dict[str, Any]):
+ async def _handle_message(self, msg: Dict[str, Any]) -> None:
+ """
+ Handles incoming WebSocket messages based on their type.
+
+ Args:
+ msg (Dict[str, Any]): The message to handle
+ """
  if msg['type'] == 'done':
  self.status = 'completed'
  self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
@@ -1316,3 +2467,1773 @@ class CrawlWatcher:
  elif msg['type'] == 'document':
  self.data.append(msg['data'])
  self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
+
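The new docstrings pin down the event contract: `_handle_message` dispatches 'document' with `{'data', 'id'}` and 'done' with `{'status', 'data', 'id'}`. A small sketch of handlers written against that contract, invoked here with illustrative payloads rather than a live watcher:

    from typing import Any, Dict

    def on_document(detail: Dict[str, Any]) -> None:
        print("scraped page for job", detail["id"])

    def on_done(detail: Dict[str, Any]) -> None:
        print("status:", detail["status"], "documents:", len(detail["data"]))

    # Illustrative payloads matching the shapes dispatched by _handle_message.
    on_document({"data": {"markdown": "# Example"}, "id": "job-123"})
    on_done({"status": "completed", "data": [{"markdown": "# Example"}], "id": "job-123"})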
+ class AsyncFirecrawlApp(FirecrawlApp):
+ """
+ Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
+ Provides non-blocking alternatives to all FirecrawlApp operations.
+ """
+
+ async def _async_request(
+ self,
+ method: str,
+ url: str,
+ headers: Dict[str, str],
+ data: Optional[Dict[str, Any]] = None,
+ retries: int = 3,
+ backoff_factor: float = 0.5) -> Dict[str, Any]:
+ """
+ Generic async request method with exponential backoff retry logic.
+
+ Args:
+ method (str): The HTTP method to use (e.g., "GET" or "POST").
+ url (str): The URL to send the request to.
+ headers (Dict[str, str]): Headers to include in the request.
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
+ retries (int): Maximum number of retry attempts (default: 3).
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
+ Delay will be backoff_factor * (2 ** retry_count).
+
+ Returns:
+ Dict[str, Any]: The parsed JSON response from the server.
+
+ Raises:
+ aiohttp.ClientError: If the request fails after all retries.
+ Exception: If max retries are exceeded or other errors occur.
+ """
+ async with aiohttp.ClientSession() as session:
+ for attempt in range(retries):
+ try:
+ async with session.request(
+ method=method, url=url, headers=headers, json=data
+ ) as response:
+ if response.status == 502:
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
+ continue
+ if response.status >= 300:
+ await self._handle_error(response, f"make {method} request")
+ return await response.json()
+ except aiohttp.ClientError as e:
+ if attempt == retries - 1:
+ raise e
+ await asyncio.sleep(backoff_factor * (2 ** attempt))
+ raise Exception("Max retries exceeded")
+
+ async def _async_post_request(
+ self, url: str, data: Dict[str, Any], headers: Dict[str, str],
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
+ """
+ Make an async POST request with exponential backoff retry logic.
+
+ Args:
+ url (str): The URL to send the POST request to.
+ data (Dict[str, Any]): The JSON data to include in the request body.
+ headers (Dict[str, str]): Headers to include in the request.
+ retries (int): Maximum number of retry attempts (default: 3).
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
+ Delay will be backoff_factor * (2 ** retry_count).
+
+ Returns:
+ Dict[str, Any]: The parsed JSON response from the server.
+
+ Raises:
+ aiohttp.ClientError: If the request fails after all retries.
+ Exception: If max retries are exceeded or other errors occur.
+ """
+ return await self._async_request("POST", url, headers, data, retries, backoff_factor)
+
+ async def _async_get_request(
+ self, url: str, headers: Dict[str, str],
+ retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
+ """
+ Make an async GET request with exponential backoff retry logic.
+
+ Args:
+ url (str): The URL to send the GET request to.
+ headers (Dict[str, str]): Headers to include in the request.
+ retries (int): Maximum number of retry attempts (default: 3).
+ backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
+ Delay will be backoff_factor * (2 ** retry_count).
+
+ Returns:
+ Dict[str, Any]: The parsed JSON response from the server.
+
+ Raises:
+ aiohttp.ClientError: If the request fails after all retries.
+ Exception: If max retries are exceeded or other errors occur.
+ """
+ return await self._async_request("GET", url, headers, None, retries, backoff_factor)
+
+ async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
+ """
+ Handle errors from async API responses with detailed error messages.
+
+ Args:
+ response (aiohttp.ClientResponse): The response object from the failed request
+ action (str): Description of the action that was being attempted
+
+ Raises:
+ aiohttp.ClientError: With a detailed error message based on the response status:
+ - 402: Payment Required
+ - 408: Request Timeout
+ - 409: Conflict
+ - 500: Internal Server Error
+ - Other: Unexpected error with status code
+ """
+ try:
+ error_data = await response.json()
+ error_message = error_data.get('error', 'No error message provided.')
+ error_details = error_data.get('details', 'No additional error details provided.')
+ except:
+ raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
+
+ message = await self._get_async_error_message(response.status, action, error_message, error_details)
+
+ raise aiohttp.ClientError(message)
+
+ async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
+ """
+ Generate a standardized error message based on HTTP status code for async operations.
+
+ Args:
+ status_code (int): The HTTP status code from the response
+ action (str): Description of the action that was being performed
+ error_message (str): The error message from the API response
+ error_details (str): Additional error details from the API response
+
+ Returns:
+ str: A formatted error message
+ """
+ return self._get_error_message(status_code, action, error_message, error_details)
+
+ async def crawl_url_and_watch(
+ self,
+ url: str,
+ params: Optional[CrawlParams] = None,
+ idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
+ """
+ Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.
+
+ Args:
+ url (str): Target URL to start crawling from
+ params (Optional[CrawlParams]): See CrawlParams model for configuration:
+ URL Discovery:
+ * includePaths - Patterns of URLs to include
+ * excludePaths - Patterns of URLs to exclude
+ * maxDepth - Maximum crawl depth
+ * maxDiscoveryDepth - Maximum depth for finding new URLs
+ * limit - Maximum pages to crawl
+
+ Link Following:
+ * allowBackwardLinks - Follow parent directory links
+ * allowExternalLinks - Follow external domain links
+ * ignoreSitemap - Skip sitemap.xml processing
+
+ Advanced:
+ * scrapeOptions - Page scraping configuration
+ * webhook - Notification webhook settings
+ * deduplicateSimilarURLs - Remove similar URLs
+ * ignoreQueryParameters - Ignore URL parameters
+ * regexOnFullURL - Apply regex to full URLs
+ idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+
+ Returns:
+ AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket
+
+ Raises:
+ Exception: If crawl job fails to start
+ """
+ crawl_response = await self.async_crawl_url(url, params, idempotency_key)
+ if crawl_response.get('success') and 'id' in crawl_response:
+ return AsyncCrawlWatcher(crawl_response['id'], self)
+ else:
+ raise Exception("Crawl job failed to start")
|
|
2651
|
+
|
|
2652
|
+
async def batch_scrape_urls_and_watch(
|
|
2653
|
+
self,
|
|
2654
|
+
urls: List[str],
|
|
2655
|
+
params: Optional[ScrapeParams] = None,
|
|
2656
|
+
idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
|
|
2657
|
+
"""
|
|
2658
|
+
Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.
|
|
2659
|
+
|
|
2660
|
+
Args:
|
|
2661
|
+
urls (List[str]): List of URLs to scrape
|
|
2662
|
+
params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
|
|
2663
|
+
|
|
2664
|
+
Content Options:
|
|
2665
|
+
* formats - Content formats to retrieve
|
|
2666
|
+
* includeTags - HTML tags to include
|
|
2667
|
+
* excludeTags - HTML tags to exclude
|
|
2668
|
+
* onlyMainContent - Extract main content only
|
|
2669
|
+
|
|
2670
|
+
Request Options:
|
|
2671
|
+
* headers - Custom HTTP headers
|
|
2672
|
+
* timeout - Request timeout (ms)
|
|
2673
|
+
* mobile - Use mobile user agent
|
|
2674
|
+
* proxy - Proxy type
|
|
2675
|
+
|
|
2676
|
+
Extraction Options:
|
|
2677
|
+
* extract - Content extraction config
|
|
2678
|
+
* jsonOptions - JSON extraction config
|
|
2679
|
+
* actions - Actions to perform
|
|
2680
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
2681
|
+
|
|
2682
|
+
Returns:
|
|
2683
|
+
AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket
|
|
2684
|
+
|
|
2685
|
+
Raises:
|
|
2686
|
+
Exception: If batch scrape job fails to start
|
|
2687
|
+
"""
|
|
2688
|
+
batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
|
|
2689
|
+
if batch_response.get('success') and 'id' in batch_response:
|
|
2690
|
+
return AsyncCrawlWatcher(batch_response['id'], self)
|
|
2691
|
+
else:
|
|
2692
|
+
raise Exception("Batch scrape job failed to start")
|
|
2693
|
+
|
|
2694
|
+
async def scrape_url(
|
|
2695
|
+
self,
|
|
2696
|
+
url: str,
|
|
2697
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
2698
|
+
include_tags: Optional[List[str]] = None,
|
|
2699
|
+
exclude_tags: Optional[List[str]] = None,
|
|
2700
|
+
only_main_content: Optional[bool] = None,
|
|
2701
|
+
wait_for: Optional[int] = None,
|
|
2702
|
+
timeout: Optional[int] = None,
|
|
2703
|
+
location: Optional[LocationConfig] = None,
|
|
2704
|
+
mobile: Optional[bool] = None,
|
|
2705
|
+
skip_tls_verification: Optional[bool] = None,
|
|
2706
|
+
remove_base64_images: Optional[bool] = None,
|
|
2707
|
+
block_ads: Optional[bool] = None,
|
|
2708
|
+
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
2709
|
+
extract: Optional[ExtractConfig] = None,
|
|
2710
|
+
json_options: Optional[ExtractConfig] = None,
|
|
2711
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]:
|
|
2712
|
+
"""
|
|
2713
|
+
Scrape and extract content from a URL asynchronously.
|
|
2714
|
+
|
|
2715
|
+
Args:
|
|
2716
|
+
url (str): Target URL to scrape
|
|
2717
|
+
formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
|
|
2718
|
+
include_tags (Optional[List[str]]): HTML tags to include
|
|
2719
|
+
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
2720
|
+
only_main_content (Optional[bool]): Extract main content only
|
|
2721
|
+
wait_for (Optional[int]): Wait time in milliseconds before scraping
|
|
2722
|
+
timeout (Optional[int]): Request timeout (ms)
|
|
2723
|
+
location (Optional[LocationConfig]): Location configuration
|
|
2724
|
+
mobile (Optional[bool]): Use mobile user agent
|
|
2725
|
+
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
2726
|
+
remove_base64_images (Optional[bool]): Remove base64 images
|
|
2727
|
+
block_ads (Optional[bool]): Block ads
|
|
2728
|
+
proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
|
|
2729
|
+
extract (Optional[ExtractConfig]): Content extraction settings
|
|
2730
|
+
json_options (Optional[ExtractConfig]): JSON extraction settings
|
|
2731
|
+
actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
|
|
2732
|
+
|
|
2733
|
+
Returns:
|
|
2734
|
+
ScrapeResponse with:
|
|
2735
|
+
* Requested content formats
|
|
2736
|
+
* Page metadata
|
|
2737
|
+
* Extraction results
|
|
2738
|
+
* Success/error status
|
|
2739
|
+
|
|
2740
|
+
Raises:
|
|
2741
|
+
Exception: If scraping fails
|
|
2742
|
+
"""
|
|
2743
|
+
headers = self._prepare_headers()
|
|
2744
|
+
|
|
2745
|
+
# Build scrape parameters
|
|
2746
|
+
scrape_params = {
|
|
2747
|
+
'url': url,
|
|
2748
|
+
'origin': f"python-sdk@{version}"
|
|
2749
|
+
}
|
|
2750
|
+
|
|
2751
|
+
# Add optional parameters if provided and not None
|
|
2752
|
+
if formats:
|
|
2753
|
+
scrape_params['formats'] = formats
|
|
2754
|
+
if include_tags:
|
|
2755
|
+
scrape_params['includeTags'] = include_tags
|
|
2756
|
+
if exclude_tags:
|
|
2757
|
+
scrape_params['excludeTags'] = exclude_tags
|
|
2758
|
+
if only_main_content is not None:
|
|
2759
|
+
scrape_params['onlyMainContent'] = only_main_content
|
|
2760
|
+
if wait_for:
|
|
2761
|
+
scrape_params['waitFor'] = wait_for
|
|
2762
|
+
if timeout:
|
|
2763
|
+
scrape_params['timeout'] = timeout
|
|
2764
|
+
if location:
|
|
2765
|
+
scrape_params['location'] = location.dict(exclude_none=True)
|
|
2766
|
+
if mobile is not None:
|
|
2767
|
+
scrape_params['mobile'] = mobile
|
|
2768
|
+
if skip_tls_verification is not None:
|
|
2769
|
+
scrape_params['skipTlsVerification'] = skip_tls_verification
|
|
2770
|
+
if remove_base64_images is not None:
|
|
2771
|
+
scrape_params['removeBase64Images'] = remove_base64_images
|
|
2772
|
+
if block_ads is not None:
|
|
2773
|
+
scrape_params['blockAds'] = block_ads
|
|
2774
|
+
if proxy:
|
|
2775
|
+
scrape_params['proxy'] = proxy
|
|
2776
|
+
if extract:
|
|
2777
|
+
extract_dict = extract.dict(exclude_none=True)
|
|
2778
|
+
if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
|
|
2779
|
+
extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted
|
|
2780
|
+
scrape_params['extract'] = extract_dict
|
|
2781
|
+
if json_options:
|
|
2782
|
+
json_options_dict = json_options.dict(exclude_none=True)
|
|
2783
|
+
if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
|
|
2784
|
+
json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted
|
|
2785
|
+
scrape_params['jsonOptions'] = json_options_dict
|
|
2786
|
+
if actions:
|
|
2787
|
+
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
2788
|
+
|
|
2789
|
+
# Make async request
|
|
2790
|
+
endpoint = f'/v1/scrape'
|
|
2791
|
+
response = await self._async_post_request(
|
|
2792
|
+
f'{self.api_url}{endpoint}',
|
|
2793
|
+
scrape_params,
|
|
2794
|
+
headers
|
|
2795
|
+
)
|
|
2796
|
+
|
|
2797
|
+
if response.get('success') and 'data' in response:
|
|
2798
|
+
return ScrapeResponse(**response['data'])
|
|
2799
|
+
elif "error" in response:
|
|
2800
|
+
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
|
2801
|
+
else:
|
|
2802
|
+
# Use the response content directly if possible, otherwise a generic message
|
|
2803
|
+
error_content = response.get('error', str(response))
|
|
2804
|
+
raise Exception(f'Failed to scrape URL. Error: {error_content}')
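A minimal usage sketch for the async scrape method above; the AsyncFirecrawlApp export name, the placeholder API key, and the attribute access on the response are assumptions rather than the definitive API.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name for this async client

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    doc = await app.scrape_url("https://example.com", formats=["markdown", "links"])
    # ScrapeResponse is assumed to expose the requested formats as attributes.
    print(doc.markdown[:200] if doc.markdown else "no markdown returned")

asyncio.run(main())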
|
|
2805
|
+
|
|
2806
|
+
async def batch_scrape_urls(
|
|
2807
|
+
self,
|
|
2808
|
+
urls: List[str],
|
|
2809
|
+
*,
|
|
2810
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
2811
|
+
headers: Optional[Dict[str, str]] = None,
|
|
2812
|
+
include_tags: Optional[List[str]] = None,
|
|
2813
|
+
exclude_tags: Optional[List[str]] = None,
|
|
2814
|
+
only_main_content: Optional[bool] = None,
|
|
2815
|
+
wait_for: Optional[int] = None,
|
|
2816
|
+
timeout: Optional[int] = None,
|
|
2817
|
+
location: Optional[LocationConfig] = None,
|
|
2818
|
+
mobile: Optional[bool] = None,
|
|
2819
|
+
skip_tls_verification: Optional[bool] = None,
|
|
2820
|
+
remove_base64_images: Optional[bool] = None,
|
|
2821
|
+
block_ads: Optional[bool] = None,
|
|
2822
|
+
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
2823
|
+
extract: Optional[ExtractConfig] = None,
|
|
2824
|
+
json_options: Optional[ExtractConfig] = None,
|
|
2825
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
2826
|
+
agent: Optional[AgentOptions] = None,
|
|
2827
|
+
poll_interval: Optional[int] = 2,
|
|
2828
|
+
idempotency_key: Optional[str] = None,
|
|
2829
|
+
**kwargs
|
|
2830
|
+
) -> BatchScrapeStatusResponse:
|
|
2831
|
+
"""
|
|
2832
|
+
Asynchronously scrape multiple URLs and monitor until completion.
|
|
2833
|
+
|
|
2834
|
+
Args:
|
|
2835
|
+
urls (List[str]): URLs to scrape
|
|
2836
|
+
formats (Optional[List[Literal]]): Content formats to retrieve
|
|
2837
|
+
headers (Optional[Dict[str, str]]): Custom HTTP headers
|
|
2838
|
+
include_tags (Optional[List[str]]): HTML tags to include
|
|
2839
|
+
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
2840
|
+
only_main_content (Optional[bool]): Extract main content only
|
|
2841
|
+
wait_for (Optional[int]): Wait time in milliseconds
|
|
2842
|
+
timeout (Optional[int]): Request timeout in milliseconds
|
|
2843
|
+
location (Optional[LocationConfig]): Location configuration
|
|
2844
|
+
mobile (Optional[bool]): Use mobile user agent
|
|
2845
|
+
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
2846
|
+
remove_base64_images (Optional[bool]): Remove base64 encoded images
|
|
2847
|
+
block_ads (Optional[bool]): Block advertisements
|
|
2848
|
+
proxy (Optional[Literal]): Proxy type to use
|
|
2849
|
+
extract (Optional[ExtractConfig]): Content extraction config
|
|
2850
|
+
json_options (Optional[ExtractConfig]): JSON extraction config
|
|
2851
|
+
actions (Optional[List[Union]]): Actions to perform
|
|
2852
|
+
agent (Optional[AgentOptions]): Agent configuration
|
|
2853
|
+
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
2854
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
2855
|
+
**kwargs: Additional parameters to pass to the API
|
|
2856
|
+
|
|
2857
|
+
Returns:
|
|
2858
|
+
BatchScrapeStatusResponse with:
|
|
2859
|
+
* Scraping status and progress
|
|
2860
|
+
* Scraped content for each URL
|
|
2861
|
+
* Success/error information
|
|
2862
|
+
|
|
2863
|
+
Raises:
|
|
2864
|
+
Exception: If batch scrape fails
|
|
2865
|
+
"""
|
|
2866
|
+
scrape_params = {}
|
|
2867
|
+
|
|
2868
|
+
# Add individual parameters
|
|
2869
|
+
if formats is not None:
|
|
2870
|
+
scrape_params['formats'] = formats
|
|
2871
|
+
if headers is not None:
|
|
2872
|
+
scrape_params['headers'] = headers
|
|
2873
|
+
if include_tags is not None:
|
|
2874
|
+
scrape_params['includeTags'] = include_tags
|
|
2875
|
+
if exclude_tags is not None:
|
|
2876
|
+
scrape_params['excludeTags'] = exclude_tags
|
|
2877
|
+
if only_main_content is not None:
|
|
2878
|
+
scrape_params['onlyMainContent'] = only_main_content
|
|
2879
|
+
if wait_for is not None:
|
|
2880
|
+
scrape_params['waitFor'] = wait_for
|
|
2881
|
+
if timeout is not None:
|
|
2882
|
+
scrape_params['timeout'] = timeout
|
|
2883
|
+
if location is not None:
|
|
2884
|
+
scrape_params['location'] = location.dict(exclude_none=True)
|
|
2885
|
+
if mobile is not None:
|
|
2886
|
+
scrape_params['mobile'] = mobile
|
|
2887
|
+
if skip_tls_verification is not None:
|
|
2888
|
+
scrape_params['skipTlsVerification'] = skip_tls_verification
|
|
2889
|
+
if remove_base64_images is not None:
|
|
2890
|
+
scrape_params['removeBase64Images'] = remove_base64_images
|
|
2891
|
+
if block_ads is not None:
|
|
2892
|
+
scrape_params['blockAds'] = block_ads
|
|
2893
|
+
if proxy is not None:
|
|
2894
|
+
scrape_params['proxy'] = proxy
|
|
2895
|
+
if extract is not None:
|
|
2896
|
+
if hasattr(extract.schema, 'schema'):
|
|
2897
|
+
extract.schema = extract.schema.schema()
|
|
2898
|
+
scrape_params['extract'] = extract.dict(exclude_none=True)
|
|
2899
|
+
if json_options is not None:
|
|
2900
|
+
if hasattr(json_options.schema, 'schema'):
|
|
2901
|
+
json_options.schema = json_options.schema.schema()
|
|
2902
|
+
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
|
|
2903
|
+
if actions is not None:
|
|
2904
|
+
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
2905
|
+
if agent is not None:
|
|
2906
|
+
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
2907
|
+
|
|
2908
|
+
# Add any additional kwargs
|
|
2909
|
+
scrape_params.update(kwargs)
|
|
2910
|
+
|
|
2911
|
+
# Create final params object
|
|
2912
|
+
final_params = ScrapeParams(**scrape_params)
|
|
2913
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
2914
|
+
params_dict['urls'] = urls
|
|
2915
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
2916
|
+
|
|
2917
|
+
# Make request
|
|
2918
|
+
headers = self._prepare_headers(idempotency_key)
|
|
2919
|
+
response = await self._async_post_request(
|
|
2920
|
+
f'{self.api_url}/v1/batch/scrape',
|
|
2921
|
+
params_dict,
|
|
2922
|
+
headers
|
|
2923
|
+
)
|
|
2924
|
+
|
|
2925
|
+
if response.get('success') and 'id' in response:
|
|
2926
|
+
try:
|
|
2927
|
+
id = response['id']
|
|
2928
|
+
except:
|
|
2929
|
+
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
2930
|
+
return await self._async_monitor_job_status(id, headers, poll_interval)
|
|
2931
|
+
else:
|
|
2932
|
+
raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
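A sketch of the blocking batch scrape above, under the same assumptions about the client class name; the completed job is treated here as the raw status payload returned by the monitor.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    status = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        poll_interval=2,  # seconds between status checks
    )
    print(status.get("completed"), "of", status.get("total"), "pages scraped")

asyncio.run(main())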
|
|
2933
|
+
|
|
2934
|
+
|
|
2935
|
+
async def async_batch_scrape_urls(
|
|
2936
|
+
self,
|
|
2937
|
+
urls: List[str],
|
|
2938
|
+
*,
|
|
2939
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
2940
|
+
headers: Optional[Dict[str, str]] = None,
|
|
2941
|
+
include_tags: Optional[List[str]] = None,
|
|
2942
|
+
exclude_tags: Optional[List[str]] = None,
|
|
2943
|
+
only_main_content: Optional[bool] = None,
|
|
2944
|
+
wait_for: Optional[int] = None,
|
|
2945
|
+
timeout: Optional[int] = None,
|
|
2946
|
+
location: Optional[LocationConfig] = None,
|
|
2947
|
+
mobile: Optional[bool] = None,
|
|
2948
|
+
skip_tls_verification: Optional[bool] = None,
|
|
2949
|
+
remove_base64_images: Optional[bool] = None,
|
|
2950
|
+
block_ads: Optional[bool] = None,
|
|
2951
|
+
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
2952
|
+
extract: Optional[ExtractConfig] = None,
|
|
2953
|
+
json_options: Optional[ExtractConfig] = None,
|
|
2954
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
2955
|
+
agent: Optional[AgentOptions] = None,
|
|
2956
|
+
idempotency_key: Optional[str] = None,
|
|
2957
|
+
**kwargs
|
|
2958
|
+
) -> BatchScrapeResponse:
|
|
2959
|
+
"""
|
|
2960
|
+
Initiate a batch scrape job asynchronously.
|
|
2961
|
+
|
|
2962
|
+
Args:
|
|
2963
|
+
urls (List[str]): URLs to scrape
|
|
2964
|
+
formats (Optional[List[Literal]]): Content formats to retrieve
|
|
2965
|
+
headers (Optional[Dict[str, str]]): Custom HTTP headers
|
|
2966
|
+
include_tags (Optional[List[str]]): HTML tags to include
|
|
2967
|
+
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
2968
|
+
only_main_content (Optional[bool]): Extract main content only
|
|
2969
|
+
wait_for (Optional[int]): Wait time in milliseconds
|
|
2970
|
+
timeout (Optional[int]): Request timeout in milliseconds
|
|
2971
|
+
location (Optional[LocationConfig]): Location configuration
|
|
2972
|
+
mobile (Optional[bool]): Use mobile user agent
|
|
2973
|
+
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
2974
|
+
remove_base64_images (Optional[bool]): Remove base64 encoded images
|
|
2975
|
+
block_ads (Optional[bool]): Block advertisements
|
|
2976
|
+
proxy (Optional[Literal]): Proxy type to use
|
|
2977
|
+
extract (Optional[ExtractConfig]): Content extraction config
|
|
2978
|
+
json_options (Optional[ExtractConfig]): JSON extraction config
|
|
2979
|
+
actions (Optional[List[Union]]): Actions to perform
|
|
2980
|
+
agent (Optional[AgentOptions]): Agent configuration
|
|
2981
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
2982
|
+
**kwargs: Additional parameters to pass to the API
|
|
2983
|
+
|
|
2984
|
+
Returns:
|
|
2985
|
+
BatchScrapeResponse with:
|
|
2986
|
+
* success - Whether job started successfully
|
|
2987
|
+
* id - Unique identifier for the job
|
|
2988
|
+
* url - Status check URL
|
|
2989
|
+
* error - Error message if start failed
|
|
2990
|
+
|
|
2991
|
+
Raises:
|
|
2992
|
+
Exception: If job initiation fails
|
|
2993
|
+
"""
|
|
2994
|
+
scrape_params = {}
|
|
2995
|
+
|
|
2996
|
+
# Add individual parameters
|
|
2997
|
+
if formats is not None:
|
|
2998
|
+
scrape_params['formats'] = formats
|
|
2999
|
+
if headers is not None:
|
|
3000
|
+
scrape_params['headers'] = headers
|
|
3001
|
+
if include_tags is not None:
|
|
3002
|
+
scrape_params['includeTags'] = include_tags
|
|
3003
|
+
if exclude_tags is not None:
|
|
3004
|
+
scrape_params['excludeTags'] = exclude_tags
|
|
3005
|
+
if only_main_content is not None:
|
|
3006
|
+
scrape_params['onlyMainContent'] = only_main_content
|
|
3007
|
+
if wait_for is not None:
|
|
3008
|
+
scrape_params['waitFor'] = wait_for
|
|
3009
|
+
if timeout is not None:
|
|
3010
|
+
scrape_params['timeout'] = timeout
|
|
3011
|
+
if location is not None:
|
|
3012
|
+
scrape_params['location'] = location.dict(exclude_none=True)
|
|
3013
|
+
if mobile is not None:
|
|
3014
|
+
scrape_params['mobile'] = mobile
|
|
3015
|
+
if skip_tls_verification is not None:
|
|
3016
|
+
scrape_params['skipTlsVerification'] = skip_tls_verification
|
|
3017
|
+
if remove_base64_images is not None:
|
|
3018
|
+
scrape_params['removeBase64Images'] = remove_base64_images
|
|
3019
|
+
if block_ads is not None:
|
|
3020
|
+
scrape_params['blockAds'] = block_ads
|
|
3021
|
+
if proxy is not None:
|
|
3022
|
+
scrape_params['proxy'] = proxy
|
|
3023
|
+
if extract is not None:
|
|
3024
|
+
if hasattr(extract.schema, 'schema'):
|
|
3025
|
+
extract.schema = extract.schema.schema()
|
|
3026
|
+
scrape_params['extract'] = extract.dict(exclude_none=True)
|
|
3027
|
+
if json_options is not None:
|
|
3028
|
+
if hasattr(json_options.schema, 'schema'):
|
|
3029
|
+
json_options.schema = json_options.schema.schema()
|
|
3030
|
+
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
|
|
3031
|
+
if actions is not None:
|
|
3032
|
+
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
3033
|
+
if agent is not None:
|
|
3034
|
+
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
3035
|
+
|
|
3036
|
+
# Add any additional kwargs
|
|
3037
|
+
scrape_params.update(kwargs)
|
|
3038
|
+
|
|
3039
|
+
# Create final params object
|
|
3040
|
+
final_params = ScrapeParams(**scrape_params)
|
|
3041
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
3042
|
+
params_dict['urls'] = urls
|
|
3043
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
3044
|
+
|
|
3045
|
+
# Make request
|
|
3046
|
+
headers = self._prepare_headers(idempotency_key)
|
|
3047
|
+
response = await self._async_post_request(
|
|
3048
|
+
f'{self.api_url}/v1/batch/scrape',
|
|
3049
|
+
params_dict,
|
|
3050
|
+
headers
|
|
3051
|
+
)
|
|
3052
|
+
|
|
3053
|
+
if response.get('success'):
|
|
3054
|
+
try:
|
|
3055
|
+
return BatchScrapeResponse(**response)
|
|
3056
|
+
except:
|
|
3057
|
+
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
3058
|
+
else:
|
|
3059
|
+
raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
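A sketch of the fire-and-forget variant: start the job, keep its id, and check it later with check_batch_scrape_status (defined further down). The field names on the response objects are assumptions based on the docstrings.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    job = await app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
    print("started batch scrape job", job.id)
    # Check on it later instead of blocking on completion.
    status = await app.check_batch_scrape_status(job.id)
    print("current status:", status.get("status"))

asyncio.run(main())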
|
|
3060
|
+
|
|
3061
|
+
async def crawl_url(
|
|
3062
|
+
self,
|
|
3063
|
+
url: str,
|
|
3064
|
+
*,
|
|
3065
|
+
include_paths: Optional[List[str]] = None,
|
|
3066
|
+
exclude_paths: Optional[List[str]] = None,
|
|
3067
|
+
max_depth: Optional[int] = None,
|
|
3068
|
+
max_discovery_depth: Optional[int] = None,
|
|
3069
|
+
limit: Optional[int] = None,
|
|
3070
|
+
allow_backward_links: Optional[bool] = None,
|
|
3071
|
+
allow_external_links: Optional[bool] = None,
|
|
3072
|
+
ignore_sitemap: Optional[bool] = None,
|
|
3073
|
+
scrape_options: Optional[CommonOptions] = None,
|
|
3074
|
+
webhook: Optional[Union[str, WebhookConfig]] = None,
|
|
3075
|
+
deduplicate_similar_urls: Optional[bool] = None,
|
|
3076
|
+
ignore_query_parameters: Optional[bool] = None,
|
|
3077
|
+
regex_on_full_url: Optional[bool] = None,
|
|
3078
|
+
poll_interval: Optional[int] = 2,
|
|
3079
|
+
idempotency_key: Optional[str] = None,
|
|
3080
|
+
**kwargs
|
|
3081
|
+
) -> CrawlStatusResponse:
|
|
3082
|
+
"""
|
|
3083
|
+
Crawl a website starting from a URL.
|
|
3084
|
+
|
|
3085
|
+
Args:
|
|
3086
|
+
url (str): Target URL to start crawling from
|
|
3087
|
+
include_paths (Optional[List[str]]): Patterns of URLs to include
|
|
3088
|
+
exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
|
3089
|
+
max_depth (Optional[int]): Maximum crawl depth
|
|
3090
|
+
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
3091
|
+
limit (Optional[int]): Maximum pages to crawl
|
|
3092
|
+
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
3093
|
+
allow_external_links (Optional[bool]): Follow external domain links
|
|
3094
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
3095
|
+
scrape_options (Optional[CommonOptions]): Page scraping configuration
|
|
3096
|
+
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
|
3097
|
+
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
3098
|
+
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
3099
|
+
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
3100
|
+
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
3101
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
3102
|
+
**kwargs: Additional parameters to pass to the API
|
|
3103
|
+
|
|
3104
|
+
Returns:
|
|
3105
|
+
CrawlStatusResponse with:
|
|
3106
|
+
* Crawling status and progress
|
|
3107
|
+
* Crawled page contents
|
|
3108
|
+
* Success/error information
|
|
3109
|
+
|
|
3110
|
+
Raises:
|
|
3111
|
+
Exception: If crawl fails
|
|
3112
|
+
"""
|
|
3113
|
+
crawl_params = {}
|
|
3114
|
+
|
|
3115
|
+
# Add individual parameters
|
|
3116
|
+
if include_paths is not None:
|
|
3117
|
+
crawl_params['includePaths'] = include_paths
|
|
3118
|
+
if exclude_paths is not None:
|
|
3119
|
+
crawl_params['excludePaths'] = exclude_paths
|
|
3120
|
+
if max_depth is not None:
|
|
3121
|
+
crawl_params['maxDepth'] = max_depth
|
|
3122
|
+
if max_discovery_depth is not None:
|
|
3123
|
+
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
3124
|
+
if limit is not None:
|
|
3125
|
+
crawl_params['limit'] = limit
|
|
3126
|
+
if allow_backward_links is not None:
|
|
3127
|
+
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
3128
|
+
if allow_external_links is not None:
|
|
3129
|
+
crawl_params['allowExternalLinks'] = allow_external_links
|
|
3130
|
+
if ignore_sitemap is not None:
|
|
3131
|
+
crawl_params['ignoreSitemap'] = ignore_sitemap
|
|
3132
|
+
if scrape_options is not None:
|
|
3133
|
+
crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
|
|
3134
|
+
if webhook is not None:
|
|
3135
|
+
crawl_params['webhook'] = webhook
|
|
3136
|
+
if deduplicate_similar_urls is not None:
|
|
3137
|
+
crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
|
|
3138
|
+
if ignore_query_parameters is not None:
|
|
3139
|
+
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
|
|
3140
|
+
if regex_on_full_url is not None:
|
|
3141
|
+
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
3142
|
+
|
|
3143
|
+
# Add any additional kwargs
|
|
3144
|
+
crawl_params.update(kwargs)
|
|
3145
|
+
|
|
3146
|
+
# Create final params object
|
|
3147
|
+
final_params = CrawlParams(**crawl_params)
|
|
3148
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
3149
|
+
params_dict['url'] = url
|
|
3150
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
3151
|
+
|
|
3152
|
+
# Make request
|
|
3153
|
+
headers = self._prepare_headers(idempotency_key)
|
|
3154
|
+
response = await self._async_post_request(
|
|
3155
|
+
f'{self.api_url}/v1/crawl', params_dict, headers)
|
|
3156
|
+
|
|
3157
|
+
if response.get('success') and 'id' in response:
|
|
3158
|
+
try:
|
|
3159
|
+
id = response['id']
|
|
3160
|
+
except:
|
|
3161
|
+
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
3162
|
+
return await self._async_monitor_job_status(id, headers, poll_interval)
|
|
3163
|
+
else:
|
|
3164
|
+
raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
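A usage sketch for the blocking crawl above, with the same assumed client name; the metadata.sourceURL field used below is an assumption about the document payload.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.crawl_url(
        "https://example.com",
        include_paths=["/docs/*"],
        limit=10,
        poll_interval=2,
    )
    for page in result.get("data", []):
        # sourceURL is assumed to hold the page's original URL in its metadata.
        print(page.get("metadata", {}).get("sourceURL"))

asyncio.run(main())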
|
|
3165
|
+
|
|
3166
|
+
|
|
3167
|
+
async def async_crawl_url(
|
|
3168
|
+
self,
|
|
3169
|
+
url: str,
|
|
3170
|
+
*,
|
|
3171
|
+
include_paths: Optional[List[str]] = None,
|
|
3172
|
+
exclude_paths: Optional[List[str]] = None,
|
|
3173
|
+
max_depth: Optional[int] = None,
|
|
3174
|
+
max_discovery_depth: Optional[int] = None,
|
|
3175
|
+
limit: Optional[int] = None,
|
|
3176
|
+
allow_backward_links: Optional[bool] = None,
|
|
3177
|
+
allow_external_links: Optional[bool] = None,
|
|
3178
|
+
ignore_sitemap: Optional[bool] = None,
|
|
3179
|
+
scrape_options: Optional[CommonOptions] = None,
|
|
3180
|
+
webhook: Optional[Union[str, WebhookConfig]] = None,
|
|
3181
|
+
deduplicate_similar_urls: Optional[bool] = None,
|
|
3182
|
+
ignore_query_parameters: Optional[bool] = None,
|
|
3183
|
+
regex_on_full_url: Optional[bool] = None,
|
|
3184
|
+
idempotency_key: Optional[str] = None,
|
|
3185
|
+
**kwargs
|
|
3186
|
+
) -> CrawlResponse:
|
|
3187
|
+
"""
|
|
3188
|
+
Start an asynchronous crawl job.
|
|
3189
|
+
|
|
3190
|
+
Args:
|
|
3191
|
+
url (str): Target URL to start crawling from
|
|
3192
|
+
include_paths (Optional[List[str]]): Patterns of URLs to include
|
|
3193
|
+
exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
|
3194
|
+
max_depth (Optional[int]): Maximum crawl depth
|
|
3195
|
+
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
3196
|
+
limit (Optional[int]): Maximum pages to crawl
|
|
3197
|
+
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
3198
|
+
allow_external_links (Optional[bool]): Follow external domain links
|
|
3199
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
3200
|
+
scrape_options (Optional[CommonOptions]): Page scraping configuration
|
|
3201
|
+
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
|
3202
|
+
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
3203
|
+
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
3204
|
+
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
3205
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
3206
|
+
**kwargs: Additional parameters to pass to the API
|
|
3207
|
+
|
|
3208
|
+
Returns:
|
|
3209
|
+
CrawlResponse with:
|
|
3210
|
+
* success - Whether crawl started successfully
|
|
3211
|
+
* id - Unique identifier for the crawl job
|
|
3212
|
+
* url - Status check URL for the crawl
|
|
3213
|
+
* error - Error message if start failed
|
|
3214
|
+
|
|
3215
|
+
Raises:
|
|
3216
|
+
Exception: If crawl initiation fails
|
|
3217
|
+
"""
|
|
3218
|
+
crawl_params = {}
|
|
3219
|
+
|
|
3220
|
+
# Add individual parameters
|
|
3221
|
+
if include_paths is not None:
|
|
3222
|
+
crawl_params['includePaths'] = include_paths
|
|
3223
|
+
if exclude_paths is not None:
|
|
3224
|
+
crawl_params['excludePaths'] = exclude_paths
|
|
3225
|
+
if max_depth is not None:
|
|
3226
|
+
crawl_params['maxDepth'] = max_depth
|
|
3227
|
+
if max_discovery_depth is not None:
|
|
3228
|
+
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
3229
|
+
if limit is not None:
|
|
3230
|
+
crawl_params['limit'] = limit
|
|
3231
|
+
if allow_backward_links is not None:
|
|
3232
|
+
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
3233
|
+
if allow_external_links is not None:
|
|
3234
|
+
crawl_params['allowExternalLinks'] = allow_external_links
|
|
3235
|
+
if ignore_sitemap is not None:
|
|
3236
|
+
crawl_params['ignoreSitemap'] = ignore_sitemap
|
|
3237
|
+
if scrape_options is not None:
|
|
3238
|
+
crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
|
|
3239
|
+
if webhook is not None:
|
|
3240
|
+
crawl_params['webhook'] = webhook
|
|
3241
|
+
if deduplicate_similar_urls is not None:
|
|
3242
|
+
crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
|
|
3243
|
+
if ignore_query_parameters is not None:
|
|
3244
|
+
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
|
|
3245
|
+
if regex_on_full_url is not None:
|
|
3246
|
+
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
3247
|
+
|
|
3248
|
+
# Add any additional kwargs
|
|
3249
|
+
crawl_params.update(kwargs)
|
|
3250
|
+
|
|
3251
|
+
# Create final params object
|
|
3252
|
+
final_params = CrawlParams(**crawl_params)
|
|
3253
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
3254
|
+
params_dict['url'] = url
|
|
3255
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
3256
|
+
|
|
3257
|
+
# Make request
|
|
3258
|
+
headers = self._prepare_headers(idempotency_key)
|
|
3259
|
+
response = await self._async_post_request(
|
|
3260
|
+
f'{self.api_url}/v1/crawl',
|
|
3261
|
+
params_dict,
|
|
3262
|
+
headers
|
|
3263
|
+
)
|
|
3264
|
+
|
|
3265
|
+
if response.get('success'):
|
|
3266
|
+
try:
|
|
3267
|
+
return CrawlResponse(**response)
|
|
3268
|
+
except:
|
|
3269
|
+
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
3270
|
+
else:
|
|
3271
|
+
raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
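A sketch combining async_crawl_url with check_crawl_status (defined just below) to poll a job manually; the client class name and placeholder key are assumptions as before.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    job = await app.async_crawl_url("https://example.com", limit=25)
    print("crawl started:", job.id)
    # Poll the job yourself instead of blocking in crawl_url().
    while True:
        status = await app.check_crawl_status(job.id)
        print(f"{status.get('completed', 0)}/{status.get('total', 0)} pages")
        if status.get("status") in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(5)

asyncio.run(main())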
|
|
3272
|
+
|
|
3273
|
+
async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
|
|
3274
|
+
"""
|
|
3275
|
+
Check the status and results of an asynchronous crawl job.
|
|
3276
|
+
|
|
3277
|
+
Args:
|
|
3278
|
+
id (str): Unique identifier for the crawl job
|
|
3279
|
+
|
|
3280
|
+
Returns:
|
|
3281
|
+
CrawlStatusResponse containing:
|
|
3282
|
+
Status Information:
|
|
3283
|
+
* status - Current state (scraping/completed/failed/cancelled)
|
|
3284
|
+
* completed - Number of pages crawled
|
|
3285
|
+
* total - Total pages to crawl
|
|
3286
|
+
* creditsUsed - API credits consumed
|
|
3287
|
+
* expiresAt - Data expiration timestamp
|
|
3288
|
+
|
|
3289
|
+
Results:
|
|
3290
|
+
* data - List of crawled documents
|
|
3291
|
+
* next - URL for next page of results (if paginated)
|
|
3292
|
+
* success - Whether status check succeeded
|
|
3293
|
+
* error - Error message if failed
|
|
3294
|
+
|
|
3295
|
+
Raises:
|
|
3296
|
+
Exception: If status check fails
|
|
3297
|
+
"""
|
|
3298
|
+
headers = self._prepare_headers()
|
|
3299
|
+
endpoint = f'/v1/crawl/{id}'
|
|
3300
|
+
|
|
3301
|
+
status_data = await self._async_get_request(
|
|
3302
|
+
f'{self.api_url}{endpoint}',
|
|
3303
|
+
headers
|
|
3304
|
+
)
|
|
3305
|
+
|
|
3306
|
+
if status_data['status'] == 'completed':
|
|
3307
|
+
if 'data' in status_data:
|
|
3308
|
+
data = status_data['data']
|
|
3309
|
+
while 'next' in status_data:
|
|
3310
|
+
if len(status_data['data']) == 0:
|
|
3311
|
+
break
|
|
3312
|
+
next_url = status_data.get('next')
|
|
3313
|
+
if not next_url:
|
|
3314
|
+
logger.warning("Expected 'next' URL is missing.")
|
|
3315
|
+
break
|
|
3316
|
+
next_data = await self._async_get_request(next_url, headers)
|
|
3317
|
+
data.extend(next_data.get('data', []))
|
|
3318
|
+
status_data = next_data
|
|
3319
|
+
status_data['data'] = data
|
|
3320
|
+
|
|
3321
|
+
response = {
|
|
3322
|
+
'status': status_data.get('status'),
|
|
3323
|
+
'total': status_data.get('total'),
|
|
3324
|
+
'completed': status_data.get('completed'),
|
|
3325
|
+
'creditsUsed': status_data.get('creditsUsed'),
|
|
3326
|
+
'expiresAt': status_data.get('expiresAt'),
|
|
3327
|
+
'data': status_data.get('data')
|
|
3328
|
+
}
|
|
3329
|
+
|
|
3330
|
+
if 'error' in status_data:
|
|
3331
|
+
response['error'] = status_data['error']
|
|
3332
|
+
|
|
3333
|
+
if 'next' in status_data:
|
|
3334
|
+
response['next'] = status_data['next']
|
|
3335
|
+
|
|
3336
|
+
return {
|
|
3337
|
+
'success': 'error' not in status_data,
|
|
3338
|
+
**response
|
|
3339
|
+
}
|
|
3340
|
+
|
|
3341
|
+
async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
|
|
3342
|
+
"""
|
|
3343
|
+
Monitor the status of an asynchronous job until completion.
|
|
3344
|
+
|
|
3345
|
+
Args:
|
|
3346
|
+
id (str): The ID of the job to monitor
|
|
3347
|
+
headers (Dict[str, str]): Headers to include in status check requests
|
|
3348
|
+
poll_interval (int): Seconds between status checks (default: 2)
|
|
3349
|
+
|
|
3350
|
+
Returns:
|
|
3351
|
+
CrawlStatusResponse: The job results if completed successfully
|
|
3352
|
+
|
|
3353
|
+
Raises:
|
|
3354
|
+
Exception: If the job fails or an error occurs during status checks
|
|
3355
|
+
"""
|
|
3356
|
+
while True:
|
|
3357
|
+
status_data = await self._async_get_request(
|
|
3358
|
+
f'{self.api_url}/v1/crawl/{id}',
|
|
3359
|
+
headers
|
|
3360
|
+
)
|
|
3361
|
+
|
|
3362
|
+
if status_data['status'] == 'completed':
|
|
3363
|
+
if 'data' in status_data:
|
|
3364
|
+
data = status_data['data']
|
|
3365
|
+
while 'next' in status_data:
|
|
3366
|
+
if len(status_data['data']) == 0:
|
|
3367
|
+
break
|
|
3368
|
+
next_url = status_data.get('next')
|
|
3369
|
+
if not next_url:
|
|
3370
|
+
logger.warning("Expected 'next' URL is missing.")
|
|
3371
|
+
break
|
|
3372
|
+
next_data = await self._async_get_request(next_url, headers)
|
|
3373
|
+
data.extend(next_data.get('data', []))
|
|
3374
|
+
status_data = next_data
|
|
3375
|
+
status_data['data'] = data
|
|
3376
|
+
return status_data
|
|
3377
|
+
else:
|
|
3378
|
+
raise Exception('Job completed but no data was returned')
|
|
3379
|
+
elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
|
|
3380
|
+
await asyncio.sleep(max(poll_interval, 2))
|
|
3381
|
+
else:
|
|
3382
|
+
raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
|
|
3383
|
+
|
|
3384
|
+
async def map_url(
|
|
3385
|
+
self,
|
|
3386
|
+
url: str,
|
|
3387
|
+
params: Optional[MapParams] = None) -> MapResponse:
|
|
3388
|
+
"""
|
|
3389
|
+
Asynchronously map and discover links from a URL.
|
|
3390
|
+
|
|
3391
|
+
Args:
|
|
3392
|
+
url (str): Target URL to map
|
|
3393
|
+
params (Optional[MapParams]): See MapParams model:
|
|
3394
|
+
Discovery Options:
|
|
3395
|
+
* search - Filter pattern for URLs
|
|
3396
|
+
* ignoreSitemap - Skip sitemap.xml
|
|
3397
|
+
* includeSubdomains - Include subdomain links
|
|
3398
|
+
* sitemapOnly - Only use sitemap.xml
|
|
3399
|
+
|
|
3400
|
+
Limits:
|
|
3401
|
+
* limit - Max URLs to return
|
|
3402
|
+
* timeout - Request timeout (ms)
|
|
3403
|
+
|
|
3404
|
+
Returns:
|
|
3405
|
+
MapResponse with:
|
|
3406
|
+
* Discovered URLs
|
|
3407
|
+
* Success/error status
|
|
3408
|
+
|
|
3409
|
+
Raises:
|
|
3410
|
+
Exception: If mapping fails
|
|
3411
|
+
"""
|
|
3412
|
+
headers = self._prepare_headers()
|
|
3413
|
+
json_data = {'url': url}
|
|
3414
|
+
if params:
|
|
3415
|
+
json_data.update(params)
|
|
3416
|
+
json_data['origin'] = f"python-sdk@{version}"
|
|
3417
|
+
|
|
3418
|
+
endpoint = f'/v1/map'
|
|
3419
|
+
response = await self._async_post_request(
|
|
3420
|
+
f'{self.api_url}{endpoint}',
|
|
3421
|
+
json_data,
|
|
3422
|
+
headers
|
|
3423
|
+
)
|
|
3424
|
+
|
|
3425
|
+
if response.get('success') and 'links' in response:
|
|
3426
|
+
return response
|
|
3427
|
+
elif 'error' in response:
|
|
3428
|
+
raise Exception(f'Failed to map URL. Error: {response["error"]}')
|
|
3429
|
+
else:
|
|
3430
|
+
raise Exception(f'Failed to map URL. Error: {response}')
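A short sketch of the map call above, passing MapParams as a plain dict since the method merges it directly into the request body; the client class name is assumed.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.map_url("https://example.com", params={"search": "blog", "limit": 50})
    for link in result.get("links", []):
        print(link)

asyncio.run(main())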
|
|
3431
|
+
|
|
3432
|
+
async def extract(
|
|
3433
|
+
self,
|
|
3434
|
+
urls: List[str],
|
|
3435
|
+
params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
|
|
3436
|
+
"""
|
|
3437
|
+
Asynchronously extract structured information from URLs.
|
|
3438
|
+
|
|
3439
|
+
Args:
|
|
3440
|
+
urls (List[str]): URLs to extract from
|
|
3441
|
+
params (Optional[ExtractParams]): See ExtractParams model:
|
|
3442
|
+
Extraction Config:
|
|
3443
|
+
* prompt - Custom extraction prompt
|
|
3444
|
+
* schema - JSON schema/Pydantic model
|
|
3445
|
+
* systemPrompt - System context
|
|
3446
|
+
|
|
3447
|
+
Behavior Options:
|
|
3448
|
+
* allowExternalLinks - Follow external links
|
|
3449
|
+
* enableWebSearch - Enable web search
|
|
3450
|
+
* includeSubdomains - Include subdomains
|
|
3451
|
+
* showSources - Include source URLs
|
|
3452
|
+
|
|
3453
|
+
Scraping Options:
|
|
3454
|
+
* scrapeOptions - Page scraping config
|
|
3455
|
+
|
|
3456
|
+
Returns:
|
|
3457
|
+
ExtractResponse with:
|
|
3458
|
+
* Structured data matching schema
|
|
3459
|
+
* Source information if requested
|
|
3460
|
+
* Success/error status
|
|
3461
|
+
|
|
3462
|
+
Raises:
|
|
3463
|
+
ValueError: If prompt/schema missing or extraction fails
|
|
3464
|
+
"""
|
|
3465
|
+
headers = self._prepare_headers()
|
|
3466
|
+
|
|
3467
|
+
if not params or (not params.get('prompt') and not params.get('schema')):
|
|
3468
|
+
raise ValueError("Either prompt or schema is required")
|
|
3469
|
+
|
|
3470
|
+
schema = params.get('schema')
|
|
3471
|
+
if schema:
|
|
3472
|
+
if hasattr(schema, 'model_json_schema'):
|
|
3473
|
+
schema = schema.model_json_schema()
|
|
3474
|
+
|
|
3475
|
+
request_data = {
|
|
3476
|
+
'urls': urls,
|
|
3477
|
+
'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)),
|
|
3478
|
+
'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)),
|
|
3479
|
+
'showSources': params.get('show_sources', params.get('showSources', False)),
|
|
3480
|
+
'schema': schema,
|
|
3481
|
+
'origin': f'python-sdk@{version}'
|
|
3482
|
+
}
|
|
3483
|
+
|
|
3484
|
+
if params.get('prompt'):
|
|
3485
|
+
request_data['prompt'] = params['prompt']
|
|
3486
|
+
if params.get('system_prompt'):
|
|
3487
|
+
request_data['systemPrompt'] = params['system_prompt']
|
|
3488
|
+
elif params.get('systemPrompt'):
|
|
3489
|
+
request_data['systemPrompt'] = params['systemPrompt']
|
|
3490
|
+
|
|
3491
|
+
response = await self._async_post_request(
|
|
3492
|
+
f'{self.api_url}/v1/extract',
|
|
3493
|
+
request_data,
|
|
3494
|
+
headers
|
|
3495
|
+
)
|
|
3496
|
+
|
|
3497
|
+
if response.get('success'):
|
|
3498
|
+
job_id = response.get('id')
|
|
3499
|
+
if not job_id:
|
|
3500
|
+
raise Exception('Job ID not returned from extract request.')
|
|
3501
|
+
|
|
3502
|
+
while True:
|
|
3503
|
+
status_data = await self._async_get_request(
|
|
3504
|
+
f'{self.api_url}/v1/extract/{job_id}',
|
|
3505
|
+
headers
|
|
3506
|
+
)
|
|
3507
|
+
|
|
3508
|
+
if status_data['status'] == 'completed':
|
|
3509
|
+
return status_data
|
|
3510
|
+
elif status_data['status'] in ['failed', 'cancelled']:
|
|
3511
|
+
raise Exception(f'Extract job {status_data["status"]}. Error: {status_data.get("error")}')
|
|
3512
|
+
|
|
3513
|
+
await asyncio.sleep(2)
|
|
3514
|
+
else:
|
|
3515
|
+
raise Exception(f'Failed to extract. Error: {response.get("error")}')
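A hedged sketch of schema-based extraction with a Pydantic model, relying on the model_json_schema() conversion performed above; the client class name and example URL are assumptions.

import asyncio

from pydantic import BaseModel

from firecrawl import AsyncFirecrawlApp  # assumed export name

class Product(BaseModel):
    name: str
    price: str

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    # Pydantic models are converted to JSON schema before being sent.
    result = await app.extract(
        ["https://example.com/products"],
        params={"prompt": "Extract the product name and price.", "schema": Product},
    )
    print(result.get("data"))

asyncio.run(main())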
|
|
3516
|
+
|
|
3517
|
+
async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
|
|
3518
|
+
"""
|
|
3519
|
+
Check the status of an asynchronous batch scrape job.
|
|
3520
|
+
|
|
3521
|
+
Args:
|
|
3522
|
+
id (str): The ID of the batch scrape job
|
|
3523
|
+
|
|
3524
|
+
Returns:
|
|
3525
|
+
BatchScrapeStatusResponse containing:
|
|
3526
|
+
Status Information:
|
|
3527
|
+
* status - Current state (scraping/completed/failed/cancelled)
|
|
3528
|
+
* completed - Number of URLs scraped
|
|
3529
|
+
* total - Total URLs to scrape
|
|
3530
|
+
* creditsUsed - API credits consumed
|
|
3531
|
+
* expiresAt - Data expiration timestamp
|
|
3532
|
+
|
|
3533
|
+
Results:
|
|
3534
|
+
* data - List of scraped documents
|
|
3535
|
+
* next - URL for next page of results (if paginated)
|
|
3536
|
+
* success - Whether status check succeeded
|
|
3537
|
+
* error - Error message if failed
|
|
3538
|
+
|
|
3539
|
+
Raises:
|
|
3540
|
+
Exception: If status check fails
|
|
3541
|
+
"""
|
|
3542
|
+
headers = self._prepare_headers()
|
|
3543
|
+
endpoint = f'/v1/batch/scrape/{id}'
|
|
3544
|
+
|
|
3545
|
+
status_data = await self._async_get_request(
|
|
3546
|
+
f'{self.api_url}{endpoint}',
|
|
3547
|
+
headers
|
|
3548
|
+
)
|
|
3549
|
+
|
|
3550
|
+
if status_data['status'] == 'completed':
|
|
3551
|
+
if 'data' in status_data:
|
|
3552
|
+
data = status_data['data']
|
|
3553
|
+
while 'next' in status_data:
|
|
3554
|
+
if len(status_data['data']) == 0:
|
|
3555
|
+
break
|
|
3556
|
+
next_url = status_data.get('next')
|
|
3557
|
+
if not next_url:
|
|
3558
|
+
logger.warning("Expected 'next' URL is missing.")
|
|
3559
|
+
break
|
|
3560
|
+
next_data = await self._async_get_request(next_url, headers)
|
|
3561
|
+
data.extend(next_data.get('data', []))
|
|
3562
|
+
status_data = next_data
|
|
3563
|
+
status_data['data'] = data
|
|
3564
|
+
|
|
3565
|
+
response = {
|
|
3566
|
+
'status': status_data.get('status'),
|
|
3567
|
+
'total': status_data.get('total'),
|
|
3568
|
+
'completed': status_data.get('completed'),
|
|
3569
|
+
'creditsUsed': status_data.get('creditsUsed'),
|
|
3570
|
+
'expiresAt': status_data.get('expiresAt'),
|
|
3571
|
+
'data': status_data.get('data')
|
|
3572
|
+
}
|
|
3573
|
+
|
|
3574
|
+
if 'error' in status_data:
|
|
3575
|
+
response['error'] = status_data['error']
|
|
3576
|
+
|
|
3577
|
+
if 'next' in status_data:
|
|
3578
|
+
response['next'] = status_data['next']
|
|
3579
|
+
|
|
3580
|
+
return {
|
|
3581
|
+
'success': 'error' not in status_data,
|
|
3582
|
+
**response
|
|
3583
|
+
}
|
|
3584
|
+
|
|
3585
|
+
async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
|
|
3586
|
+
"""
|
|
3587
|
+
Get information about errors from an asynchronous batch scrape job.
|
|
3588
|
+
|
|
3589
|
+
Args:
|
|
3590
|
+
id (str): The ID of the batch scrape job
|
|
3591
|
+
|
|
3592
|
+
Returns:
|
|
3593
|
+
CrawlErrorsResponse containing:
|
|
3594
|
+
errors (List[Dict[str, str]]): List of errors with fields:
|
|
3595
|
+
* id (str): Error ID
|
|
3596
|
+
* timestamp (str): When the error occurred
|
|
3597
|
+
* url (str): URL that caused the error
|
|
3598
|
+
* error (str): Error message
|
|
3599
|
+
* robotsBlocked (List[str]): List of URLs blocked by robots.txt
|
|
3600
|
+
|
|
3601
|
+
Raises:
|
|
3602
|
+
Exception: If error check fails
|
|
3603
|
+
"""
|
|
3604
|
+
headers = self._prepare_headers()
|
|
3605
|
+
return await self._async_get_request(
|
|
3606
|
+
f'{self.api_url}/v1/batch/scrape/{id}/errors',
|
|
3607
|
+
headers
|
|
3608
|
+
)
|
|
3609
|
+
|
|
3610
|
+
async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
|
|
3611
|
+
"""
|
|
3612
|
+
Get information about errors from an asynchronous crawl job.
|
|
3613
|
+
|
|
3614
|
+
Args:
|
|
3615
|
+
id (str): The ID of the crawl job
|
|
3616
|
+
|
|
3617
|
+
Returns:
|
|
3618
|
+
CrawlErrorsResponse containing:
|
|
3619
|
+
* errors (List[Dict[str, str]]): List of errors with fields:
|
|
3620
|
+
- id (str): Error ID
|
|
3621
|
+
- timestamp (str): When the error occurred
|
|
3622
|
+
- url (str): URL that caused the error
|
|
3623
|
+
- error (str): Error message
|
|
3624
|
+
* robotsBlocked (List[str]): List of URLs blocked by robots.txt
|
|
3625
|
+
|
|
3626
|
+
Raises:
|
|
3627
|
+
Exception: If error check fails
|
|
3628
|
+
"""
|
|
3629
|
+
headers = self._prepare_headers()
|
|
3630
|
+
return await self._async_get_request(
|
|
3631
|
+
f'{self.api_url}/v1/crawl/{id}/errors',
|
|
3632
|
+
headers
|
|
3633
|
+
)
|
|
3634
|
+
|
|
3635
|
+
async def cancel_crawl(self, id: str) -> Dict[str, Any]:
|
|
3636
|
+
"""
|
|
3637
|
+
Cancel an asynchronous crawl job.
|
|
3638
|
+
|
|
3639
|
+
Args:
|
|
3640
|
+
id (str): The ID of the crawl job to cancel
|
|
3641
|
+
|
|
3642
|
+
Returns:
|
|
3643
|
+
Dict[str, Any] containing:
|
|
3644
|
+
* success (bool): Whether cancellation was successful
|
|
3645
|
+
* error (str, optional): Error message if cancellation failed
|
|
3646
|
+
|
|
3647
|
+
Raises:
|
|
3648
|
+
Exception: If cancellation fails
|
|
3649
|
+
"""
|
|
3650
|
+
headers = self._prepare_headers()
|
|
3651
|
+
async with aiohttp.ClientSession() as session:
|
|
3652
|
+
async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
|
|
3653
|
+
return await response.json()
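A brief sketch of cancelling a crawl that was started asynchronously; assumptions as in the earlier sketches, and the raw cancellation payload is printed rather than assuming its exact fields.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    job = await app.async_crawl_url("https://example.com", limit=100)
    result = await app.cancel_crawl(job.id)  # stop the crawl before it finishes
    print(result)

asyncio.run(main())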
|
|
3654
|
+
|
|
3655
|
+
async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
|
|
3656
|
+
"""
|
|
3657
|
+
Check the status of an asynchronous extraction job.
|
|
3658
|
+
|
|
3659
|
+
Args:
|
|
3660
|
+
job_id (str): The ID of the extraction job
|
|
3661
|
+
|
|
3662
|
+
Returns:
|
|
3663
|
+
ExtractResponse[Any] with:
|
|
3664
|
+
* success (bool): Whether request succeeded
|
|
3665
|
+
* data (Optional[Any]): Extracted data matching schema
|
|
3666
|
+
* error (Optional[str]): Error message if any
|
|
3667
|
+
* warning (Optional[str]): Warning message if any
|
|
3668
|
+
* sources (Optional[List[str]]): Source URLs if requested
|
|
3669
|
+
|
|
3670
|
+
Raises:
|
|
3671
|
+
ValueError: If status check fails
|
|
3672
|
+
"""
|
|
3673
|
+
headers = self._prepare_headers()
|
|
3674
|
+
try:
|
|
3675
|
+
return await self._async_get_request(
|
|
3676
|
+
f'{self.api_url}/v1/extract/{job_id}',
|
|
3677
|
+
headers
|
|
3678
|
+
)
|
|
3679
|
+
except Exception as e:
|
|
3680
|
+
raise ValueError(str(e))
|
|
3681
|
+
|
|
3682
|
+
async def async_extract(
|
|
3683
|
+
self,
|
|
3684
|
+
urls: Optional[List[str]] = None,
|
|
3685
|
+
*,
|
|
3686
|
+
prompt: Optional[str] = None,
|
|
3687
|
+
schema: Optional[Any] = None,
|
|
3688
|
+
system_prompt: Optional[str] = None,
|
|
3689
|
+
allow_external_links: Optional[bool] = False,
|
|
3690
|
+
enable_web_search: Optional[bool] = False,
|
|
3691
|
+
show_sources: Optional[bool] = False,
|
|
3692
|
+
agent: Optional[Dict[str, Any]] = None,
|
|
3693
|
+
idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
|
|
3694
|
+
"""
|
|
3695
|
+
Initiate an asynchronous extraction job without waiting for completion.
|
|
3696
|
+
|
|
3697
|
+
Args:
|
|
3698
|
+
urls (Optional[List[str]]): URLs to extract from
|
|
3699
|
+
prompt (Optional[str]): Custom extraction prompt
|
|
3700
|
+
schema (Optional[Any]): JSON schema/Pydantic model
|
|
3701
|
+
system_prompt (Optional[str]): System context
|
|
3702
|
+
allow_external_links (Optional[bool]): Follow external links
|
|
3703
|
+
enable_web_search (Optional[bool]): Enable web search
|
|
3704
|
+
show_sources (Optional[bool]): Include source URLs
|
|
3705
|
+
agent (Optional[Dict[str, Any]]): Agent configuration
|
|
3706
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
3707
|
+
|
|
3708
|
+
Returns:
|
|
3709
|
+
ExtractResponse[Any] with:
|
|
3710
|
+
* success (bool): Whether request succeeded
|
|
3711
|
+
* data (Optional[Any]): Extracted data matching schema
|
|
3712
|
+
* error (Optional[str]): Error message if any
|
|
3713
|
+
|
|
3714
|
+
Raises:
|
|
3715
|
+
ValueError: If job initiation fails
|
|
3716
|
+
"""
|
|
3717
|
+
headers = self._prepare_headers(idempotency_key)
|
|
3718
|
+
|
|
3719
|
+
if not prompt and not schema:
|
|
3720
|
+
raise ValueError("Either prompt or schema is required")
|
|
3721
|
+
|
|
3722
|
+
if not urls and not prompt:
|
|
3723
|
+
raise ValueError("Either urls or prompt is required")
|
|
3724
|
+
|
|
3725
|
+
if schema:
|
|
3726
|
+
if hasattr(schema, 'model_json_schema'):
|
|
3727
|
+
schema = schema.model_json_schema()
|
|
3728
|
+
|
|
3729
|
+
request_data = {
|
|
3730
|
+
'urls': urls or [],
|
|
3731
|
+
'allowExternalLinks': allow_external_links,
|
|
3732
|
+
'enableWebSearch': enable_web_search,
|
|
3733
|
+
'showSources': show_sources,
|
|
3734
|
+
'schema': schema,
|
|
3735
|
+
'origin': f'python-sdk@{version}'
|
|
3736
|
+
}
|
|
3737
|
+
|
|
3738
|
+
if prompt:
|
|
3739
|
+
request_data['prompt'] = prompt
|
|
3740
|
+
if system_prompt:
|
|
3741
|
+
request_data['systemPrompt'] = system_prompt
|
|
3742
|
+
if agent:
|
|
3743
|
+
request_data['agent'] = agent
|
|
3744
|
+
|
|
3745
|
+
try:
|
|
3746
|
+
return await self._async_post_request(
|
|
3747
|
+
f'{self.api_url}/v1/extract',
|
|
3748
|
+
request_data,
|
|
3749
|
+
headers
|
|
3750
|
+
)
|
|
3751
|
+
except Exception as e:
|
|
3752
|
+
raise ValueError(str(e))
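A sketch pairing async_extract with get_extract_status to poll a job by id; the raw responses are treated as dicts here, matching what the async request helpers return, and the client class name is assumed.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    job = await app.async_extract(
        ["https://example.com/pricing"],
        prompt="List every plan name mentioned on the page.",
    )
    job_id = job.get("id")
    # Poll until the extraction finishes or fails.
    while True:
        status = await app.get_extract_status(job_id)
        if status.get("status") in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(2)
    print(status.get("data"))

asyncio.run(main())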
|
|
3753
|
+
|
|
3754
|
+
async def generate_llms_text(
|
|
3755
|
+
self,
|
|
3756
|
+
url: str,
|
|
3757
|
+
*,
|
|
3758
|
+
max_urls: Optional[int] = None,
|
|
3759
|
+
show_full_text: Optional[bool] = None,
|
|
3760
|
+
experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
|
|
3761
|
+
"""
|
|
3762
|
+
Generate LLMs.txt for a given URL and monitor until completion.
|
|
3763
|
+
|
|
3764
|
+
Args:
|
|
3765
|
+
url (str): Target URL to generate LLMs.txt from
|
|
3766
|
+
max_urls (Optional[int]): Maximum URLs to process (default: 10)
|
|
3767
|
+
show_full_text (Optional[bool]): Include full text in output (default: False)
|
|
3768
|
+
experimental_stream (Optional[bool]): Enable experimental streaming
|
|
3769
|
+
|
|
3770
|
+
Returns:
|
|
3771
|
+
GenerateLLMsTextStatusResponse containing:
|
|
3772
|
+
* success (bool): Whether generation completed successfully
|
|
3773
|
+
* status (str): Status of generation (processing/completed/failed)
|
|
3774
|
+
* data (Dict[str, str], optional): Generated text with fields:
|
|
3775
|
+
- llmstxt (str): Generated LLMs.txt content
|
|
3776
|
+
- llmsfulltxt (str, optional): Full version if requested
|
|
3777
|
+
* error (str, optional): Error message if generation failed
|
|
3778
|
+
* expiresAt (str): When the generated data expires
|
|
3779
|
+
|
|
3780
|
+
Raises:
|
|
3781
|
+
Exception: If generation fails
|
|
3782
|
+
"""
|
|
3783
|
+
params = {}
|
|
3784
|
+
if max_urls is not None:
|
|
3785
|
+
params['maxUrls'] = max_urls
|
|
3786
|
+
if show_full_text is not None:
|
|
3787
|
+
params['showFullText'] = show_full_text
|
|
3788
|
+
if experimental_stream is not None:
|
|
3789
|
+
params['__experimental_stream'] = experimental_stream
|
|
3790
|
+
|
|
3791
|
+
response = await self.async_generate_llms_text(
|
|
3792
|
+
url,
|
|
3793
|
+
max_urls=max_urls,
|
|
3794
|
+
show_full_text=show_full_text,
|
|
3795
|
+
experimental_stream=experimental_stream
|
|
3796
|
+
)
|
|
3797
|
+
if not response.get('success') or 'id' not in response:
|
|
3798
|
+
return response
|
|
3799
|
+
|
|
3800
|
+
job_id = response['id']
|
|
3801
|
+
while True:
|
|
3802
|
+
status = await self.check_generate_llms_text_status(job_id)
|
|
3803
|
+
|
|
3804
|
+
if status['status'] == 'completed':
|
|
3805
|
+
return status
|
|
3806
|
+
elif status['status'] == 'failed':
|
|
3807
|
+
raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
|
|
3808
|
+
elif status['status'] != 'processing':
|
|
3809
|
+
break
|
|
3810
|
+
|
|
3811
|
+
await asyncio.sleep(2)
|
|
3812
|
+
|
|
3813
|
+
return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
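A sketch of the blocking LLMs.txt generation above; the data.llmstxt field comes from the docstring, while the client class name and key remain assumptions.

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.generate_llms_text("https://example.com", max_urls=10)
    if result.get("success"):
        print(result["data"]["llmstxt"][:500])

asyncio.run(main())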
|
|
3814
|
+
|
|
3815
|
+
+    async def async_generate_llms_text(
+        self,
+        url: str,
+        *,
+        max_urls: Optional[int] = None,
+        show_full_text: Optional[bool] = None,
+        experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
+        """
+        Initiate an asynchronous LLMs.txt generation job without waiting for completion.
+
+        Args:
+            url (str): Target URL to generate LLMs.txt from
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming
+
+        Returns:
+            GenerateLLMsTextResponse containing:
+            * success (bool): Whether job started successfully
+            * id (str): Unique identifier for the job
+            * error (str, optional): Error message if start failed
+
+        Raises:
+            ValueError: If job initiation fails
+        """
+        params = {}
+        if max_urls is not None:
+            params['maxUrls'] = max_urls
+        if show_full_text is not None:
+            params['showFullText'] = show_full_text
+        if experimental_stream is not None:
+            params['__experimental_stream'] = experimental_stream
+
+        headers = self._prepare_headers()
+        # params is a plain dict here, so merge it directly rather than calling .dict() on it.
+        json_data = {'url': url, **params}
+        json_data['origin'] = f"python-sdk@{version}"
+
+        try:
+            return await self._async_post_request(
+                f'{self.api_url}/v1/llmstxt',
+                json_data,
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
+    async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
+        """
+        Check the status of an asynchronous LLMs.txt generation job.
+
+        Args:
+            id (str): The ID of the generation job
+
+        Returns:
+            GenerateLLMsTextStatusResponse containing:
+            * success (bool): Whether generation completed successfully
+            * status (str): Status of generation (processing/completed/failed)
+            * data (Dict[str, str], optional): Generated text with fields:
+                - llmstxt (str): Generated LLMs.txt content
+                - llmsfulltxt (str, optional): Full version if requested
+            * error (str, optional): Error message if generation failed
+            * expiresAt (str): When the generated data expires
+
+        Raises:
+            ValueError: If status check fails
+        """
+        headers = self._prepare_headers()
+        try:
+            return await self._async_get_request(
+                f'{self.api_url}/v1/llmstxt/{id}',
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
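For callers that prefer to control polling themselves, here is a minimal sketch of the fire-and-poll pattern built from the two methods above. It uses only methods defined in this hunk; the 5-second interval is an arbitrary choice (the SDK's own waiter uses 2 seconds).

# Illustrative only -- not part of the diff.
import asyncio
from firecrawl.firecrawl import AsyncFirecrawlApp

async def poll_llms_text(app: AsyncFirecrawlApp, url: str) -> dict:
    # Start the job without waiting for it to finish.
    started = await app.async_generate_llms_text(url, max_urls=10)
    if not started.get("success"):
        return started
    # Poll until the job leaves the 'processing' state.
    while True:
        status = await app.check_generate_llms_text_status(started["id"])
        if status["status"] in ("completed", "failed"):
            return status
        await asyncio.sleep(5)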
+    async def deep_research(
+        self,
+        query: str,
+        *,
+        max_depth: Optional[int] = None,
+        time_limit: Optional[int] = None,
+        max_urls: Optional[int] = None,
+        analysis_prompt: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        __experimental_stream_steps: Optional[bool] = None,
+        on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
+        on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
+        """
+        Initiates a deep research operation on a given query and polls until completion.
+
+        Args:
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+            on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
+            on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
+
+        Returns:
+            DeepResearchStatusResponse containing:
+            * success (bool): Whether research completed successfully
+            * status (str): Current state (processing/completed/failed)
+            * error (Optional[str]): Error message if failed
+            * id (str): Unique identifier for the research job
+            * data (Any): Research findings and analysis
+            * sources (List[Dict]): List of discovered sources
+            * activities (List[Dict]): Research progress log
+            * summaries (List[str]): Generated research summaries
+
+        Raises:
+            Exception: If research fails
+        """
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)
+
+        response = await self.async_deep_research(
+            query,
+            max_depth=max_depth,
+            time_limit=time_limit,
+            max_urls=max_urls,
+            analysis_prompt=analysis_prompt,
+            system_prompt=system_prompt
+        )
+        if not response.get('success') or 'id' not in response:
+            return response
+
+        job_id = response['id']
+        last_activity_count = 0
+        last_source_count = 0
+
+        while True:
+            status = await self.check_deep_research_status(job_id)
+
+            if on_activity and 'activities' in status:
+                new_activities = status['activities'][last_activity_count:]
+                for activity in new_activities:
+                    on_activity(activity)
+                last_activity_count = len(status['activities'])
+
+            if on_source and 'sources' in status:
+                new_sources = status['sources'][last_source_count:]
+                for source in new_sources:
+                    on_source(source)
+                last_source_count = len(status['sources'])
+
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise Exception(f'Deep research failed. Error: {status.get("error")}')
+            elif status['status'] != 'processing':
+                break
+
+            await asyncio.sleep(2)
+
+        return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
+
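A sketch of how the callback hooks above might be wired up (illustrative, not part of the diff). The callbacks are called synchronously from inside the polling loop, so plain functions suffice; the payload keys and the AsyncFirecrawlApp constructor argument follow the docstring and earlier example and should be treated as assumptions where not shown in this hunk.

# Illustrative only -- not part of the diff.
import asyncio
from firecrawl.firecrawl import AsyncFirecrawlApp

def on_activity(activity: dict) -> None:
    # Keys per the docstring: type, status, message, timestamp, depth.
    print(f"[{activity.get('type')}] {activity.get('message')}")

def on_source(source: dict) -> None:
    print(f"source: {source.get('url')}")

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # constructor args assumed
    result = await app.deep_research(
        "What are the latest developments in web crawling?",
        max_depth=3,
        time_limit=120,
        on_activity=on_activity,
        on_source=on_source,
    )
    print(result.get("status"))

asyncio.run(main())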
+    async def async_deep_research(
+        self,
+        query: str,
+        *,
+        max_depth: Optional[int] = None,
+        time_limit: Optional[int] = None,
+        max_urls: Optional[int] = None,
+        analysis_prompt: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
+        """
+        Initiates an asynchronous deep research operation.
+
+        Args:
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+
+        Returns:
+            Dict[str, Any]: A response containing:
+            * success (bool): Whether the research initiation was successful
+            * id (str): The unique identifier for the research job
+            * error (str, optional): Error message if initiation failed
+
+        Raises:
+            ValueError: If the research initiation fails.
+        """
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)
+
+        headers = self._prepare_headers()
+
+        json_data = {'query': query, **research_params.dict(exclude_none=True)}
+        json_data['origin'] = f"python-sdk@{version}"
+
+        try:
+            return await self._async_post_request(
+                f'{self.api_url}/v1/deep-research',
+                json_data,
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
+    async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
+        """
+        Check the status of a deep research operation.
+
+        Args:
+            id (str): The ID of the deep research operation.
+
+        Returns:
+            DeepResearchStatusResponse containing:
+
+            Status:
+            * success - Whether research completed successfully
+            * status - Current state (processing/completed/failed)
+            * error - Error message if failed
+
+            Results:
+            * id - Unique identifier for the research job
+            * data - Research findings and analysis
+            * sources - List of discovered sources
+            * activities - Research progress log
+            * summaries - Generated research summaries
+
+        Raises:
+            ValueError: If the status check fails.
+        """
+        headers = self._prepare_headers()
+        try:
+            return await self._async_get_request(
+                f'{self.api_url}/v1/deep-research/{id}',
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
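The same fire-and-poll pattern applies to deep research. A brief sketch under the same assumptions as the LLMs.txt example, using only methods defined in this hunk:

# Illustrative only -- not part of the diff.
import asyncio
from firecrawl.firecrawl import AsyncFirecrawlApp

async def poll_deep_research(app: AsyncFirecrawlApp, query: str) -> dict:
    # Kick off the research job and return early if it failed to start.
    started = await app.async_deep_research(query, max_depth=2)
    if not started.get("success"):
        return started
    # Poll the status endpoint until the job finishes one way or the other.
    while True:
        status = await app.check_deep_research_status(started["id"])
        if status["status"] in ("completed", "failed"):
            return status
        await asyncio.sleep(5)  # arbitrary interval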
+    async def search(
+        self,
+        query: str,
+        *,
+        limit: Optional[int] = None,
+        tbs: Optional[str] = None,
+        filter: Optional[str] = None,
+        lang: Optional[str] = None,
+        country: Optional[str] = None,
+        location: Optional[str] = None,
+        timeout: Optional[int] = None,
+        scrape_options: Optional[CommonOptions] = None,
+        params: Optional[Union[Dict[str, Any], SearchParams]] = None,
+        **kwargs) -> SearchResponse:
+        """
+        Asynchronously search for content using Firecrawl.
+
+        Args:
+            query (str): Search query string
+            limit (Optional[int]): Max results (default: 5)
+            tbs (Optional[str]): Time filter (e.g. "qdr:d")
+            filter (Optional[str]): Custom result filter
+            lang (Optional[str]): Language code (default: "en")
+            country (Optional[str]): Country code (default: "us")
+            location (Optional[str]): Geo-targeting
+            timeout (Optional[int]): Request timeout in milliseconds
+            scrape_options (Optional[CommonOptions]): Result scraping configuration
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
+            **kwargs: Additional keyword arguments for future compatibility
+
+        Returns:
+            SearchResponse: Response containing:
+            * success (bool): Whether request succeeded
+            * data (List[FirecrawlDocument]): Search results
+            * warning (Optional[str]): Warning message if any
+            * error (Optional[str]): Error message if any
+
+        Raises:
+            Exception: If search fails or response cannot be parsed
+        """
+        # Build search parameters
+        search_params = {}
+        if params:
+            if isinstance(params, dict):
+                search_params.update(params)
+            else:
+                search_params.update(params.dict(exclude_none=True))
+
+        # Add individual parameters
+        if limit is not None:
+            search_params['limit'] = limit
+        if tbs is not None:
+            search_params['tbs'] = tbs
+        if filter is not None:
+            search_params['filter'] = filter
+        if lang is not None:
+            search_params['lang'] = lang
+        if country is not None:
+            search_params['country'] = country
+        if location is not None:
+            search_params['location'] = location
+        if timeout is not None:
+            search_params['timeout'] = timeout
+        if scrape_options is not None:
+            search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+
+        # Add any additional kwargs
+        search_params.update(kwargs)
+
+        # Create final params object
+        final_params = SearchParams(query=query, **search_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        return await self._async_post_request(
+            f"{self.api_url}/v1/search",
+            params_dict,
+            {"Authorization": f"Bearer {self.api_key}"}
+        )
+
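A sketch of calling the async search method above (illustrative, not part of the diff). The response is treated as the parsed JSON dict described in the Returns section; the AsyncFirecrawlApp constructor argument is assumed as before.

# Illustrative only -- not part of the diff.
import asyncio
from firecrawl.firecrawl import AsyncFirecrawlApp

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # constructor args assumed
    results = await app.search("firecrawl python sdk", limit=3, lang="en", country="us")
    for doc in results.get("data", []):
        # Each entry is one search result document; see the Returns section above.
        print(doc)

asyncio.run(main())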
+class AsyncCrawlWatcher(CrawlWatcher):
+    """
+    Async version of CrawlWatcher that properly handles async operations.
+    """
+    def __init__(self, id: str, app: AsyncFirecrawlApp):
+        super().__init__(id, app)
+
+    async def connect(self) -> None:
+        """
+        Establishes async WebSocket connection and starts listening for messages.
+        """
+        async with websockets.connect(
+            self.ws_url,
+            additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
+        ) as websocket:
+            await self._listen(websocket)
+
+    async def _listen(self, websocket) -> None:
+        """
+        Listens for incoming WebSocket messages and handles them asynchronously.
+
+        Args:
+            websocket: The WebSocket connection object
+        """
+        async for message in websocket:
+            msg = json.loads(message)
+            await self._handle_message(msg)
+
+    async def _handle_message(self, msg: Dict[str, Any]) -> None:
+        """
+        Handles incoming WebSocket messages based on their type asynchronously.
+
+        Args:
+            msg (Dict[str, Any]): The message to handle
+        """
+        if msg['type'] == 'done':
+            self.status = 'completed'
+            self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
+        elif msg['type'] == 'error':
+            self.status = 'failed'
+            self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
+        elif msg['type'] == 'catchup':
+            self.status = msg['data']['status']
+            self.data.extend(msg['data'].get('data', []))
+            for doc in self.data:
+                self.dispatch_event('document', {'data': doc, 'id': self.id})
+        elif msg['type'] == 'document':
+            self.data.append(msg['data'])
+            self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
+
+    async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
+        """
+        Handle errors from async API responses.
+        """
+        try:
+            error_data = await response.json()
+            error_message = error_data.get('error', 'No error message provided.')
+            error_details = error_data.get('details', 'No additional error details provided.')
+        except Exception:
+            raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
+
+        # Use the app's method to get the error message
+        message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
+
+        raise aiohttp.ClientError(message)
+
+    async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
+        """
+        Generate a standardized error message based on HTTP status code for async operations.
+
+        Args:
+            status_code (int): The HTTP status code from the response
+            action (str): Description of the action that was being performed
+            error_message (str): The error message from the API response
+            error_details (str): Additional error details from the API response
+
+        Returns:
+            str: A formatted error message
+        """
+        return self._get_error_message(status_code, action, error_message, error_details)
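Finally, a sketch of wiring up the AsyncCrawlWatcher defined above (illustrative, not part of the diff). It assumes the event-listener registration helper add_event_listener inherited from the parent CrawlWatcher class and an async_crawl_url method that starts a crawl job and returns its id; both live outside this hunk, so treat those names and return shapes as assumptions.

# Illustrative only -- not part of the diff. add_event_listener() and
# async_crawl_url() are assumed to exist on the parent class and the app,
# respectively, as they are not shown in this hunk.
import asyncio
from firecrawl.firecrawl import AsyncFirecrawlApp, AsyncCrawlWatcher

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # constructor args assumed
    started = await app.async_crawl_url("https://example.com")  # assumed to return the crawl job id
    watcher = AsyncCrawlWatcher(started["id"], app)
    # Event detail shapes follow _handle_message above: {'data': ..., 'id': ...}.
    watcher.add_event_listener("document", lambda detail: print("doc:", detail["data"]))
    watcher.add_event_listener("done", lambda detail: print("crawl finished"))
    await watcher.connect()  # listens until a 'done' or 'error' message arrives

asyncio.run(main())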