firecrawl 1.17.0__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic.
- firecrawl/__init__.py +2 -2
- firecrawl/firecrawl.py +3338 -416
- {firecrawl-1.17.0.dist-info → firecrawl-2.0.1.dist-info}/METADATA +3 -2
- firecrawl-2.0.1.dist-info/RECORD +12 -0
- firecrawl-1.17.0.dist-info/RECORD +0 -12
- {firecrawl-1.17.0.dist-info → firecrawl-2.0.1.dist-info}/LICENSE +0 -0
- {firecrawl-1.17.0.dist-info → firecrawl-2.0.1.dist-info}/WHEEL +0 -0
- {firecrawl-1.17.0.dist-info → firecrawl-2.0.1.dist-info}/top_level.txt +0 -0
firecrawl/firecrawl.py
CHANGED
@@ -12,15 +12,294 @@ Classes:
 import logging
 import os
 import time
-from typing import Any, Dict, Optional, List, Union, Callable
+from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic
 import json
-
+from datetime import datetime
+import re
+import warnings
 import requests
 import pydantic
 import websockets
+import aiohttp
+import asyncio
+from pydantic import Field
+
+# Suppress Pydantic warnings about attribute shadowing
+warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
+warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
+warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractConfig\" shadows an attribute in parent \"BaseModel\"")
+warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
+
+
+def get_version():
+    try:
+        from pathlib import Path
+        package_path = os.path.dirname(__file__)
+        version_file = Path(os.path.join(package_path, '__init__.py')).read_text()
+        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
+        if version_match:
+            return version_match.group(1).strip()
+    except Exception:
+        print("Failed to get version from __init__.py")
+    return None
+
+version = get_version()
 
 logger : logging.Logger = logging.getLogger("firecrawl")
 
+T = TypeVar('T')
+
+# class FirecrawlDocumentMetadata(pydantic.BaseModel):
+#     """Metadata for a Firecrawl document."""
+#     title: Optional[str] = None
+#     description: Optional[str] = None
+#     language: Optional[str] = None
+#     keywords: Optional[str] = None
+#     robots: Optional[str] = None
+#     ogTitle: Optional[str] = None
+#     ogDescription: Optional[str] = None
+#     ogUrl: Optional[str] = None
+#     ogImage: Optional[str] = None
+#     ogAudio: Optional[str] = None
+#     ogDeterminer: Optional[str] = None
+#     ogLocale: Optional[str] = None
+#     ogLocaleAlternate: Optional[List[str]] = None
+#     ogSiteName: Optional[str] = None
+#     ogVideo: Optional[str] = None
+#     dctermsCreated: Optional[str] = None
+#     dcDateCreated: Optional[str] = None
+#     dcDate: Optional[str] = None
+#     dctermsType: Optional[str] = None
+#     dcType: Optional[str] = None
+#     dctermsAudience: Optional[str] = None
+#     dctermsSubject: Optional[str] = None
+#     dcSubject: Optional[str] = None
+#     dcDescription: Optional[str] = None
+#     dctermsKeywords: Optional[str] = None
+#     modifiedTime: Optional[str] = None
+#     publishedTime: Optional[str] = None
+#     articleTag: Optional[str] = None
+#     articleSection: Optional[str] = None
+#     sourceURL: Optional[str] = None
+#     statusCode: Optional[int] = None
+#     error: Optional[str] = None
+
+
+class AgentOptions(pydantic.BaseModel):
+    """Configuration for the agent."""
+    model: Literal["FIRE-1"] = "FIRE-1"
+    prompt: Optional[str] = None
+
+class AgentOptionsExtract(pydantic.BaseModel):
+    """Configuration for the agent in extract operations."""
+    model: Literal["FIRE-1"] = "FIRE-1"
+
+class ActionsResult(pydantic.BaseModel):
+    """Result of actions performed during scraping."""
+    screenshots: List[str]
+
+class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
+    """Document retrieved or processed by Firecrawl."""
+    url: Optional[str] = None
+    markdown: Optional[str] = None
+    html: Optional[str] = None
+    rawHtml: Optional[str] = None
+    links: Optional[List[str]] = None
+    extract: Optional[T] = None
+    json: Optional[T] = None
+    screenshot: Optional[str] = None
+    metadata: Optional[Any] = None
+    actions: Optional[ActionsResult] = None
+    title: Optional[str] = None  # v1 search only
+    description: Optional[str] = None  # v1 search only
+
+class LocationConfig(pydantic.BaseModel):
+    """Location configuration for scraping."""
+    country: Optional[str] = None
+    languages: Optional[List[str]] = None
+
+class WebhookConfig(pydantic.BaseModel):
+    """Configuration for webhooks."""
+    url: str
+    headers: Optional[Dict[str, str]] = None
+    metadata: Optional[Dict[str, str]] = None
+    events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
+
+class CommonOptions(pydantic.BaseModel):
+    """Parameters for scraping operations."""
+    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None
+    headers: Optional[Dict[str, str]] = None
+    includeTags: Optional[List[str]] = None
+    excludeTags: Optional[List[str]] = None
+    onlyMainContent: Optional[bool] = None
+    waitFor: Optional[int] = None
+    timeout: Optional[int] = None
+    location: Optional[LocationConfig] = None
+    mobile: Optional[bool] = None
+    skipTlsVerification: Optional[bool] = None
+    removeBase64Images: Optional[bool] = None
+    blockAds: Optional[bool] = None
+    proxy: Optional[Literal["basic", "stealth"]] = None
+
+class WaitAction(pydantic.BaseModel):
+    """Wait action to perform during scraping."""
+    type: Literal["wait"]
+    milliseconds: int
+    selector: Optional[str] = None
+
+class ScreenshotAction(pydantic.BaseModel):
+    """Screenshot action to perform during scraping."""
+    type: Literal["screenshot"]
+    fullPage: Optional[bool] = None
+
+class ClickAction(pydantic.BaseModel):
+    """Click action to perform during scraping."""
+    type: Literal["click"]
+    selector: str
+
+class WriteAction(pydantic.BaseModel):
+    """Write action to perform during scraping."""
+    type: Literal["write"]
+    text: str
+
+class PressAction(pydantic.BaseModel):
+    """Press action to perform during scraping."""
+    type: Literal["press"]
+    key: str
+
+class ScrollAction(pydantic.BaseModel):
+    """Scroll action to perform during scraping."""
+    type: Literal["scroll"]
+    direction: Literal["up", "down"]
+    selector: Optional[str] = None
+
+class ScrapeAction(pydantic.BaseModel):
+    """Scrape action to perform during scraping."""
+    type: Literal["scrape"]
+
+class ExecuteJavascriptAction(pydantic.BaseModel):
+    """Execute javascript action to perform during scraping."""
+    type: Literal["executeJavascript"]
+    script: str
+
+
+class ExtractAgent(pydantic.BaseModel):
+    """Configuration for the agent in extract operations."""
+    model: Literal["FIRE-1"] = "FIRE-1"
+
+class ExtractConfig(pydantic.BaseModel):
+    """Configuration for extraction."""
+    prompt: Optional[str] = None
+    schema: Optional[Any] = None
+    systemPrompt: Optional[str] = None
+    agent: Optional[ExtractAgent] = None
+
+class ScrapeParams(CommonOptions):
+    """Parameters for scraping operations."""
+    extract: Optional[ExtractConfig] = None
+    jsonOptions: Optional[ExtractConfig] = None
+    actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
+    agent: Optional[AgentOptions] = None
+
+class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
+    """Response from scraping operations."""
+    success: bool = True
+    warning: Optional[str] = None
+    error: Optional[str] = None
+
+class BatchScrapeResponse(pydantic.BaseModel):
+    """Response from batch scrape operations."""
+    id: Optional[str] = None
+    url: Optional[str] = None
+    success: bool = True
+    error: Optional[str] = None
+    invalidURLs: Optional[List[str]] = None
+
+class BatchScrapeStatusResponse(pydantic.BaseModel):
+    """Response from batch scrape status checks."""
+    success: bool = True
+    status: Literal["scraping", "completed", "failed", "cancelled"]
+    completed: int
+    total: int
+    creditsUsed: int
+    expiresAt: datetime
+    next: Optional[str] = None
+    data: List[FirecrawlDocument]
+
+class CrawlParams(pydantic.BaseModel):
+    """Parameters for crawling operations."""
+    includePaths: Optional[List[str]] = None
+    excludePaths: Optional[List[str]] = None
+    maxDepth: Optional[int] = None
+    maxDiscoveryDepth: Optional[int] = None
+    limit: Optional[int] = None
+    allowBackwardLinks: Optional[bool] = None
+    allowExternalLinks: Optional[bool] = None
+    ignoreSitemap: Optional[bool] = None
+    scrapeOptions: Optional[CommonOptions] = None
+    webhook: Optional[Union[str, WebhookConfig]] = None
+    deduplicateSimilarURLs: Optional[bool] = None
+    ignoreQueryParameters: Optional[bool] = None
+    regexOnFullURL: Optional[bool] = None
+
+class CrawlResponse(pydantic.BaseModel):
+    """Response from crawling operations."""
+    id: Optional[str] = None
+    url: Optional[str] = None
+    success: bool = True
+    error: Optional[str] = None
+
+class CrawlStatusResponse(pydantic.BaseModel):
+    """Response from crawl status checks."""
+    success: bool = True
+    status: Literal["scraping", "completed", "failed", "cancelled"]
+    completed: int
+    total: int
+    creditsUsed: int
+    expiresAt: datetime
+    next: Optional[str] = None
+    data: List[FirecrawlDocument]
+
+class CrawlErrorsResponse(pydantic.BaseModel):
+    """Response from crawl/batch scrape error monitoring."""
+    errors: List[Dict[str, str]]  # {id: str, timestamp: str, url: str, error: str}
+    robotsBlocked: List[str]
+
+class MapParams(pydantic.BaseModel):
+    """Parameters for mapping operations."""
+    search: Optional[str] = None
+    ignoreSitemap: Optional[bool] = None
+    includeSubdomains: Optional[bool] = None
+    sitemapOnly: Optional[bool] = None
+    limit: Optional[int] = None
+    timeout: Optional[int] = None
+
+class MapResponse(pydantic.BaseModel):
+    """Response from mapping operations."""
+    success: bool = True
+    links: Optional[List[str]] = None
+    error: Optional[str] = None
+
+class ExtractParams(pydantic.BaseModel):
+    """Parameters for extracting information from URLs."""
+    prompt: Optional[str] = None
+    schema: Optional[Any] = None
+    systemPrompt: Optional[str] = None
+    allowExternalLinks: Optional[bool] = None
+    enableWebSearch: Optional[bool] = None
+    includeSubdomains: Optional[bool] = None
+    origin: Optional[str] = None
+    showSources: Optional[bool] = None
+    scrapeOptions: Optional[CommonOptions] = None
+
+class ExtractResponse(pydantic.BaseModel, Generic[T]):
+    """Response from extract operations."""
+    success: bool = True
+    data: Optional[T] = None
+    error: Optional[str] = None
+    warning: Optional[str] = None
+    sources: Optional[List[str]] = None
+
 class SearchParams(pydantic.BaseModel):
     query: str
     limit: Optional[int] = 5
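Note: the models added in this hunk replace the loose parameter dictionaries used in 1.17.0 with typed Pydantic models. Below is a minimal sketch (not part of the diff) of how these models compose, assuming direct imports from the firecrawl.firecrawl module shown above; the field values are illustrative only.

from firecrawl.firecrawl import (
    ExtractConfig, LocationConfig, ScrapeParams, ScreenshotAction, WaitAction,
)

# Typed scrape configuration instead of a raw params dict (illustrative values).
options = ScrapeParams(
    formats=["markdown", "screenshot"],
    onlyMainContent=True,
    location=LocationConfig(country="us", languages=["en"]),
    actions=[
        WaitAction(type="wait", milliseconds=1000),
        ScreenshotAction(type="screenshot", fullPage=True),
    ],
    jsonOptions=ExtractConfig(prompt="Summarize the page"),
)

# .dict(exclude_none=True) mirrors how the SDK serializes these models
# before sending them to the API.
payload = options.dict(exclude_none=True)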
@@ -31,7 +310,14 @@ class SearchParams(pydantic.BaseModel):
     location: Optional[str] = None
     origin: Optional[str] = "api"
     timeout: Optional[int] = 60000
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[CommonOptions] = None
+
+class SearchResponse(pydantic.BaseModel):
+    """Response from search operations."""
+    success: bool = True
+    data: List[FirecrawlDocument]
+    warning: Optional[str] = None
+    error: Optional[str] = None
 
 class GenerateLLMsTextParams(pydantic.BaseModel):
     """
@@ -75,6 +361,24 @@ class DeepResearchStatusResponse(pydantic.BaseModel):
     sources: List[Dict[str, Any]]
     summaries: List[str]
 
+class GenerateLLMsTextResponse(pydantic.BaseModel):
+    """Response from LLMs.txt generation operations."""
+    success: bool = True
+    id: str
+    error: Optional[str] = None
+
+class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
+    llmstxt: str
+    llmsfulltxt: Optional[str] = None
+
+class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
+    """Status response from LLMs.txt generation operations."""
+    success: bool = True
+    data: Optional[GenerateLLMsTextStatusResponseData] = None
+    status: Literal["processing", "completed", "failed"]
+    error: Optional[str] = None
+    expiresAt: str
+
 class ChangeTrackingData(pydantic.BaseModel):
     """
     Data for the change tracking format.
@@ -84,42 +388,39 @@ class ChangeTrackingData(pydantic.BaseModel):
     visibility: str  # "visible" | "hidden"
     diff: Optional[Dict[str, Any]] = None
     json: Optional[Any] = None
+
+class SearchResponse(pydantic.BaseModel):
+    """
+    Response from the search operation.
+    """
+    success: bool
+    data: List[Dict[str, Any]]
+    warning: Optional[str] = None
+    error: Optional[str] = None
 
-class
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    show_sources: Optional[bool] = False
-    agent: Optional[Dict[str, Any]] = None
-
-
-
-
-class ExtractResponse(pydantic.BaseModel):
-    """
-    Response from the extract operation.
-    """
-    success: bool
-    data: Optional[Any] = None
-    error: Optional[str] = None
+class ExtractParams(pydantic.BaseModel):
+    """
+    Parameters for the extract operation.
+    """
+    prompt: Optional[str] = None
+    schema: Optional[Any] = pydantic.Field(None, alias='schema')
+    system_prompt: Optional[str] = None
+    allow_external_links: Optional[bool] = False
+    enable_web_search: Optional[bool] = False
+    # Just for backwards compatibility
+    enableWebSearch: Optional[bool] = False
+    show_sources: Optional[bool] = False
+    agent: Optional[Dict[str, Any]] = None
+
+class ExtractResponse(pydantic.BaseModel, Generic[T]):
+    """
+    Response from the extract operation.
+    """
+    success: bool
+    data: Optional[T] = None
+    error: Optional[str] = None
 
+class FirecrawlApp:
     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         """
         Initialize the FirecrawlApp instance with API key, API URL.
@@ -138,200 +439,451 @@ class FirecrawlApp:
 
         logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
 
-    def scrape_url(
+    def scrape_url(
+            self,
+            url: str,
+            *,
+            formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+            include_tags: Optional[List[str]] = None,
+            exclude_tags: Optional[List[str]] = None,
+            only_main_content: Optional[bool] = None,
+            wait_for: Optional[int] = None,
+            timeout: Optional[int] = None,
+            location: Optional[LocationConfig] = None,
+            mobile: Optional[bool] = None,
+            skip_tls_verification: Optional[bool] = None,
+            remove_base64_images: Optional[bool] = None,
+            block_ads: Optional[bool] = None,
+            proxy: Optional[Literal["basic", "stealth"]] = None,
+            extract: Optional[ExtractConfig] = None,
+            json_options: Optional[ExtractConfig] = None,
+            actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+            **kwargs) -> ScrapeResponse[Any]:
         """
-        Scrape
+        Scrape and extract content from a URL.
 
         Args:
-
-
+            url (str): Target URL to scrape
+            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
+            include_tags (Optional[List[str]]): HTML tags to include
+            exclude_tags (Optional[List[str]]): HTML tags to exclude
+            only_main_content (Optional[bool]): Extract main content only
+            wait_for (Optional[int]): Wait for a specific element to appear
+            timeout (Optional[int]): Request timeout (ms)
+            location (Optional[LocationConfig]): Location configuration
+            mobile (Optional[bool]): Use mobile user agent
+            skip_tls_verification (Optional[bool]): Skip TLS verification
+            remove_base64_images (Optional[bool]): Remove base64 images
+            block_ads (Optional[bool]): Block ads
+            proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
+            extract (Optional[ExtractConfig]): Content extraction settings
+            json_options (Optional[ExtractConfig]): JSON extraction settings
+            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
+
 
         Returns:
-
+            ScrapeResponse with:
+            * Requested content formats
+            * Page metadata
+            * Extraction results
+            * Success/error status
 
         Raises:
-
+            Exception: If scraping fails
         """
-
         headers = self._prepare_headers()
 
-        #
-        scrape_params = {
-
-
-
-        # Handle extract (for v1)
-        extract = params.get('extract', {})
-        if extract:
-            if 'schema' in extract and hasattr(extract['schema'], 'schema'):
-                extract['schema'] = extract['schema'].schema()
-            scrape_params['extract'] = extract
-
-        # Include any other params directly at the top level of scrape_params
-        for key, value in params.items():
-            if key not in ['extract']:
-                scrape_params[key] = value
-
-        json = params.get("jsonOptions", {})
-        if json:
-            if 'schema' in json and hasattr(json['schema'], 'schema'):
-                json['schema'] = json['schema'].schema()
-            scrape_params['jsonOptions'] = json
-
-        change_tracking = params.get("changeTrackingOptions", {})
-        if change_tracking:
-            scrape_params['changeTrackingOptions'] = change_tracking
-
-        # Include any other params directly at the top level of scrape_params
-        for key, value in params.items():
-            if key not in ['jsonOptions', 'changeTrackingOptions', 'agent']:
-                scrape_params[key] = value
-
-        agent = params.get('agent')
-        if agent:
-            scrape_params['agent'] = agent
-
+        # Build scrape parameters
+        scrape_params = {
+            'url': url,
+            'origin': f"python-sdk@{version}"
+        }
 
-
-
+        # Add optional parameters if provided
+        if formats:
+            scrape_params['formats'] = formats
+        if include_tags:
+            scrape_params['includeTags'] = include_tags
+        if exclude_tags:
+            scrape_params['excludeTags'] = exclude_tags
+        if only_main_content is not None:
+            scrape_params['onlyMainContent'] = only_main_content
+        if wait_for:
+            scrape_params['waitFor'] = wait_for
+        if timeout:
+            scrape_params['timeout'] = timeout
+        if location:
+            scrape_params['location'] = location.dict(exclude_none=True)
+        if mobile is not None:
+            scrape_params['mobile'] = mobile
+        if skip_tls_verification is not None:
+            scrape_params['skipTlsVerification'] = skip_tls_verification
+        if remove_base64_images is not None:
+            scrape_params['removeBase64Images'] = remove_base64_images
+        if block_ads is not None:
+            scrape_params['blockAds'] = block_ads
+        if proxy:
+            scrape_params['proxy'] = proxy
+        if extract:
+            if hasattr(extract.schema, 'schema'):
+                extract.schema = extract.schema.schema()
+            scrape_params['extract'] = extract.dict(exclude_none=True)
+        if json_options:
+            if hasattr(json_options.schema, 'schema'):
+                json_options.schema = json_options.schema.schema()
+            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+        if actions:
+            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+        scrape_params.update(kwargs)
+
+        # Make request
         response = requests.post(
-            f'{self.api_url}
+            f'{self.api_url}/v1/scrape',
             headers=headers,
             json=scrape_params,
-            timeout=(
+            timeout=(timeout + 5000 if timeout else None)
         )
+
         if response.status_code == 200:
             try:
-
-
-
-
-
-
-
-
-                raise Exception(
+                response_json = response.json()
+                if response_json.get('success') and 'data' in response_json:
+                    return ScrapeResponse(**response_json['data'])
+                elif "error" in response_json:
+                    raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
+                else:
+                    raise Exception(f'Failed to scrape URL. Error: {response_json}')
+            except ValueError:
+                raise Exception('Failed to parse Firecrawl response as JSON.')
         else:
             self._handle_error(response, 'scrape URL')
 
-    def search(
+    def search(
+            self,
+            query: str,
+            *,
+            limit: Optional[int] = None,
+            tbs: Optional[str] = None,
+            filter: Optional[str] = None,
+            lang: Optional[str] = None,
+            country: Optional[str] = None,
+            location: Optional[str] = None,
+            timeout: Optional[int] = None,
+            scrape_options: Optional[CommonOptions] = None,
+            params: Optional[Union[Dict[str, Any], SearchParams]] = None,
+            **kwargs) -> SearchResponse:
         """
-        Search for content using
+        Search for content using Firecrawl.
 
         Args:
-            query (str):
-
+            query (str): Search query string
+            limit (Optional[int]): Max results (default: 5)
+            tbs (Optional[str]): Time filter (e.g. "qdr:d")
+            filter (Optional[str]): Custom result filter
+            lang (Optional[str]): Language code (default: "en")
+            country (Optional[str]): Country code (default: "us")
+            location (Optional[str]): Geo-targeting
+            timeout (Optional[int]): Request timeout in milliseconds
+            scrape_options (Optional[CommonOptions]): Result scraping configuration
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
+            **kwargs: Additional keyword arguments for future compatibility
 
         Returns:
-
+            SearchResponse: Response containing:
+            * success (bool): Whether request succeeded
+            * data (List[FirecrawlDocument]): Search results
+            * warning (Optional[str]): Warning message if any
+            * error (Optional[str]): Error message if any
+
+        Raises:
+            Exception: If search fails or response cannot be parsed
         """
-
-
+        # Build search parameters
+        search_params = {}
+        if params:
+            if isinstance(params, dict):
+                search_params.update(params)
+            else:
+                search_params.update(params.dict(exclude_none=True))
+
+        # Add individual parameters
+        if limit is not None:
+            search_params['limit'] = limit
+        if tbs is not None:
+            search_params['tbs'] = tbs
+        if filter is not None:
+            search_params['filter'] = filter
+        if lang is not None:
+            search_params['lang'] = lang
+        if country is not None:
+            search_params['country'] = country
+        if location is not None:
+            search_params['location'] = location
+        if timeout is not None:
+            search_params['timeout'] = timeout
+        if scrape_options is not None:
+            search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+
+        # Add any additional kwargs
+        search_params.update(kwargs)
 
-
-
-
-
-        search_params.query = query
+        # Create final params object
+        final_params = SearchParams(query=query, **search_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['origin'] = f"python-sdk@{version}"
 
+        # Make request
         response = requests.post(
             f"{self.api_url}/v1/search",
             headers={"Authorization": f"Bearer {self.api_key}"},
-            json=
+            json=params_dict
         )
 
-        if response.status_code
-
-
-
-
-
-
-
-
-
-
-
+        if response.status_code == 200:
+            try:
+                response_json = response.json()
+                if response_json.get('success') and 'data' in response_json:
+                    return SearchResponse(**response_json)
+                elif "error" in response_json:
+                    raise Exception(f'Search failed. Error: {response_json["error"]}')
+                else:
+                    raise Exception(f'Search failed. Error: {response_json}')
+            except ValueError:
+                raise Exception('Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, 'search')
+
+    def crawl_url(
+            self,
+            url: str,
+            *,
+            include_paths: Optional[List[str]] = None,
+            exclude_paths: Optional[List[str]] = None,
+            max_depth: Optional[int] = None,
+            max_discovery_depth: Optional[int] = None,
+            limit: Optional[int] = None,
+            allow_backward_links: Optional[bool] = None,
+            allow_external_links: Optional[bool] = None,
+            ignore_sitemap: Optional[bool] = None,
+            scrape_options: Optional[CommonOptions] = None,
+            webhook: Optional[Union[str, WebhookConfig]] = None,
+            deduplicate_similar_urls: Optional[bool] = None,
+            ignore_query_parameters: Optional[bool] = None,
+            regex_on_full_url: Optional[bool] = None,
+            poll_interval: Optional[int] = 2,
+            idempotency_key: Optional[str] = None,
+            **kwargs
+    ) -> CrawlStatusResponse:
         """
-
+        Crawl a website starting from a URL.
 
         Args:
-            url (str):
-
-
-
+            url (str): Target URL to start crawling from
+            include_paths (Optional[List[str]]): Patterns of URLs to include
+            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
+            max_depth (Optional[int]): Maximum crawl depth
+            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
+            limit (Optional[int]): Maximum pages to crawl
+            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_external_links (Optional[bool]): Follow external domain links
+            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
+            scrape_options (Optional[CommonOptions]): Page scraping configuration
+            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
+            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
+            ignore_query_parameters (Optional[bool]): Ignore URL parameters
+            regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            poll_interval (Optional[int]): Seconds between status checks (default: 2)
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API
 
         Returns:
-
-
-
-
-            - 'total' (int): Total number of scraped pages.
-            - 'creditsUsed' (int): Estimated number of API credits used for this crawl.
-            - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the crawl data expires.
-            - 'data' (List[Dict]): List of all the scraped pages.
+            CrawlStatusResponse with:
+            * Crawling status and progress
+            * Crawled page contents
+            * Success/error information
 
         Raises:
-            Exception: If
+            Exception: If crawl fails
         """
-
+        crawl_params = {}
+
+        # Add individual parameters
+        if include_paths is not None:
+            crawl_params['includePaths'] = include_paths
+        if exclude_paths is not None:
+            crawl_params['excludePaths'] = exclude_paths
+        if max_depth is not None:
+            crawl_params['maxDepth'] = max_depth
+        if max_discovery_depth is not None:
+            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
+        if limit is not None:
+            crawl_params['limit'] = limit
+        if allow_backward_links is not None:
+            crawl_params['allowBackwardLinks'] = allow_backward_links
+        if allow_external_links is not None:
+            crawl_params['allowExternalLinks'] = allow_external_links
+        if ignore_sitemap is not None:
+            crawl_params['ignoreSitemap'] = ignore_sitemap
+        if scrape_options is not None:
+            crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+        if webhook is not None:
+            crawl_params['webhook'] = webhook
+        if deduplicate_similar_urls is not None:
+            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
+        if ignore_query_parameters is not None:
+            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
+        if regex_on_full_url is not None:
+            crawl_params['regexOnFullURL'] = regex_on_full_url
+
+        # Add any additional kwargs
+        crawl_params.update(kwargs)
+
+        # Create final params object
+        final_params = CrawlParams(**crawl_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
         headers = self._prepare_headers(idempotency_key)
-
-
-        json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
+
         if response.status_code == 200:
             try:
                 id = response.json().get('id')
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
             return self._monitor_job_status(id, headers, poll_interval)
-
         else:
             self._handle_error(response, 'start crawl job')
 
-
-
+    def async_crawl_url(
+            self,
+            url: str,
+            *,
+            include_paths: Optional[List[str]] = None,
+            exclude_paths: Optional[List[str]] = None,
+            max_depth: Optional[int] = None,
+            max_discovery_depth: Optional[int] = None,
+            limit: Optional[int] = None,
+            allow_backward_links: Optional[bool] = None,
+            allow_external_links: Optional[bool] = None,
+            ignore_sitemap: Optional[bool] = None,
+            scrape_options: Optional[CommonOptions] = None,
+            webhook: Optional[Union[str, WebhookConfig]] = None,
+            deduplicate_similar_urls: Optional[bool] = None,
+            ignore_query_parameters: Optional[bool] = None,
+            regex_on_full_url: Optional[bool] = None,
+            idempotency_key: Optional[str] = None,
+            **kwargs
+    ) -> CrawlResponse:
         """
-
+        Start an asynchronous crawl job.
 
         Args:
-            url (str):
-
-
+            url (str): Target URL to start crawling from
+            include_paths (Optional[List[str]]): Patterns of URLs to include
+            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
+            max_depth (Optional[int]): Maximum crawl depth
+            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
+            limit (Optional[int]): Maximum pages to crawl
+            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_external_links (Optional[bool]): Follow external domain links
+            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
+            scrape_options (Optional[CommonOptions]): Page scraping configuration
+            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
+            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
+            ignore_query_parameters (Optional[bool]): Ignore URL parameters
+            regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API
 
         Returns:
-
-
-
-
+            CrawlResponse with:
+            * success - Whether crawl started successfully
+            * id - Unique identifier for the crawl job
+            * url - Status check URL for the crawl
+            * error - Error message if start failed
+
+        Raises:
+            Exception: If crawl initiation fails
         """
-
+        crawl_params = {}
+
+        # Add individual parameters
+        if include_paths is not None:
+            crawl_params['includePaths'] = include_paths
+        if exclude_paths is not None:
+            crawl_params['excludePaths'] = exclude_paths
+        if max_depth is not None:
+            crawl_params['maxDepth'] = max_depth
+        if max_discovery_depth is not None:
+            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
+        if limit is not None:
+            crawl_params['limit'] = limit
+        if allow_backward_links is not None:
+            crawl_params['allowBackwardLinks'] = allow_backward_links
+        if allow_external_links is not None:
+            crawl_params['allowExternalLinks'] = allow_external_links
+        if ignore_sitemap is not None:
+            crawl_params['ignoreSitemap'] = ignore_sitemap
+        if scrape_options is not None:
+            crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+        if webhook is not None:
+            crawl_params['webhook'] = webhook
+        if deduplicate_similar_urls is not None:
+            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
+        if ignore_query_parameters is not None:
+            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
+        if regex_on_full_url is not None:
+            crawl_params['regexOnFullURL'] = regex_on_full_url
+
+        # Add any additional kwargs
+        crawl_params.update(kwargs)
+
+        # Create final params object
+        final_params = CrawlParams(**crawl_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
         headers = self._prepare_headers(idempotency_key)
-
-
-        json_data.update(params)
-        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
+
         if response.status_code == 200:
             try:
-                return response.json()
+                return CrawlResponse(**response.json())
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
             self._handle_error(response, 'start crawl job')
 
-    def check_crawl_status(self, id: str) ->
+    def check_crawl_status(self, id: str) -> CrawlStatusResponse:
         """
-        Check the status of a crawl job
+        Check the status and results of a crawl job.
 
         Args:
-            id
+            id: Unique identifier for the crawl job
 
         Returns:
-
+            CrawlStatusResponse containing:
+
+            Status Information:
+            * status - Current state (scraping/completed/failed/cancelled)
+            * completed - Number of pages crawled
+            * total - Total pages to crawl
+            * creditsUsed - API credits consumed
+            * expiresAt - Data expiration timestamp
+
+            Results:
+            * data - List of crawled documents
+            * next - URL for next page of results (if paginated)
+            * success - Whether status check succeeded
+            * error - Error message if failed
 
         Raises:
-            Exception: If
+            Exception: If status check fails
         """
         endpoint = f'/v1/crawl/{id}'
 
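Note: in the hunk above, scrape_url, search and crawl_url now take keyword arguments directly instead of a params dict and return typed response models. A minimal usage sketch (not part of the diff), based only on the signatures shown in this release; the API key and URLs are placeholders:

from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# Keyword arguments replace the old params dict.
page = app.scrape_url(
    "https://example.com",
    formats=["markdown", "links"],
    only_main_content=True,
)
print(page.markdown)

# search returns a SearchResponse; crawl_url polls until the job finishes
# and returns a CrawlStatusResponse.
results = app.search("firecrawl python sdk", limit=3)
print(len(results.data), "search results")

crawl = app.crawl_url("https://example.com", limit=5, poll_interval=2)
print(crawl.status, crawl.completed, "/", crawl.total)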
@@ -383,28 +935,37 @@ class FirecrawlApp:
         if 'next' in status_data:
             response['next'] = status_data['next']
 
-        return
-
+        return CrawlStatusResponse(
+            success=False if 'error' in status_data else True,
             **response
-
+        )
         else:
             self._handle_error(response, 'check crawl status')
 
-    def check_crawl_errors(self, id: str) ->
+    def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
         """
         Returns information about crawl errors.
 
         Args:
-            id (str): The ID of the crawl job
+            id (str): The ID of the crawl job
 
         Returns:
-
+            CrawlErrorsResponse containing:
+            * errors (List[Dict[str, str]]): List of errors with fields:
+                - id (str): Error ID
+                - timestamp (str): When the error occurred
+                - url (str): URL that caused the error
+                - error (str): Error message
+            * robotsBlocked (List[str]): List of URLs blocked by robots.txt
+
+        Raises:
+            Exception: If error check fails
         """
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
         if response.status_code == 200:
             try:
-                return response.json()
+                return CrawlErrorsResponse(**response.json())
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
@@ -412,13 +973,18 @@ class FirecrawlApp:
 
     def cancel_crawl(self, id: str) -> Dict[str, Any]:
         """
-        Cancel an asynchronous crawl job
+        Cancel an asynchronous crawl job.
 
         Args:
-            id (str): The ID of the crawl job to cancel
+            id (str): The ID of the crawl job to cancel
 
         Returns:
-            Dict[str, Any]:
+            Dict[str, Any] containing:
+            * success (bool): Whether cancellation was successful
+            * error (str, optional): Error message if cancellation failed
+
+        Raises:
+            Exception: If cancellation fails
         """
         headers = self._prepare_headers()
         response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
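Note: the hunks above give typed results to the job-management helpers (check_crawl_status, check_crawl_errors, cancel_crawl). A minimal sketch (not part of the diff) of monitoring a crawl started with async_crawl_url; the API key and URL are placeholders:

from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
job = app.async_crawl_url("https://example.com", limit=10)  # returns CrawlResponse

status = app.check_crawl_status(job.id)   # CrawlStatusResponse
print(status.status, status.completed, "/", status.total)

errors = app.check_crawl_errors(job.id)   # CrawlErrorsResponse
print(len(errors.errors), "errors,", len(errors.robotsBlocked), "blocked by robots.txt")

cancelled = app.cancel_crawl(job.id)      # plain dict with a 'success' flag
print(cancelled.get("success"))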
@@ -430,154 +996,524 @@ class FirecrawlApp:
|
|
|
430
996
|
else:
|
|
431
997
|
self._handle_error(response, "cancel crawl job")
|
|
432
998
|
|
|
433
|
-
def crawl_url_and_watch(
|
|
999
|
+
def crawl_url_and_watch(
|
|
1000
|
+
self,
|
|
1001
|
+
url: str,
|
|
1002
|
+
*,
|
|
1003
|
+
include_paths: Optional[List[str]] = None,
|
|
1004
|
+
exclude_paths: Optional[List[str]] = None,
|
|
1005
|
+
max_depth: Optional[int] = None,
|
|
1006
|
+
max_discovery_depth: Optional[int] = None,
|
|
1007
|
+
limit: Optional[int] = None,
|
|
1008
|
+
allow_backward_links: Optional[bool] = None,
|
|
1009
|
+
allow_external_links: Optional[bool] = None,
|
|
1010
|
+
ignore_sitemap: Optional[bool] = None,
|
|
1011
|
+
scrape_options: Optional[CommonOptions] = None,
|
|
1012
|
+
webhook: Optional[Union[str, WebhookConfig]] = None,
|
|
1013
|
+
deduplicate_similar_urls: Optional[bool] = None,
|
|
1014
|
+
ignore_query_parameters: Optional[bool] = None,
|
|
1015
|
+
regex_on_full_url: Optional[bool] = None,
|
|
1016
|
+
idempotency_key: Optional[str] = None,
|
|
1017
|
+
**kwargs
|
|
1018
|
+
) -> 'CrawlWatcher':
|
|
434
1019
|
"""
|
|
435
1020
|
Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
|
|
436
1021
|
|
|
437
1022
|
Args:
|
|
438
|
-
url (str):
|
|
439
|
-
|
|
440
|
-
|
|
1023
|
+
url (str): Target URL to start crawling from
|
|
1024
|
+
include_paths (Optional[List[str]]): Patterns of URLs to include
|
|
1025
|
+
exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
|
1026
|
+
max_depth (Optional[int]): Maximum crawl depth
|
|
1027
|
+
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
1028
|
+
limit (Optional[int]): Maximum pages to crawl
|
|
1029
|
+
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
1030
|
+
allow_external_links (Optional[bool]): Follow external domain links
|
|
1031
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
1032
|
+
scrape_options (Optional[CommonOptions]): Page scraping configuration
|
|
1033
|
+
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
|
1034
|
+
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
1035
|
+
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
1036
|
+
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
1037
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1038
|
+
**kwargs: Additional parameters to pass to the API
|
|
441
1039
|
|
|
442
1040
|
Returns:
|
|
443
|
-
CrawlWatcher: An instance
|
|
1041
|
+
CrawlWatcher: An instance to monitor the crawl job via WebSocket
|
|
1042
|
+
|
|
1043
|
+
Raises:
|
|
1044
|
+
Exception: If crawl job fails to start
|
|
444
1045
|
"""
|
|
445
|
-
crawl_response = self.async_crawl_url(
|
|
446
|
-
|
|
447
|
-
|
|
1046
|
+
crawl_response = self.async_crawl_url(
|
|
1047
|
+
url,
|
|
1048
|
+
include_paths=include_paths,
|
|
1049
|
+
exclude_paths=exclude_paths,
|
|
1050
|
+
max_depth=max_depth,
|
|
1051
|
+
max_discovery_depth=max_discovery_depth,
|
|
1052
|
+
limit=limit,
|
|
1053
|
+
allow_backward_links=allow_backward_links,
|
|
1054
|
+
allow_external_links=allow_external_links,
|
|
1055
|
+
ignore_sitemap=ignore_sitemap,
|
|
1056
|
+
scrape_options=scrape_options,
|
|
1057
|
+
webhook=webhook,
|
|
1058
|
+
deduplicate_similar_urls=deduplicate_similar_urls,
|
|
1059
|
+
ignore_query_parameters=ignore_query_parameters,
|
|
1060
|
+
regex_on_full_url=regex_on_full_url,
|
|
1061
|
+
idempotency_key=idempotency_key,
|
|
1062
|
+
**kwargs
|
|
1063
|
+
)
|
|
1064
|
+
if crawl_response.success and crawl_response.id:
|
|
1065
|
+
return CrawlWatcher(crawl_response.id, self)
|
|
448
1066
|
else:
|
|
449
1067
|
raise Exception("Crawl job failed to start")
|
|
450
1068
|
|
|
451
|
-
def map_url(
|
|
1069
|
+
def map_url(
|
|
1070
|
+
self,
|
|
1071
|
+
url: str,
|
|
1072
|
+
*,
|
|
1073
|
+
search: Optional[str] = None,
|
|
1074
|
+
ignore_sitemap: Optional[bool] = None,
|
|
1075
|
+
include_subdomains: Optional[bool] = None,
|
|
1076
|
+
sitemap_only: Optional[bool] = None,
|
|
1077
|
+
limit: Optional[int] = None,
|
|
1078
|
+
timeout: Optional[int] = None,
|
|
1079
|
+
params: Optional[MapParams] = None) -> MapResponse:
|
|
452
1080
|
"""
|
|
453
|
-
|
|
1081
|
+
Map and discover links from a URL.
|
|
454
1082
|
|
|
455
1083
|
Args:
|
|
456
|
-
url (str):
|
|
457
|
-
|
|
1084
|
+
url (str): Target URL to map
|
|
1085
|
+
search (Optional[str]): Filter pattern for URLs
|
|
1086
|
+
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
1087
|
+
include_subdomains (Optional[bool]): Include subdomain links
|
|
1088
|
+
sitemap_only (Optional[bool]): Only use sitemap.xml
|
|
1089
|
+
limit (Optional[int]): Maximum URLs to return
|
|
1090
|
+
timeout (Optional[int]): Request timeout in milliseconds
|
|
1091
|
+
params (Optional[MapParams]): Additional mapping parameters
|
|
458
1092
|
|
|
459
1093
|
Returns:
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
1094
|
+
MapResponse: Response containing:
|
|
1095
|
+
* success (bool): Whether request succeeded
|
|
1096
|
+
* links (List[str]): Discovered URLs
|
|
1097
|
+
* error (Optional[str]): Error message if any
|
|
464
1098
|
|
|
465
|
-
|
|
466
|
-
|
|
1099
|
+
Raises:
|
|
1100
|
+
Exception: If mapping fails or response cannot be parsed
|
|
1101
|
+
"""
|
|
1102
|
+
# Build map parameters
|
|
1103
|
+
map_params = {}
|
|
467
1104
|
if params:
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
#
|
|
1105
|
+
map_params.update(params.dict(exclude_none=True))
|
|
1106
|
+
|
|
1107
|
+
# Add individual parameters
|
|
1108
|
+
if search is not None:
|
|
1109
|
+
map_params['search'] = search
|
|
1110
|
+
if ignore_sitemap is not None:
|
|
1111
|
+
map_params['ignoreSitemap'] = ignore_sitemap
|
|
1112
|
+
if include_subdomains is not None:
|
|
1113
|
+
map_params['includeSubdomains'] = include_subdomains
|
|
1114
|
+
if sitemap_only is not None:
|
|
1115
|
+
map_params['sitemapOnly'] = sitemap_only
|
|
1116
|
+
if limit is not None:
|
|
1117
|
+
map_params['limit'] = limit
|
|
1118
|
+
if timeout is not None:
|
|
1119
|
+
map_params['timeout'] = timeout
|
|
1120
|
+
|
|
1121
|
+
# Create final params object
|
|
1122
|
+
final_params = MapParams(**map_params)
|
|
1123
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
1124
|
+
params_dict['url'] = url
|
|
1125
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
1126
|
+
|
|
1127
|
+
# Make request
|
|
471
1128
|
response = requests.post(
|
|
472
|
-
f
|
|
473
|
-
headers=
|
|
474
|
-
json=
|
|
1129
|
+
f"{self.api_url}/v1/map",
|
|
1130
|
+
headers={"Authorization": f"Bearer {self.api_key}"},
|
|
1131
|
+
json=params_dict
|
|
475
1132
|
)
|
|
1133
|
+
|
|
476
1134
|
if response.status_code == 200:
|
|
477
1135
|
try:
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
raise Exception(
|
|
1136
|
+
response_json = response.json()
|
|
1137
|
+
if response_json.get('success') and 'links' in response_json:
|
|
1138
|
+
return MapResponse(**response_json)
|
|
1139
|
+
elif "error" in response_json:
|
|
1140
|
+
raise Exception(f'Map failed. Error: {response_json["error"]}')
|
|
1141
|
+
else:
|
|
1142
|
+
raise Exception(f'Map failed. Error: {response_json}')
|
|
1143
|
+
except ValueError:
|
|
1144
|
+
raise Exception('Failed to parse Firecrawl response as JSON.')
|
|
487
1145
|
else:
|
|
488
1146
|
self._handle_error(response, 'map')
|
|
489
1147
|
|
|
490
|
-
def batch_scrape_urls(
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
1148
|
+
def batch_scrape_urls(
|
|
1149
|
+
self,
|
|
1150
|
+
urls: List[str],
|
|
1151
|
+
*,
|
|
1152
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
1153
|
+
headers: Optional[Dict[str, str]] = None,
|
|
1154
|
+
include_tags: Optional[List[str]] = None,
|
|
1155
|
+
exclude_tags: Optional[List[str]] = None,
|
|
1156
|
+
only_main_content: Optional[bool] = None,
|
|
1157
|
+
wait_for: Optional[int] = None,
|
|
1158
|
+
timeout: Optional[int] = None,
|
|
1159
|
+
location: Optional[LocationConfig] = None,
|
|
1160
|
+
mobile: Optional[bool] = None,
|
|
1161
|
+
skip_tls_verification: Optional[bool] = None,
|
|
1162
|
+
remove_base64_images: Optional[bool] = None,
|
|
1163
|
+
block_ads: Optional[bool] = None,
|
|
1164
|
+
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
1165
|
+
extract: Optional[ExtractConfig] = None,
|
|
1166
|
+
json_options: Optional[ExtractConfig] = None,
|
|
1167
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1168
|
+
agent: Optional[AgentOptions] = None,
|
|
1169
|
+
poll_interval: Optional[int] = 2,
|
|
1170
|
+
idempotency_key: Optional[str] = None,
|
|
1171
|
+
**kwargs
|
|
1172
|
+
) -> BatchScrapeStatusResponse:
|
|
494
1173
|
"""
|
|
495
|
-
|
|
1174
|
+
Batch scrape multiple URLs and monitor until completion.
|
|
496
1175
|
|
|
497
1176
|
Args:
|
|
498
|
-
urls (List[str]):
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
1177
|
+
urls (List[str]): URLs to scrape
|
|
1178
|
+
formats (Optional[List[Literal]]): Content formats to retrieve
|
|
1179
|
+
headers (Optional[Dict[str, str]]): Custom HTTP headers
|
|
1180
|
+
include_tags (Optional[List[str]]): HTML tags to include
|
|
1181
|
+
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
1182
|
+
only_main_content (Optional[bool]): Extract main content only
|
|
1183
|
+
wait_for (Optional[int]): Wait time in milliseconds
|
|
1184
|
+
timeout (Optional[int]): Request timeout in milliseconds
|
|
1185
|
+
location (Optional[LocationConfig]): Location configuration
|
|
1186
|
+
mobile (Optional[bool]): Use mobile user agent
|
|
1187
|
+
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
1188
|
+
remove_base64_images (Optional[bool]): Remove base64 encoded images
|
|
1189
|
+
block_ads (Optional[bool]): Block advertisements
|
|
1190
|
+
proxy (Optional[Literal]): Proxy type to use
|
|
1191
|
+
extract (Optional[ExtractConfig]): Content extraction config
|
|
1192
|
+
json_options (Optional[ExtractConfig]): JSON extraction config
|
|
1193
|
+
actions (Optional[List[Union]]): Actions to perform
|
|
1194
|
+
agent (Optional[AgentOptions]): Agent configuration
|
|
1195
|
+
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
1196
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1197
|
+
**kwargs: Additional parameters to pass to the API
|
|
502
1198
|
|
|
503
1199
|
Returns:
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
- 'total' (int): Total number of scraped pages.
|
|
509
|
-
- 'creditsUsed' (int): Estimated number of API credits used for this batch scrape.
|
|
510
|
-
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires.
|
|
511
|
-
- 'data' (List[Dict]): List of all the scraped pages.
|
|
1200
|
+
BatchScrapeStatusResponse with:
|
|
1201
|
+
* Scraping status and progress
|
|
1202
|
+
* Scraped content for each URL
|
|
1203
|
+
* Success/error information
|
|
512
1204
|
|
|
513
1205
|
Raises:
|
|
514
|
-
Exception: If
|
|
1206
|
+
Exception: If batch scrape fails
|
|
515
1207
|
"""
|
|
516
|
-
|
|
1208
|
+
scrape_params = {}
|
|
1209
|
+
|
|
1210
|
+
# Add individual parameters
|
|
1211
|
+
if formats is not None:
|
|
1212
|
+
scrape_params['formats'] = formats
|
|
1213
|
+
if headers is not None:
|
|
1214
|
+
scrape_params['headers'] = headers
|
|
1215
|
+
if include_tags is not None:
|
|
1216
|
+
scrape_params['includeTags'] = include_tags
|
|
1217
|
+
if exclude_tags is not None:
|
|
1218
|
+
scrape_params['excludeTags'] = exclude_tags
|
|
1219
|
+
if only_main_content is not None:
|
|
1220
|
+
scrape_params['onlyMainContent'] = only_main_content
|
|
1221
|
+
if wait_for is not None:
|
|
1222
|
+
scrape_params['waitFor'] = wait_for
|
|
1223
|
+
if timeout is not None:
|
|
1224
|
+
scrape_params['timeout'] = timeout
|
|
1225
|
+
if location is not None:
|
|
1226
|
+
scrape_params['location'] = location.dict(exclude_none=True)
|
|
1227
|
+
if mobile is not None:
|
|
1228
|
+
scrape_params['mobile'] = mobile
|
|
1229
|
+
if skip_tls_verification is not None:
|
|
1230
|
+
scrape_params['skipTlsVerification'] = skip_tls_verification
|
|
1231
|
+
if remove_base64_images is not None:
|
|
1232
|
+
scrape_params['removeBase64Images'] = remove_base64_images
|
|
1233
|
+
if block_ads is not None:
|
|
1234
|
+
scrape_params['blockAds'] = block_ads
|
|
1235
|
+
if proxy is not None:
|
|
1236
|
+
scrape_params['proxy'] = proxy
|
|
1237
|
+
if extract is not None:
|
|
1238
|
+
if hasattr(extract.schema, 'schema'):
|
|
1239
|
+
extract.schema = extract.schema.schema()
|
|
1240
|
+
scrape_params['extract'] = extract.dict(exclude_none=True)
|
|
1241
|
+
if json_options is not None:
|
|
1242
|
+
if hasattr(json_options.schema, 'schema'):
|
|
1243
|
+
json_options.schema = json_options.schema.schema()
|
|
1244
|
+
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
|
|
1245
|
+
if actions is not None:
|
|
1246
|
+
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1247
|
+
if agent is not None:
|
|
1248
|
+
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1249
|
+
|
|
1250
|
+
# Add any additional kwargs
|
|
1251
|
+
scrape_params.update(kwargs)
|
|
1252
|
+
|
|
1253
|
+
# Create final params object
|
|
1254
|
+
final_params = ScrapeParams(**scrape_params)
|
|
1255
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
1256
|
+
params_dict['urls'] = urls
|
|
1257
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
1258
|
+
|
|
1259
|
+
# Make request
|
|
517
1260
|
headers = self._prepare_headers(idempotency_key)
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
json_data.update(params)
|
|
521
|
-
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
|
|
1261
|
+
response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
|
|
1262
|
+
|
|
522
1263
|
if response.status_code == 200:
|
|
523
1264
|
try:
|
|
524
1265
|
id = response.json().get('id')
|
|
525
1266
|
except:
|
|
526
1267
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
527
1268
|
return self._monitor_job_status(id, headers, poll_interval)
|
|
528
|
-
|
|
529
1269
|
else:
|
|
530
1270
|
self._handle_error(response, 'start batch scrape job')
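As a quick orientation for the new keyword-only signature introduced above, here is a minimal usage sketch. It assumes the usual `FirecrawlApp(api_key=...)` constructor and that the returned `BatchScrapeStatusResponse` exposes the `status`, `completed`, `total` and `data` fields listed in the docstring; treat it as illustrative rather than authoritative.

```python
from firecrawl import FirecrawlApp

# A minimal sketch; replace the placeholder key with your own.
app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# Keyword-only options replace the old params dict in the v2 signature shown above.
job = app.batch_scrape_urls(
    ["https://example.com", "https://example.org"],
    formats=["markdown", "html"],
    only_main_content=True,
    poll_interval=2,
)

print(job.status, f"{job.completed}/{job.total} pages scraped")
for doc in job.data or []:
    # markdown should be populated because it was requested in formats above
    print((getattr(doc, "markdown", "") or "")[:80])
```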
|
|
531
1271
|
|
|
532
|
-
|
|
533
|
-
|
|
1272
|
+
def async_batch_scrape_urls(
|
|
1273
|
+
self,
|
|
1274
|
+
urls: List[str],
|
|
1275
|
+
*,
|
|
1276
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
1277
|
+
headers: Optional[Dict[str, str]] = None,
|
|
1278
|
+
include_tags: Optional[List[str]] = None,
|
|
1279
|
+
exclude_tags: Optional[List[str]] = None,
|
|
1280
|
+
only_main_content: Optional[bool] = None,
|
|
1281
|
+
wait_for: Optional[int] = None,
|
|
1282
|
+
timeout: Optional[int] = None,
|
|
1283
|
+
location: Optional[LocationConfig] = None,
|
|
1284
|
+
mobile: Optional[bool] = None,
|
|
1285
|
+
skip_tls_verification: Optional[bool] = None,
|
|
1286
|
+
remove_base64_images: Optional[bool] = None,
|
|
1287
|
+
block_ads: Optional[bool] = None,
|
|
1288
|
+
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
1289
|
+
extract: Optional[ExtractConfig] = None,
|
|
1290
|
+
json_options: Optional[ExtractConfig] = None,
|
|
1291
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1292
|
+
agent: Optional[AgentOptions] = None,
|
|
1293
|
+
idempotency_key: Optional[str] = None,
|
|
1294
|
+
**kwargs
|
|
1295
|
+
) -> BatchScrapeResponse:
|
|
534
1296
|
"""
|
|
535
|
-
Initiate a
|
|
1297
|
+
Initiate a batch scrape job asynchronously.
|
|
536
1298
|
|
|
537
1299
|
Args:
|
|
538
|
-
urls (List[str]):
|
|
539
|
-
|
|
540
|
-
|
|
1300
|
+
urls (List[str]): URLs to scrape
|
|
1301
|
+
formats (Optional[List[Literal]]): Content formats to retrieve
|
|
1302
|
+
headers (Optional[Dict[str, str]]): Custom HTTP headers
|
|
1303
|
+
include_tags (Optional[List[str]]): HTML tags to include
|
|
1304
|
+
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
1305
|
+
only_main_content (Optional[bool]): Extract main content only
|
|
1306
|
+
wait_for (Optional[int]): Wait time in milliseconds
|
|
1307
|
+
timeout (Optional[int]): Request timeout in milliseconds
|
|
1308
|
+
location (Optional[LocationConfig]): Location configuration
|
|
1309
|
+
mobile (Optional[bool]): Use mobile user agent
|
|
1310
|
+
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
1311
|
+
remove_base64_images (Optional[bool]): Remove base64 encoded images
|
|
1312
|
+
block_ads (Optional[bool]): Block advertisements
|
|
1313
|
+
proxy (Optional[Literal]): Proxy type to use
|
|
1314
|
+
extract (Optional[ExtractConfig]): Content extraction config
|
|
1315
|
+
json_options (Optional[ExtractConfig]): JSON extraction config
|
|
1316
|
+
actions (Optional[List[Union]]): Actions to perform
|
|
1317
|
+
agent (Optional[AgentOptions]): Agent configuration
|
|
1318
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1319
|
+
**kwargs: Additional parameters to pass to the API
|
|
541
1320
|
|
|
542
1321
|
Returns:
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
1322
|
+
BatchScrapeResponse with:
|
|
1323
|
+
* success - Whether job started successfully
|
|
1324
|
+
* id - Unique identifier for the job
|
|
1325
|
+
* url - Status check URL
|
|
1326
|
+
* error - Error message if start failed
|
|
1327
|
+
|
|
1328
|
+
Raises:
|
|
1329
|
+
Exception: If job initiation fails
|
|
547
1330
|
"""
|
|
548
|
-
|
|
1331
|
+
scrape_params = {}
|
|
1332
|
+
|
|
1333
|
+
# Add individual parameters
|
|
1334
|
+
if formats is not None:
|
|
1335
|
+
scrape_params['formats'] = formats
|
|
1336
|
+
if headers is not None:
|
|
1337
|
+
scrape_params['headers'] = headers
|
|
1338
|
+
if include_tags is not None:
|
|
1339
|
+
scrape_params['includeTags'] = include_tags
|
|
1340
|
+
if exclude_tags is not None:
|
|
1341
|
+
scrape_params['excludeTags'] = exclude_tags
|
|
1342
|
+
if only_main_content is not None:
|
|
1343
|
+
scrape_params['onlyMainContent'] = only_main_content
|
|
1344
|
+
if wait_for is not None:
|
|
1345
|
+
scrape_params['waitFor'] = wait_for
|
|
1346
|
+
if timeout is not None:
|
|
1347
|
+
scrape_params['timeout'] = timeout
|
|
1348
|
+
if location is not None:
|
|
1349
|
+
scrape_params['location'] = location.dict(exclude_none=True)
|
|
1350
|
+
if mobile is not None:
|
|
1351
|
+
scrape_params['mobile'] = mobile
|
|
1352
|
+
if skip_tls_verification is not None:
|
|
1353
|
+
scrape_params['skipTlsVerification'] = skip_tls_verification
|
|
1354
|
+
if remove_base64_images is not None:
|
|
1355
|
+
scrape_params['removeBase64Images'] = remove_base64_images
|
|
1356
|
+
if block_ads is not None:
|
|
1357
|
+
scrape_params['blockAds'] = block_ads
|
|
1358
|
+
if proxy is not None:
|
|
1359
|
+
scrape_params['proxy'] = proxy
|
|
1360
|
+
if extract is not None:
|
|
1361
|
+
if hasattr(extract.schema, 'schema'):
|
|
1362
|
+
extract.schema = extract.schema.schema()
|
|
1363
|
+
scrape_params['extract'] = extract.dict(exclude_none=True)
|
|
1364
|
+
if json_options is not None:
|
|
1365
|
+
if hasattr(json_options.schema, 'schema'):
|
|
1366
|
+
json_options.schema = json_options.schema.schema()
|
|
1367
|
+
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
|
|
1368
|
+
if actions is not None:
|
|
1369
|
+
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1370
|
+
if agent is not None:
|
|
1371
|
+
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1372
|
+
|
|
1373
|
+
# Add any additional kwargs
|
|
1374
|
+
scrape_params.update(kwargs)
|
|
1375
|
+
|
|
1376
|
+
# Create final params object
|
|
1377
|
+
final_params = ScrapeParams(**scrape_params)
|
|
1378
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
1379
|
+
params_dict['urls'] = urls
|
|
1380
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
1381
|
+
|
|
1382
|
+
# Make request
|
|
549
1383
|
headers = self._prepare_headers(idempotency_key)
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
json_data.update(params)
|
|
553
|
-
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
|
|
1384
|
+
response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
|
|
1385
|
+
|
|
554
1386
|
if response.status_code == 200:
|
|
555
1387
|
try:
|
|
556
|
-
return response.json()
|
|
1388
|
+
return BatchScrapeResponse(**response.json())
|
|
557
1389
|
except:
|
|
558
1390
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
559
1391
|
else:
|
|
560
1392
|
self._handle_error(response, 'start batch scrape job')
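A short sketch of the fire-and-forget variant above, assuming the `BatchScrapeResponse` it returns carries the `success`, `id` and `url` fields described in the docstring; the idempotency key shown is a hypothetical value.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# Starts the job and returns immediately with an id instead of polling.
started = app.async_batch_scrape_urls(
    ["https://example.com/docs", "https://example.com/blog"],
    formats=["markdown"],
    idempotency_key="batch-2024-03-01",  # hypothetical key to guard against duplicate submits
)

if started.success:
    print("job id:", started.id)
    print("status endpoint:", started.url)
    # Feed the id to check_batch_scrape_status(...) later; see the example further down.
```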
|
|
561
1393
|
|
|
562
|
-
def batch_scrape_urls_and_watch(
|
|
1394
|
+
def batch_scrape_urls_and_watch(
|
|
1395
|
+
self,
|
|
1396
|
+
urls: List[str],
|
|
1397
|
+
*,
|
|
1398
|
+
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
|
|
1399
|
+
headers: Optional[Dict[str, str]] = None,
|
|
1400
|
+
include_tags: Optional[List[str]] = None,
|
|
1401
|
+
exclude_tags: Optional[List[str]] = None,
|
|
1402
|
+
only_main_content: Optional[bool] = None,
|
|
1403
|
+
wait_for: Optional[int] = None,
|
|
1404
|
+
timeout: Optional[int] = None,
|
|
1405
|
+
location: Optional[LocationConfig] = None,
|
|
1406
|
+
mobile: Optional[bool] = None,
|
|
1407
|
+
skip_tls_verification: Optional[bool] = None,
|
|
1408
|
+
remove_base64_images: Optional[bool] = None,
|
|
1409
|
+
block_ads: Optional[bool] = None,
|
|
1410
|
+
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
1411
|
+
extract: Optional[ExtractConfig] = None,
|
|
1412
|
+
json_options: Optional[ExtractConfig] = None,
|
|
1413
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1414
|
+
agent: Optional[AgentOptions] = None,
|
|
1415
|
+
idempotency_key: Optional[str] = None,
|
|
1416
|
+
**kwargs
|
|
1417
|
+
) -> 'CrawlWatcher':
|
|
563
1418
|
"""
|
|
564
1419
|
Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
|
|
565
1420
|
|
|
566
1421
|
Args:
|
|
567
|
-
urls (List[str]):
|
|
568
|
-
|
|
569
|
-
|
|
1422
|
+
urls (List[str]): URLs to scrape
|
|
1423
|
+
formats (Optional[List[Literal]]): Content formats to retrieve
|
|
1424
|
+
headers (Optional[Dict[str, str]]): Custom HTTP headers
|
|
1425
|
+
include_tags (Optional[List[str]]): HTML tags to include
|
|
1426
|
+
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
1427
|
+
only_main_content (Optional[bool]): Extract main content only
|
|
1428
|
+
wait_for (Optional[int]): Wait time in milliseconds
|
|
1429
|
+
timeout (Optional[int]): Request timeout in milliseconds
|
|
1430
|
+
location (Optional[LocationConfig]): Location configuration
|
|
1431
|
+
mobile (Optional[bool]): Use mobile user agent
|
|
1432
|
+
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
1433
|
+
remove_base64_images (Optional[bool]): Remove base64 encoded images
|
|
1434
|
+
block_ads (Optional[bool]): Block advertisements
|
|
1435
|
+
proxy (Optional[Literal]): Proxy type to use
|
|
1436
|
+
extract (Optional[ExtractConfig]): Content extraction config
|
|
1437
|
+
json_options (Optional[ExtractConfig]): JSON extraction config
|
|
1438
|
+
actions (Optional[List[Union]]): Actions to perform
|
|
1439
|
+
agent (Optional[AgentOptions]): Agent configuration
|
|
1440
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1441
|
+
**kwargs: Additional parameters to pass to the API
|
|
570
1442
|
|
|
571
1443
|
Returns:
|
|
572
|
-
CrawlWatcher: An instance
|
|
1444
|
+
CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
|
|
1445
|
+
|
|
1446
|
+
Raises:
|
|
1447
|
+
Exception: If batch scrape job fails to start
|
|
573
1448
|
"""
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
1449
|
+
scrape_params = {}
|
|
1450
|
+
|
|
1451
|
+
# Add individual parameters
|
|
1452
|
+
if formats is not None:
|
|
1453
|
+
scrape_params['formats'] = formats
|
|
1454
|
+
if headers is not None:
|
|
1455
|
+
scrape_params['headers'] = headers
|
|
1456
|
+
if include_tags is not None:
|
|
1457
|
+
scrape_params['includeTags'] = include_tags
|
|
1458
|
+
if exclude_tags is not None:
|
|
1459
|
+
scrape_params['excludeTags'] = exclude_tags
|
|
1460
|
+
if only_main_content is not None:
|
|
1461
|
+
scrape_params['onlyMainContent'] = only_main_content
|
|
1462
|
+
if wait_for is not None:
|
|
1463
|
+
scrape_params['waitFor'] = wait_for
|
|
1464
|
+
if timeout is not None:
|
|
1465
|
+
scrape_params['timeout'] = timeout
|
|
1466
|
+
if location is not None:
|
|
1467
|
+
scrape_params['location'] = location.dict(exclude_none=True)
|
|
1468
|
+
if mobile is not None:
|
|
1469
|
+
scrape_params['mobile'] = mobile
|
|
1470
|
+
if skip_tls_verification is not None:
|
|
1471
|
+
scrape_params['skipTlsVerification'] = skip_tls_verification
|
|
1472
|
+
if remove_base64_images is not None:
|
|
1473
|
+
scrape_params['removeBase64Images'] = remove_base64_images
|
|
1474
|
+
if block_ads is not None:
|
|
1475
|
+
scrape_params['blockAds'] = block_ads
|
|
1476
|
+
if proxy is not None:
|
|
1477
|
+
scrape_params['proxy'] = proxy
|
|
1478
|
+
if extract is not None:
|
|
1479
|
+
if hasattr(extract.schema, 'schema'):
|
|
1480
|
+
extract.schema = extract.schema.schema()
|
|
1481
|
+
scrape_params['extract'] = extract.dict(exclude_none=True)
|
|
1482
|
+
if json_options is not None:
|
|
1483
|
+
if hasattr(json_options.schema, 'schema'):
|
|
1484
|
+
json_options.schema = json_options.schema.schema()
|
|
1485
|
+
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
|
|
1486
|
+
if actions is not None:
|
|
1487
|
+
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1488
|
+
if agent is not None:
|
|
1489
|
+
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1490
|
+
|
|
1491
|
+
# Add any additional kwargs
|
|
1492
|
+
scrape_params.update(kwargs)
|
|
1493
|
+
|
|
1494
|
+
# Create final params object
|
|
1495
|
+
final_params = ScrapeParams(**scrape_params)
|
|
1496
|
+
params_dict = final_params.dict(exclude_none=True)
|
|
1497
|
+
params_dict['urls'] = urls
|
|
1498
|
+
params_dict['origin'] = f"python-sdk@{version}"
|
|
1499
|
+
|
|
1500
|
+
# Make request
|
|
1501
|
+
headers = self._prepare_headers(idempotency_key)
|
|
1502
|
+
response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
|
|
1503
|
+
|
|
1504
|
+
if response.status_code == 200:
|
|
1505
|
+
try:
|
|
1506
|
+
crawl_response = BatchScrapeResponse(**response.json())
|
|
1507
|
+
if crawl_response.success and crawl_response.id:
|
|
1508
|
+
return CrawlWatcher(crawl_response.id, self)
|
|
1509
|
+
else:
|
|
1510
|
+
raise Exception("Batch scrape job failed to start")
|
|
1511
|
+
except:
|
|
1512
|
+
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
577
1513
|
else:
|
|
578
|
-
|
|
1514
|
+
self._handle_error(response, 'start batch scrape job')
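Because this variant returns a `CrawlWatcher` rather than results, the caller drives it with asyncio. A sketch, assuming the watcher's `connect()`/`add_event_listener()` interface shown later in this diff; the exact payload of the error event is an assumption.

```python
import asyncio
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# Start the batch job and get a WebSocket watcher instead of blocking on results.
watcher = app.batch_scrape_urls_and_watch(
    ["https://example.com", "https://example.org"],
    formats=["markdown"],
)

# Event types mirror the CrawlWatcher handlers defined further down: done/error/document.
watcher.add_event_listener("document", lambda evt: print("scraped a page for job", evt["id"]))
watcher.add_event_listener("done", lambda evt: print("finished with", len(evt["data"]), "documents"))
watcher.add_event_listener("error", lambda evt: print("failed:", evt.get("error")))

# connect() is a coroutine that listens until the job completes.
asyncio.run(watcher.connect())
```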
|
|
579
1515
|
|
|
580
|
-
def check_batch_scrape_status(self, id: str) ->
|
|
1516
|
+
def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
|
|
581
1517
|
"""
|
|
582
1518
|
Check the status of a batch scrape job using the Firecrawl API.
|
|
583
1519
|
|
|
@@ -585,7 +1521,7 @@ class FirecrawlApp:
|
|
|
585
1521
|
id (str): The ID of the batch scrape job.
|
|
586
1522
|
|
|
587
1523
|
Returns:
|
|
588
|
-
|
|
1524
|
+
BatchScrapeStatusResponse: The status of the batch scrape job.
|
|
589
1525
|
|
|
590
1526
|
Raises:
|
|
591
1527
|
Exception: If the status check request fails.
|
|
@@ -625,29 +1561,21 @@ class FirecrawlApp:
|
|
|
625
1561
|
break
|
|
626
1562
|
status_data['data'] = data
|
|
627
1563
|
|
|
628
|
-
|
|
1564
|
+
return BatchScrapeStatusResponse(**{
|
|
1565
|
+
'success': False if 'error' in status_data else True,
|
|
629
1566
|
'status': status_data.get('status'),
|
|
630
1567
|
'total': status_data.get('total'),
|
|
631
1568
|
'completed': status_data.get('completed'),
|
|
632
1569
|
'creditsUsed': status_data.get('creditsUsed'),
|
|
633
1570
|
'expiresAt': status_data.get('expiresAt'),
|
|
634
|
-
'data': status_data.get('data')
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
response['error'] = status_data['error']
|
|
639
|
-
|
|
640
|
-
if 'next' in status_data:
|
|
641
|
-
response['next'] = status_data['next']
|
|
642
|
-
|
|
643
|
-
return {
|
|
644
|
-
'success': False if 'error' in status_data else True,
|
|
645
|
-
**response
|
|
646
|
-
}
|
|
1571
|
+
'data': status_data.get('data'),
|
|
1572
|
+
'next': status_data.get('next'),
|
|
1573
|
+
'error': status_data.get('error')
|
|
1574
|
+
})
|
|
647
1575
|
else:
|
|
648
1576
|
self._handle_error(response, 'check batch scrape status')
|
|
649
1577
|
|
|
650
|
-
def check_batch_scrape_errors(self, id: str) ->
|
|
1578
|
+
def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
|
|
651
1579
|
"""
|
|
652
1580
|
Returns information about batch scrape errors.
|
|
653
1581
|
|
|
@@ -655,38 +1583,68 @@ class FirecrawlApp:
|
|
|
655
1583
|
id (str): The ID of the crawl job.
|
|
656
1584
|
|
|
657
1585
|
Returns:
|
|
658
|
-
|
|
1586
|
+
CrawlErrorsResponse: A response containing:
|
|
1587
|
+
* errors (List[Dict[str, str]]): List of errors with fields:
|
|
1588
|
+
* id (str): Error ID
|
|
1589
|
+
* timestamp (str): When the error occurred
|
|
1590
|
+
* url (str): URL that caused the error
|
|
1591
|
+
* error (str): Error message
|
|
1592
|
+
* robotsBlocked (List[str]): List of URLs blocked by robots.txt
|
|
1593
|
+
|
|
1594
|
+
Raises:
|
|
1595
|
+
Exception: If the error check request fails
|
|
659
1596
|
"""
|
|
660
1597
|
headers = self._prepare_headers()
|
|
661
1598
|
response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
|
|
662
1599
|
if response.status_code == 200:
|
|
663
1600
|
try:
|
|
664
|
-
return response.json()
|
|
1601
|
+
return CrawlErrorsResponse(**response.json())
|
|
665
1602
|
except:
|
|
666
1603
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
667
1604
|
else:
|
|
668
1605
|
self._handle_error(response, "check batch scrape errors")
|
|
669
1606
|
|
|
670
|
-
def extract(
|
|
1607
|
+
def extract(
|
|
1608
|
+
self,
|
|
1609
|
+
urls: Optional[List[str]] = None,
|
|
1610
|
+
*,
|
|
1611
|
+
prompt: Optional[str] = None,
|
|
1612
|
+
schema: Optional[Any] = None,
|
|
1613
|
+
system_prompt: Optional[str] = None,
|
|
1614
|
+
allow_external_links: Optional[bool] = False,
|
|
1615
|
+
enable_web_search: Optional[bool] = False,
|
|
1616
|
+
show_sources: Optional[bool] = False,
|
|
1617
|
+
agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
|
|
671
1618
|
"""
|
|
672
|
-
|
|
1619
|
+
Extract structured information from URLs.
|
|
673
1620
|
|
|
674
1621
|
Args:
|
|
675
|
-
urls (Optional[List[str]]):
|
|
676
|
-
|
|
1622
|
+
urls (Optional[List[str]]): URLs to extract from
|
|
1623
|
+
prompt (Optional[str]): Custom extraction prompt
|
|
1624
|
+
schema (Optional[Any]): JSON schema/Pydantic model
|
|
1625
|
+
system_prompt (Optional[str]): System context
|
|
1626
|
+
allow_external_links (Optional[bool]): Follow external links
|
|
1627
|
+
enable_web_search (Optional[bool]): Enable web search
|
|
1628
|
+
show_sources (Optional[bool]): Include source URLs
|
|
1629
|
+
agent (Optional[Dict[str, Any]]): Agent configuration
|
|
677
1630
|
|
|
678
1631
|
Returns:
|
|
679
|
-
|
|
1632
|
+
ExtractResponse[Any] with:
|
|
1633
|
+
* success (bool): Whether request succeeded
|
|
1634
|
+
* data (Optional[Any]): Extracted data matching schema
|
|
1635
|
+
* error (Optional[str]): Error message if any
|
|
1636
|
+
|
|
1637
|
+
Raises:
|
|
1638
|
+
ValueError: If prompt/schema missing or extraction fails
|
|
680
1639
|
"""
|
|
681
1640
|
headers = self._prepare_headers()
|
|
682
1641
|
|
|
683
|
-
if not
|
|
1642
|
+
if not prompt and not schema:
|
|
684
1643
|
raise ValueError("Either prompt or schema is required")
|
|
685
1644
|
|
|
686
|
-
if not urls and not
|
|
1645
|
+
if not urls and not prompt:
|
|
687
1646
|
raise ValueError("Either urls or prompt is required")
|
|
688
1647
|
|
|
689
|
-
schema = params.get('schema')
|
|
690
1648
|
if schema:
|
|
691
1649
|
if hasattr(schema, 'model_json_schema'):
|
|
692
1650
|
# Convert Pydantic model to JSON schema
|
|
@@ -694,26 +1652,22 @@ class FirecrawlApp:
|
|
|
694
1652
|
# Otherwise assume it's already a JSON schema dict
|
|
695
1653
|
|
|
696
1654
|
request_data = {
|
|
697
|
-
'urls': urls,
|
|
698
|
-
'allowExternalLinks':
|
|
699
|
-
'enableWebSearch':
|
|
700
|
-
'showSources':
|
|
1655
|
+
'urls': urls or [],
|
|
1656
|
+
'allowExternalLinks': allow_external_links,
|
|
1657
|
+
'enableWebSearch': enable_web_search,
|
|
1658
|
+
'showSources': show_sources,
|
|
701
1659
|
'schema': schema,
|
|
702
|
-
'origin': '
|
|
1660
|
+
'origin': f'python-sdk@{get_version()}'
|
|
703
1661
|
}
|
|
704
1662
|
|
|
705
|
-
if not request_data['urls']:
|
|
706
|
-
request_data['urls'] = []
|
|
707
1663
|
# Only add prompt and systemPrompt if they exist
|
|
708
|
-
if
|
|
709
|
-
request_data['prompt'] =
|
|
710
|
-
if
|
|
711
|
-
request_data['systemPrompt'] =
|
|
712
|
-
elif params.get('systemPrompt'): # Check legacy field name
|
|
713
|
-
request_data['systemPrompt'] = params['systemPrompt']
|
|
1664
|
+
if prompt:
|
|
1665
|
+
request_data['prompt'] = prompt
|
|
1666
|
+
if system_prompt:
|
|
1667
|
+
request_data['systemPrompt'] = system_prompt
|
|
714
1668
|
|
|
715
|
-
if
|
|
716
|
-
request_data['agent'] =
|
|
1669
|
+
if agent:
|
|
1670
|
+
request_data['agent'] = agent
|
|
717
1671
|
|
|
718
1672
|
try:
|
|
719
1673
|
# Send the initial extract request
|
|
@@ -744,10 +1698,7 @@ class FirecrawlApp:
|
|
|
744
1698
|
except:
|
|
745
1699
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
746
1700
|
if status_data['status'] == 'completed':
|
|
747
|
-
|
|
748
|
-
return status_data
|
|
749
|
-
else:
|
|
750
|
-
raise Exception(f'Failed to extract. Error: {status_data["error"]}')
|
|
1701
|
+
return ExtractResponse(**status_data)
|
|
751
1702
|
elif status_data['status'] in ['failed', 'cancelled']:
|
|
752
1703
|
raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
|
|
753
1704
|
else:
|
|
@@ -761,9 +1712,9 @@ class FirecrawlApp:
|
|
|
761
1712
|
except Exception as e:
|
|
762
1713
|
raise ValueError(str(e), 500)
|
|
763
1714
|
|
|
764
|
-
return
|
|
1715
|
+
return ExtractResponse(success=False, error="Internal server error.")
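Since `extract` accepts either a JSON schema dict or a Pydantic model, a short sketch with a hypothetical model is the easiest way to see the flow. The schema and URL below are illustrative only.

```python
from pydantic import BaseModel
from firecrawl import FirecrawlApp

class PricingPlan(BaseModel):
    # Hypothetical schema; extract() converts Pydantic models to JSON schema internally.
    name: str
    monthly_price: float

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

result = app.extract(
    ["https://example.com/pricing"],
    prompt="List each pricing plan with its monthly price.",
    schema=PricingPlan,
    enable_web_search=False,
)

if result.success:
    print(result.data)   # structured data shaped by the schema above
else:
    print("extract failed:", result.error)
```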
|
|
765
1716
|
|
|
766
|
-
def get_extract_status(self, job_id: str) ->
|
|
1717
|
+
def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
|
|
767
1718
|
"""
|
|
768
1719
|
Retrieve the status of an extract job.
|
|
769
1720
|
|
|
@@ -771,7 +1722,7 @@ class FirecrawlApp:
|
|
|
771
1722
|
job_id (str): The ID of the extract job.
|
|
772
1723
|
|
|
773
1724
|
Returns:
|
|
774
|
-
|
|
1725
|
+
ExtractResponse[Any]: The status of the extract job.
|
|
775
1726
|
|
|
776
1727
|
Raises:
|
|
777
1728
|
ValueError: If there is an error retrieving the status.
|
|
@@ -781,7 +1732,7 @@ class FirecrawlApp:
|
|
|
781
1732
|
response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
|
|
782
1733
|
if response.status_code == 200:
|
|
783
1734
|
try:
|
|
784
|
-
return response.json()
|
|
1735
|
+
return ExtractResponse(**response.json())
|
|
785
1736
|
except:
|
|
786
1737
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
787
1738
|
else:
|
|
@@ -789,43 +1740,71 @@ class FirecrawlApp:
|
|
|
789
1740
|
except Exception as e:
|
|
790
1741
|
raise ValueError(str(e), 500)
|
|
791
1742
|
|
|
792
|
-
def async_extract(
|
|
1743
|
+
def async_extract(
|
|
1744
|
+
self,
|
|
1745
|
+
urls: List[str],
|
|
1746
|
+
*,
|
|
1747
|
+
prompt: Optional[str] = None,
|
|
1748
|
+
schema: Optional[Any] = None,
|
|
1749
|
+
system_prompt: Optional[str] = None,
|
|
1750
|
+
allow_external_links: Optional[bool] = False,
|
|
1751
|
+
enable_web_search: Optional[bool] = False,
|
|
1752
|
+
show_sources: Optional[bool] = False,
|
|
1753
|
+
agent: Optional[Dict[str, Any]] = None,
|
|
1754
|
+
idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
|
|
793
1755
|
"""
|
|
794
1756
|
Initiate an asynchronous extract job.
|
|
795
1757
|
|
|
796
1758
|
Args:
|
|
797
|
-
urls (List[str]):
|
|
798
|
-
|
|
799
|
-
|
|
1759
|
+
urls (List[str]): URLs to extract information from
|
|
1760
|
+
prompt (Optional[str]): Custom extraction prompt
|
|
1761
|
+
schema (Optional[Any]): JSON schema/Pydantic model
|
|
1762
|
+
system_prompt (Optional[str]): System context
|
|
1763
|
+
allow_external_links (Optional[bool]): Follow external links
|
|
1764
|
+
enable_web_search (Optional[bool]): Enable web search
|
|
1765
|
+
show_sources (Optional[bool]): Include source URLs
|
|
1766
|
+
agent (Optional[Dict[str, Any]]): Agent configuration
|
|
1767
|
+
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
800
1768
|
|
|
801
1769
|
Returns:
|
|
802
|
-
|
|
1770
|
+
ExtractResponse[Any] with:
|
|
1771
|
+
* success (bool): Whether request succeeded
|
|
1772
|
+
* data (Optional[Any]): Extracted data matching schema
|
|
1773
|
+
* error (Optional[str]): Error message if any
|
|
803
1774
|
|
|
804
1775
|
Raises:
|
|
805
|
-
ValueError: If
|
|
1776
|
+
ValueError: If job initiation fails
|
|
806
1777
|
"""
|
|
807
1778
|
headers = self._prepare_headers(idempotency_key)
|
|
808
1779
|
|
|
809
|
-
schema =
|
|
1780
|
+
schema = schema
|
|
810
1781
|
if schema:
|
|
811
1782
|
if hasattr(schema, 'model_json_schema'):
|
|
812
1783
|
# Convert Pydantic model to JSON schema
|
|
813
1784
|
schema = schema.model_json_schema()
|
|
814
1785
|
# Otherwise assume it's already a JSON schema dict
|
|
815
1786
|
|
|
816
|
-
jsonData = {'urls': urls, **(params or {})}
|
|
817
1787
|
request_data = {
|
|
818
|
-
|
|
819
|
-
'allowExternalLinks':
|
|
1788
|
+
'urls': urls,
|
|
1789
|
+
'allowExternalLinks': allow_external_links,
|
|
1790
|
+
'enableWebSearch': enable_web_search,
|
|
1791
|
+
'showSources': show_sources,
|
|
820
1792
|
'schema': schema,
|
|
821
|
-
'origin': '
|
|
1793
|
+
'origin': f'python-sdk@{version}'
|
|
822
1794
|
}
|
|
823
1795
|
|
|
1796
|
+
if prompt:
|
|
1797
|
+
request_data['prompt'] = prompt
|
|
1798
|
+
if system_prompt:
|
|
1799
|
+
request_data['systemPrompt'] = system_prompt
|
|
1800
|
+
if agent:
|
|
1801
|
+
request_data['agent'] = agent
|
|
1802
|
+
|
|
824
1803
|
try:
|
|
825
1804
|
response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
|
|
826
1805
|
if response.status_code == 200:
|
|
827
1806
|
try:
|
|
828
|
-
return response.json()
|
|
1807
|
+
return ExtractResponse(**response.json())
|
|
829
1808
|
except:
|
|
830
1809
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
831
1810
|
else:
|
|
@@ -833,34 +1812,44 @@ class FirecrawlApp:
|
|
|
833
1812
|
except Exception as e:
|
|
834
1813
|
raise ValueError(str(e), 500)
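For the asynchronous path, `async_extract` starts the job and `get_extract_status` polls it. A sketch, assuming the start response exposes the job id and that the status model carries a `status` field; both are read defensively below.

```python
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

started = app.async_extract(
    ["https://example.com/team"],
    prompt="Extract every team member's name and role.",
)

job_id = getattr(started, "id", None)  # assumption: the start response exposes the job id
while job_id:
    status = app.get_extract_status(job_id)
    if getattr(status, "status", None) in ("completed", "failed", "cancelled"):
        print(status.status, status.data if status.success else status.error)
        break
    time.sleep(2)  # mirrors the SDK's own two-second polling cadence
```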
|
|
835
1814
|
|
|
836
|
-
def generate_llms_text(
|
|
1815
|
+
def generate_llms_text(
|
|
1816
|
+
self,
|
|
1817
|
+
url: str,
|
|
1818
|
+
*,
|
|
1819
|
+
max_urls: Optional[int] = None,
|
|
1820
|
+
show_full_text: Optional[bool] = None,
|
|
1821
|
+
experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
|
|
837
1822
|
"""
|
|
838
1823
|
Generate LLMs.txt for a given URL and poll until completion.
|
|
839
1824
|
|
|
840
1825
|
Args:
|
|
841
|
-
url (str):
|
|
842
|
-
|
|
1826
|
+
url (str): Target URL to generate LLMs.txt from
|
|
1827
|
+
max_urls (Optional[int]): Maximum URLs to process (default: 10)
|
|
1828
|
+
show_full_text (Optional[bool]): Include full text in output (default: False)
|
|
1829
|
+
experimental_stream (Optional[bool]): Enable experimental streaming
|
|
843
1830
|
|
|
844
1831
|
Returns:
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
- 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires.
|
|
1832
|
+
GenerateLLMsTextStatusResponse with:
|
|
1833
|
+
* Generated LLMs.txt content
|
|
1834
|
+
* Full version if requested
|
|
1835
|
+
* Generation status
|
|
1836
|
+
* Success/error information
|
|
851
1837
|
|
|
852
1838
|
Raises:
|
|
853
|
-
Exception: If
|
|
1839
|
+
Exception: If generation fails
|
|
854
1840
|
"""
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
else:
|
|
861
|
-
generation_params = params
|
|
1841
|
+
params = GenerateLLMsTextParams(
|
|
1842
|
+
maxUrls=max_urls,
|
|
1843
|
+
showFullText=show_full_text,
|
|
1844
|
+
__experimental_stream=experimental_stream
|
|
1845
|
+
)
|
|
862
1846
|
|
|
863
|
-
response = self.async_generate_llms_text(
|
|
1847
|
+
response = self.async_generate_llms_text(
|
|
1848
|
+
url,
|
|
1849
|
+
max_urls=max_urls,
|
|
1850
|
+
show_full_text=show_full_text,
|
|
1851
|
+
experimental_stream=experimental_stream
|
|
1852
|
+
)
|
|
864
1853
|
if not response.get('success') or 'id' not in response:
|
|
865
1854
|
return response
|
|
866
1855
|
|
|
@@ -879,32 +1868,40 @@ class FirecrawlApp:
|
|
|
879
1868
|
|
|
880
1869
|
return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
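A minimal sketch of the LLMs.txt helper above. The success path is typed as `GenerateLLMsTextStatusResponse`, while the failure paths in this diff return plain dicts, so the result is read defensively; field access is an assumption based on the docstrings.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

result = app.generate_llms_text(
    "https://example.com",
    max_urls=5,           # cap the pages processed (documented default is 10)
    show_full_text=True,  # also request the llmsfulltxt variant
)

if isinstance(result, dict) and not result.get("success", True):
    print("generation failed:", result.get("error"))
else:
    print(getattr(result, "data", result))
```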
|
|
881
1870
|
|
|
882
|
-
def async_generate_llms_text(
|
|
1871
|
+
def async_generate_llms_text(
|
|
1872
|
+
self,
|
|
1873
|
+
url: str,
|
|
1874
|
+
*,
|
|
1875
|
+
max_urls: Optional[int] = None,
|
|
1876
|
+
show_full_text: Optional[bool] = None,
|
|
1877
|
+
experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
|
|
883
1878
|
"""
|
|
884
1879
|
Initiate an asynchronous LLMs.txt generation operation.
|
|
885
1880
|
|
|
886
1881
|
Args:
|
|
887
|
-
url (str): The URL to generate LLMs.txt from.
|
|
888
|
-
|
|
1882
|
+
url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
|
|
1883
|
+
max_urls (Optional[int]): Maximum URLs to process (default: 10)
|
|
1884
|
+
show_full_text (Optional[bool]): Include full text in output (default: False)
|
|
1885
|
+
experimental_stream (Optional[bool]): Enable experimental streaming
|
|
889
1886
|
|
|
890
1887
|
Returns:
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
1888
|
+
GenerateLLMsTextResponse: A response containing:
|
|
1889
|
+
* success (bool): Whether the generation initiation was successful
|
|
1890
|
+
* id (str): The unique identifier for the generation job
|
|
1891
|
+
* error (str, optional): Error message if initiation failed
|
|
894
1892
|
|
|
895
1893
|
Raises:
|
|
896
1894
|
Exception: If the generation job initiation fails.
|
|
897
1895
|
"""
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
else:
|
|
904
|
-
generation_params = params
|
|
1896
|
+
params = GenerateLLMsTextParams(
|
|
1897
|
+
maxUrls=max_urls,
|
|
1898
|
+
showFullText=show_full_text,
|
|
1899
|
+
__experimental_stream=experimental_stream
|
|
1900
|
+
)
|
|
905
1901
|
|
|
906
1902
|
headers = self._prepare_headers()
|
|
907
|
-
json_data = {'url': url, **
|
|
1903
|
+
json_data = {'url': url, **params.dict(exclude_none=True)}
|
|
1904
|
+
json_data['origin'] = f"python-sdk@{version}"
|
|
908
1905
|
|
|
909
1906
|
try:
|
|
910
1907
|
response = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers)
|
|
@@ -920,15 +1917,22 @@ class FirecrawlApp:
|
|
|
920
1917
|
|
|
921
1918
|
return {'success': False, 'error': 'Internal server error'}
|
|
922
1919
|
|
|
923
|
-
def check_generate_llms_text_status(self, id: str) ->
|
|
1920
|
+
def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
|
|
924
1921
|
"""
|
|
925
1922
|
Check the status of a LLMs.txt generation operation.
|
|
926
1923
|
|
|
927
1924
|
Args:
|
|
928
|
-
id (str): The
|
|
1925
|
+
id (str): The unique identifier of the LLMs.txt generation job to check status for.
|
|
929
1926
|
|
|
930
1927
|
Returns:
|
|
931
|
-
|
|
1928
|
+
GenerateLLMsTextStatusResponse: A response containing:
|
|
1929
|
+
* success (bool): Whether the generation was successful
|
|
1930
|
+
* status (str): Status of generation ("processing", "completed", "failed")
|
|
1931
|
+
* data (Dict[str, str], optional): Generated text with fields:
|
|
1932
|
+
* llmstxt (str): Generated LLMs.txt content
|
|
1933
|
+
* llmsfulltxt (str, optional): Full version if requested
|
|
1934
|
+
* error (str, optional): Error message if generation failed
|
|
1935
|
+
* expiresAt (str): When the generated data expires
|
|
932
1936
|
|
|
933
1937
|
Raises:
|
|
934
1938
|
Exception: If the status check fails.
|
|
@@ -950,7 +1954,9 @@ class FirecrawlApp:
|
|
|
950
1954
|
|
|
951
1955
|
return {'success': False, 'error': 'Internal server error'}
|
|
952
1956
|
|
|
953
|
-
def _prepare_headers(
|
|
1957
|
+
def _prepare_headers(
|
|
1958
|
+
self,
|
|
1959
|
+
idempotency_key: Optional[str] = None) -> Dict[str, str]:
|
|
954
1960
|
"""
|
|
955
1961
|
Prepare the headers for API requests.
|
|
956
1962
|
|
|
@@ -972,11 +1978,13 @@ class FirecrawlApp:
|
|
|
972
1978
|
'Authorization': f'Bearer {self.api_key}',
|
|
973
1979
|
}
|
|
974
1980
|
|
|
975
|
-
def _post_request(
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
1981
|
+
def _post_request(
|
|
1982
|
+
self,
|
|
1983
|
+
url: str,
|
|
1984
|
+
data: Dict[str, Any],
|
|
1985
|
+
headers: Dict[str, str],
|
|
1986
|
+
retries: int = 3,
|
|
1987
|
+
backoff_factor: float = 0.5) -> requests.Response:
|
|
980
1988
|
"""
|
|
981
1989
|
Make a POST request with retries.
|
|
982
1990
|
|
|
@@ -1001,10 +2009,12 @@ class FirecrawlApp:
|
|
|
1001
2009
|
return response
|
|
1002
2010
|
return response
|
|
1003
2011
|
|
|
1004
|
-
def _get_request(
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
2012
|
+
def _get_request(
|
|
2013
|
+
self,
|
|
2014
|
+
url: str,
|
|
2015
|
+
headers: Dict[str, str],
|
|
2016
|
+
retries: int = 3,
|
|
2017
|
+
backoff_factor: float = 0.5) -> requests.Response:
|
|
1008
2018
|
"""
|
|
1009
2019
|
Make a GET request with retries.
|
|
1010
2020
|
|
|
@@ -1028,10 +2038,12 @@ class FirecrawlApp:
|
|
|
1028
2038
|
return response
|
|
1029
2039
|
return response
|
|
1030
2040
|
|
|
1031
|
-
def _delete_request(
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
2041
|
+
def _delete_request(
|
|
2042
|
+
self,
|
|
2043
|
+
url: str,
|
|
2044
|
+
headers: Dict[str, str],
|
|
2045
|
+
retries: int = 3,
|
|
2046
|
+
backoff_factor: float = 0.5) -> requests.Response:
|
|
1035
2047
|
"""
|
|
1036
2048
|
Make a DELETE request with retries.
|
|
1037
2049
|
|
|
@@ -1055,16 +2067,21 @@ class FirecrawlApp:
|
|
|
1055
2067
|
return response
|
|
1056
2068
|
return response
|
|
1057
2069
|
|
|
1058
|
-
def _monitor_job_status(
|
|
2070
|
+
def _monitor_job_status(
|
|
2071
|
+
self,
|
|
2072
|
+
id: str,
|
|
2073
|
+
headers: Dict[str, str],
|
|
2074
|
+
poll_interval: int) -> CrawlStatusResponse:
|
|
1059
2075
|
"""
|
|
1060
2076
|
Monitor the status of a crawl job until completion.
|
|
1061
2077
|
|
|
1062
2078
|
Args:
|
|
1063
2079
|
id (str): The ID of the crawl job.
|
|
1064
2080
|
headers (Dict[str, str]): The headers to include in the status check requests.
|
|
1065
|
-
poll_interval (int):
|
|
2081
|
+
poll_interval (int): Seconds between status checks.
|
|
2082
|
+
|
|
1066
2083
|
Returns:
|
|
1067
|
-
|
|
2084
|
+
CrawlStatusResponse: The crawl results if the job is completed successfully.
|
|
1068
2085
|
|
|
1069
2086
|
Raises:
|
|
1070
2087
|
Exception: If the job fails or an error occurs during status checks.
|
|
@@ -1091,7 +2108,7 @@ class FirecrawlApp:
|
|
|
1091
2108
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
1092
2109
|
data.extend(status_data.get('data', []))
|
|
1093
2110
|
status_data['data'] = data
|
|
1094
|
-
return status_data
|
|
2111
|
+
return CrawlStatusResponse(**status_data)
|
|
1095
2112
|
else:
|
|
1096
2113
|
raise Exception('Crawl job completed but no data was returned')
|
|
1097
2114
|
elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
|
|
@@ -1102,7 +2119,10 @@ class FirecrawlApp:
|
|
|
1102
2119
|
else:
|
|
1103
2120
|
self._handle_error(status_response, 'check crawl status')
|
|
1104
2121
|
|
|
1105
|
-
def _handle_error(
|
|
2122
|
+
def _handle_error(
|
|
2123
|
+
self,
|
|
2124
|
+
response: requests.Response,
|
|
2125
|
+
action: str) -> None:
|
|
1106
2126
|
"""
|
|
1107
2127
|
Handle errors from API responses.
|
|
1108
2128
|
|
|
@@ -1119,49 +2139,100 @@ class FirecrawlApp:
|
|
|
1119
2139
|
except:
|
|
1120
2140
|
raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
|
|
1121
2141
|
|
|
1122
|
-
|
|
1123
|
-
if response.status_code == 402:
|
|
1124
|
-
message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
|
|
1125
|
-
elif response.status_code == 403:
|
|
1126
|
-
message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
|
|
1127
|
-
elif response.status_code == 408:
|
|
1128
|
-
message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
|
|
1129
|
-
elif response.status_code == 409:
|
|
1130
|
-
message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
|
|
1131
|
-
elif response.status_code == 500:
|
|
1132
|
-
message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
|
|
1133
|
-
else:
|
|
1134
|
-
message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
|
|
2142
|
+
message = self._get_error_message(response.status_code, action, error_message, error_details)
|
|
1135
2143
|
|
|
1136
2144
|
# Raise an HTTPError with the custom message and attach the response
|
|
1137
2145
|
raise requests.exceptions.HTTPError(message, response=response)
|
|
1138
2146
|
|
|
1139
|
-
def
|
|
1140
|
-
|
|
1141
|
-
|
|
2147
|
+
def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
|
|
2148
|
+
"""
|
|
2149
|
+
Generate a standardized error message based on HTTP status code.
|
|
2150
|
+
|
|
2151
|
+
Args:
|
|
2152
|
+
status_code (int): The HTTP status code from the response
|
|
2153
|
+
action (str): Description of the action that was being performed
|
|
2154
|
+
error_message (str): The error message from the API response
|
|
2155
|
+
error_details (str): Additional error details from the API response
|
|
2156
|
+
|
|
2157
|
+
Returns:
|
|
2158
|
+
str: A formatted error message
|
|
2159
|
+
"""
|
|
2160
|
+
if status_code == 402:
|
|
2161
|
+
return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
|
|
2162
|
+
elif status_code == 403:
|
|
2163
|
+
message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
|
|
2164
|
+
elif status_code == 408:
|
|
2165
|
+
return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
|
|
2166
|
+
elif status_code == 409:
|
|
2167
|
+
return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
|
|
2168
|
+
elif status_code == 500:
|
|
2169
|
+
return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
|
|
2170
|
+
else:
|
|
2171
|
+
return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}"
|
|
2172
|
+
|
|
2173
|
+
def deep_research(
|
|
2174
|
+
self,
|
|
2175
|
+
query: str,
|
|
2176
|
+
*,
|
|
2177
|
+
max_depth: Optional[int] = None,
|
|
2178
|
+
time_limit: Optional[int] = None,
|
|
2179
|
+
max_urls: Optional[int] = None,
|
|
2180
|
+
analysis_prompt: Optional[str] = None,
|
|
2181
|
+
system_prompt: Optional[str] = None,
|
|
2182
|
+
__experimental_stream_steps: Optional[bool] = None,
|
|
2183
|
+
on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
|
|
2184
|
+
on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
|
|
1142
2185
|
"""
|
|
1143
2186
|
Initiates a deep research operation on a given query and polls until completion.
|
|
1144
2187
|
|
|
1145
2188
|
Args:
|
|
1146
|
-
query (str):
|
|
1147
|
-
|
|
1148
|
-
|
|
2189
|
+
query (str): Research query or topic to investigate
|
|
2190
|
+
max_depth (Optional[int]): Maximum depth of research exploration
|
|
2191
|
+
time_limit (Optional[int]): Time limit in seconds for research
|
|
2192
|
+
max_urls (Optional[int]): Maximum number of URLs to process
|
|
2193
|
+
analysis_prompt (Optional[str]): Custom prompt for analysis
|
|
2194
|
+
system_prompt (Optional[str]): Custom system prompt
|
|
2195
|
+
__experimental_stream_steps (Optional[bool]): Enable experimental streaming
|
|
2196
|
+
on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
|
|
2197
|
+
on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
|
|
1149
2198
|
|
|
1150
2199
|
Returns:
|
|
1151
|
-
|
|
2200
|
+
DeepResearchStatusResponse containing:
|
|
2201
|
+
* success (bool): Whether research completed successfully
|
|
2202
|
+
* status (str): Current state (processing/completed/failed)
|
|
2203
|
+
* error (Optional[str]): Error message if failed
|
|
2204
|
+
* id (str): Unique identifier for the research job
|
|
2205
|
+
* data (Any): Research findings and analysis
|
|
2206
|
+
* sources (List[Dict]): List of discovered sources
|
|
2207
|
+
* activities (List[Dict]): Research progress log
|
|
2208
|
+
* summaries (List[str]): Generated research summaries
|
|
1152
2209
|
|
|
1153
2210
|
Raises:
|
|
1154
|
-
Exception: If
|
|
2211
|
+
Exception: If research fails
|
|
1155
2212
|
"""
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
if
|
|
1160
|
-
research_params =
|
|
1161
|
-
|
|
1162
|
-
research_params =
|
|
1163
|
-
|
|
1164
|
-
|
|
2213
|
+
research_params = {}
|
|
2214
|
+
if max_depth is not None:
|
|
2215
|
+
research_params['maxDepth'] = max_depth
|
|
2216
|
+
if time_limit is not None:
|
|
2217
|
+
research_params['timeLimit'] = time_limit
|
|
2218
|
+
if max_urls is not None:
|
|
2219
|
+
research_params['maxUrls'] = max_urls
|
|
2220
|
+
if analysis_prompt is not None:
|
|
2221
|
+
research_params['analysisPrompt'] = analysis_prompt
|
|
2222
|
+
if system_prompt is not None:
|
|
2223
|
+
research_params['systemPrompt'] = system_prompt
|
|
2224
|
+
if __experimental_stream_steps is not None:
|
|
2225
|
+
research_params['__experimental_streamSteps'] = __experimental_stream_steps
|
|
2226
|
+
research_params = DeepResearchParams(**research_params)
|
|
2227
|
+
|
|
2228
|
+
response = self.async_deep_research(
|
|
2229
|
+
query,
|
|
2230
|
+
max_depth=max_depth,
|
|
2231
|
+
time_limit=time_limit,
|
|
2232
|
+
max_urls=max_urls,
|
|
2233
|
+
analysis_prompt=analysis_prompt,
|
|
2234
|
+
system_prompt=system_prompt
|
|
2235
|
+
)
|
|
1165
2236
|
if not response.get('success') or 'id' not in response:
|
|
1166
2237
|
return response
|
|
1167
2238
|
|
|
@@ -1194,31 +2265,57 @@ class FirecrawlApp:
|
|
|
1194
2265
|
time.sleep(2) # Polling interval
|
|
1195
2266
|
|
|
1196
2267
|
return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
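A sketch of driving `deep_research` with a progress callback, assuming the activity payload fields listed in the docstring; query, limits and output handling are illustrative, and failure paths return plain dicts as shown above.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

def log_activity(activity):
    # Per the docstring, activity payloads carry type, status, message, timestamp and depth.
    print(f"[depth {activity.get('depth')}] {activity.get('message')}")

research = app.deep_research(
    "What are the main approaches to web-scale content extraction?",
    max_depth=3,
    time_limit=120,
    max_urls=10,
    on_activity=log_activity,
)

# Completed runs should carry the documented data/sources/summaries fields;
# the failure paths in this diff return plain dicts.
print(research)
```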
|
|
1197
|
-
|
|
2268
|
+
|
|
2269
|
+
def async_deep_research(
|
|
2270
|
+
self,
|
|
2271
|
+
query: str,
|
|
2272
|
+
*,
|
|
2273
|
+
max_depth: Optional[int] = None,
|
|
2274
|
+
time_limit: Optional[int] = None,
|
|
2275
|
+
max_urls: Optional[int] = None,
|
|
2276
|
+
analysis_prompt: Optional[str] = None,
|
|
2277
|
+
system_prompt: Optional[str] = None,
|
|
2278
|
+
__experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
|
|
1198
2279
|
"""
|
|
1199
2280
|
Initiates an asynchronous deep research operation.
|
|
1200
2281
|
|
|
1201
2282
|
Args:
|
|
1202
|
-
query (str):
|
|
1203
|
-
|
|
2283
|
+
query (str): Research query or topic to investigate
|
|
2284
|
+
max_depth (Optional[int]): Maximum depth of research exploration
|
|
2285
|
+
time_limit (Optional[int]): Time limit in seconds for research
|
|
2286
|
+
max_urls (Optional[int]): Maximum number of URLs to process
|
|
2287
|
+
analysis_prompt (Optional[str]): Custom prompt for analysis
|
|
2288
|
+
system_prompt (Optional[str]): Custom system prompt
|
|
2289
|
+
__experimental_stream_steps (Optional[bool]): Enable experimental streaming
|
|
1204
2290
|
|
|
1205
2291
|
Returns:
|
|
1206
|
-
Dict[str, Any]:
|
|
2292
|
+
Dict[str, Any]: A response containing:
|
|
2293
|
+
* success (bool): Whether the research initiation was successful
|
|
2294
|
+
* id (str): The unique identifier for the research job
|
|
2295
|
+
* error (str, optional): Error message if initiation failed
|
|
1207
2296
|
|
|
1208
2297
|
Raises:
|
|
1209
2298
|
Exception: If the research initiation fails.
|
|
1210
2299
|
"""
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
if
|
|
1215
|
-
research_params =
|
|
1216
|
-
|
|
1217
|
-
research_params =
|
|
2300
|
+
research_params = {}
|
|
2301
|
+
if max_depth is not None:
|
|
2302
|
+
research_params['maxDepth'] = max_depth
|
|
2303
|
+
if time_limit is not None:
|
|
2304
|
+
research_params['timeLimit'] = time_limit
|
|
2305
|
+
if max_urls is not None:
|
|
2306
|
+
research_params['maxUrls'] = max_urls
|
|
2307
|
+
if analysis_prompt is not None:
|
|
2308
|
+
research_params['analysisPrompt'] = analysis_prompt
|
|
2309
|
+
if system_prompt is not None:
|
|
2310
|
+
research_params['systemPrompt'] = system_prompt
|
|
2311
|
+
if __experimental_stream_steps is not None:
|
|
2312
|
+
research_params['__experimental_streamSteps'] = __experimental_stream_steps
|
|
2313
|
+
research_params = DeepResearchParams(**research_params)
|
|
1218
2314
|
|
|
1219
2315
|
headers = self._prepare_headers()
|
|
1220
2316
|
|
|
1221
2317
|
json_data = {'query': query, **research_params.dict(exclude_none=True)}
|
|
2318
|
+
json_data['origin'] = f"python-sdk@{version}"
|
|
1222
2319
|
|
|
1223
2320
|
# Handle json options schema if present
|
|
1224
2321
|
if 'jsonOptions' in json_data:
|
|
@@ -1240,7 +2337,7 @@ class FirecrawlApp:
|
|
|
1240
2337
|
|
|
1241
2338
|
return {'success': False, 'error': 'Internal server error'}
|
|
1242
2339
|
|
|
1243
|
-
def check_deep_research_status(self, id: str) ->
|
|
2340
|
+
def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
|
|
1244
2341
|
"""
|
|
1245
2342
|
Check the status of a deep research operation.
|
|
1246
2343
|
|
|
@@ -1248,7 +2345,19 @@ class FirecrawlApp:
|
|
|
1248
2345
|
id (str): The ID of the deep research operation.
|
|
1249
2346
|
|
|
1250
2347
|
Returns:
|
|
1251
|
-
|
|
2348
|
+
DeepResearchResponse containing:
|
|
2349
|
+
|
|
2350
|
+
Status:
|
|
2351
|
+
* success - Whether research completed successfully
|
|
2352
|
+
* status - Current state (processing/completed/failed)
|
|
2353
|
+
* error - Error message if failed
|
|
2354
|
+
|
|
2355
|
+
Results:
|
|
2356
|
+
* id - Unique identifier for the research job
|
|
2357
|
+
* data - Research findings and analysis
|
|
2358
|
+
* sources - List of discovered sources
|
|
2359
|
+
* activities - Research progress log
|
|
2360
|
+
* summaries - Generated research summaries
|
|
1252
2361
|
|
|
1253
2362
|
Raises:
|
|
1254
2363
|
Exception: If the status check fails.
|
|
@@ -1271,6 +2380,17 @@ class FirecrawlApp:
|
|
|
1271
2380
|
return {'success': False, 'error': 'Internal server error'}
|
|
1272
2381
|
|
|
1273
2382
|
class CrawlWatcher:
|
|
2383
|
+
"""
|
|
2384
|
+
A class to watch and handle crawl job events via WebSocket connection.
|
|
2385
|
+
|
|
2386
|
+
Attributes:
|
|
2387
|
+
id (str): The ID of the crawl job to watch
|
|
2388
|
+
app (FirecrawlApp): The FirecrawlApp instance
|
|
2389
|
+
data (List[Dict[str, Any]]): List of crawled documents/data
|
|
2390
|
+
status (str): Current status of the crawl job
|
|
2391
|
+
ws_url (str): WebSocket URL for the crawl job
|
|
2392
|
+
event_handlers (dict): Dictionary of event type to list of handler functions
|
|
2393
|
+
"""
|
|
1274
2394
|
def __init__(self, id: str, app: FirecrawlApp):
|
|
1275
2395
|
self.id = id
|
|
1276
2396
|
self.app = app
|
|
@@ -1283,25 +2403,57 @@ class CrawlWatcher:
|
|
|
1283
2403
|
'document': []
|
|
1284
2404
|
}
|
|
1285
2405
|
|
|
1286
|
-
async def connect(self):
|
|
1287
|
-
|
|
2406
|
+
async def connect(self) -> None:
|
|
2407
|
+
"""
|
|
2408
|
+
Establishes WebSocket connection and starts listening for messages.
|
|
2409
|
+
"""
|
|
2410
|
+
async with websockets.connect(
|
|
2411
|
+
self.ws_url,
|
|
2412
|
+
additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
|
|
2413
|
+
) as websocket:
|
|
1288
2414
|
await self._listen(websocket)
|
|
1289
2415
|
|
|
1290
|
-
async def _listen(self, websocket):
|
|
2416
|
+
async def _listen(self, websocket) -> None:
|
|
2417
|
+
"""
|
|
2418
|
+
Listens for incoming WebSocket messages and handles them.
|
|
2419
|
+
|
|
2420
|
+
Args:
|
|
2421
|
+
websocket: The WebSocket connection object
|
|
2422
|
+
"""
|
|
1291
2423
|
async for message in websocket:
|
|
1292
2424
|
msg = json.loads(message)
|
|
1293
2425
|
await self._handle_message(msg)
|
|
1294
2426
|
|
|
1295
|
-
def add_event_listener(self, event_type: str, handler):
|
|
2427
|
+
def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
|
|
2428
|
+
"""
|
|
2429
|
+
Adds an event handler function for a specific event type.
|
|
2430
|
+
|
|
2431
|
+
Args:
|
|
2432
|
+
event_type (str): Type of event to listen for ('done', 'error', or 'document')
|
|
2433
|
+
handler (Callable): Function to handle the event
|
|
2434
|
+
"""
|
|
1296
2435
|
if event_type in self.event_handlers:
|
|
1297
2436
|
self.event_handlers[event_type].append(handler)
|
|
1298
2437
|
|
|
1299
|
-
def dispatch_event(self, event_type: str, detail: Dict[str, Any]):
|
|
2438
|
+
def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
|
|
2439
|
+
"""
|
|
2440
|
+
Dispatches an event to all registered handlers for that event type.
|
|
2441
|
+
|
|
2442
|
+
Args:
|
|
2443
|
+
event_type (str): Type of event to dispatch
|
|
2444
|
+
detail (Dict[str, Any]): Event details/data to pass to handlers
|
|
2445
|
+
"""
|
|
1300
2446
|
if event_type in self.event_handlers:
|
|
1301
2447
|
for handler in self.event_handlers[event_type]:
|
|
1302
2448
|
handler(detail)
|
|
1303
2449
|
|
|
1304
|
-
async def _handle_message(self, msg: Dict[str, Any]):
|
|
2450
|
+
async def _handle_message(self, msg: Dict[str, Any]) -> None:
|
|
2451
|
+
"""
|
|
2452
|
+
Handles incoming WebSocket messages based on their type.
|
|
2453
|
+
|
|
2454
|
+
Args:
|
|
2455
|
+
msg (Dict[str, Any]): The message to handle
|
|
2456
|
+
"""
|
|
1305
2457
|
if msg['type'] == 'done':
|
|
1306
2458
|
self.status = 'completed'
|
|
1307
2459
|
self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
|
|
@@ -1316,3 +2468,1773 @@ class CrawlWatcher:
|
|
|
1316
2468
|
elif msg['type'] == 'document':
|
|
1317
2469
|
self.data.append(msg['data'])
|
|
1318
2470
|
self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
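To make the event flow above concrete, here is a sketch that wires named handlers onto a watcher built directly from the `CrawlWatcher(id, app)` constructor shown earlier; the import path and the placeholder job id are assumptions.

```python
import asyncio
from firecrawl import FirecrawlApp
from firecrawl.firecrawl import CrawlWatcher  # import path is an assumption

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")
crawl_id = "00000000-0000-0000-0000-000000000000"  # placeholder id from an async crawl/batch job

watcher = CrawlWatcher(crawl_id, app)

def on_document(detail):
    # 'document' events deliver {'data': <scraped document>, 'id': <job id>}
    print("received a document for job", detail["id"])

def on_done(detail):
    # 'done' events deliver {'status': 'completed', 'data': [...], 'id': <job id>}
    print("finished with", len(detail["data"]), "documents")

watcher.add_event_listener("document", on_document)
watcher.add_event_listener("done", on_done)

asyncio.run(watcher.connect())
```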
|
|
2471
|
+
|
|
2472
|
+
class AsyncFirecrawlApp(FirecrawlApp):
|
|
2473
|
+
"""
|
|
2474
|
+
Asynchronous version of FirecrawlApp that implements async methods using aiohttp.
|
|
2475
|
+
Provides non-blocking alternatives to all FirecrawlApp operations.
|
|
2476
|
+
"""
|
|
2477
|
+
|
|
2478
|
+
async def _async_request(
|
|
2479
|
+
self,
|
|
2480
|
+
method: str,
|
|
2481
|
+
url: str,
|
|
2482
|
+
headers: Dict[str, str],
|
|
2483
|
+
data: Optional[Dict[str, Any]] = None,
|
|
2484
|
+
retries: int = 3,
|
|
2485
|
+
backoff_factor: float = 0.5) -> Dict[str, Any]:
|
|
2486
|
+
"""
|
|
2487
|
+
Generic async request method with exponential backoff retry logic.
|
|
2488
|
+
|
|
2489
|
+
Args:
|
|
2490
|
+
method (str): The HTTP method to use (e.g., "GET" or "POST").
|
|
2491
|
+
url (str): The URL to send the request to.
|
|
2492
|
+
headers (Dict[str, str]): Headers to include in the request.
|
|
2493
|
+
data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests).
|
|
2494
|
+
retries (int): Maximum number of retry attempts (default: 3).
|
|
2495
|
+
backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
|
|
2496
|
+
Delay will be backoff_factor * (2 ** retry_count).
|
|
2497
|
+
|
|
2498
|
+
Returns:
|
|
2499
|
+
Dict[str, Any]: The parsed JSON response from the server.
|
|
2500
|
+
|
|
2501
|
+
Raises:
|
|
2502
|
+
aiohttp.ClientError: If the request fails after all retries.
|
|
2503
|
+
Exception: If max retries are exceeded or other errors occur.
|
|
2504
|
+
"""
|
|
2505
|
+
async with aiohttp.ClientSession() as session:
|
|
2506
|
+
for attempt in range(retries):
|
|
2507
|
+
try:
|
|
2508
|
+
async with session.request(
|
|
2509
|
+
method=method, url=url, headers=headers, json=data
|
|
2510
|
+
) as response:
|
|
2511
|
+
if response.status == 502:
|
|
2512
|
+
await asyncio.sleep(backoff_factor * (2 ** attempt))
|
|
2513
|
+
continue
|
|
2514
|
+
if response.status >= 300:
|
|
2515
|
+
await self._handle_error(response, f"make {method} request")
|
|
2516
|
+
return await response.json()
|
|
2517
|
+
except aiohttp.ClientError as e:
|
|
2518
|
+
if attempt == retries - 1:
|
|
2519
|
+
raise e
|
|
2520
|
+
await asyncio.sleep(backoff_factor * (2 ** attempt))
|
|
2521
|
+
raise Exception("Max retries exceeded")
|
|
2522
|
+
|
|
2523
|
+
async def _async_post_request(
|
|
2524
|
+
self, url: str, data: Dict[str, Any], headers: Dict[str, str],
|
|
2525
|
+
retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
|
|
2526
|
+
"""
|
|
2527
|
+
Make an async POST request with exponential backoff retry logic.
|
|
2528
|
+
|
|
2529
|
+
Args:
|
|
2530
|
+
url (str): The URL to send the POST request to.
|
|
2531
|
+
data (Dict[str, Any]): The JSON data to include in the request body.
|
|
2532
|
+
headers (Dict[str, str]): Headers to include in the request.
|
|
2533
|
+
retries (int): Maximum number of retry attempts (default: 3).
|
|
2534
|
+
backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
|
|
2535
|
+
Delay will be backoff_factor * (2 ** retry_count).
|
|
2536
|
+
|
|
2537
|
+
Returns:
|
|
2538
|
+
Dict[str, Any]: The parsed JSON response from the server.
|
|
2539
|
+
|
|
2540
|
+
Raises:
|
|
2541
|
+
aiohttp.ClientError: If the request fails after all retries.
|
|
2542
|
+
Exception: If max retries are exceeded or other errors occur.
|
|
2543
|
+
"""
|
|
2544
|
+
return await self._async_request("POST", url, headers, data, retries, backoff_factor)
|
|
2545
|
+
|
|
2546
|
+
async def _async_get_request(
|
|
2547
|
+
self, url: str, headers: Dict[str, str],
|
|
2548
|
+
retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]:
|
|
2549
|
+
"""
|
|
2550
|
+
Make an async GET request with exponential backoff retry logic.
|
|
2551
|
+
|
|
2552
|
+
Args:
|
|
2553
|
+
url (str): The URL to send the GET request to.
|
|
2554
|
+
headers (Dict[str, str]): Headers to include in the request.
|
|
2555
|
+
retries (int): Maximum number of retry attempts (default: 3).
|
|
2556
|
+
backoff_factor (float): Factor to calculate delay between retries (default: 0.5).
|
|
2557
|
+
Delay will be backoff_factor * (2 ** retry_count).
|
|
2558
|
+
|
|
2559
|
+
Returns:
|
|
2560
|
+
Dict[str, Any]: The parsed JSON response from the server.
|
|
2561
|
+
|
|
2562
|
+
Raises:
|
|
2563
|
+
aiohttp.ClientError: If the request fails after all retries.
|
|
2564
|
+
Exception: If max retries are exceeded or other errors occur.
|
|
2565
|
+
"""
|
|
2566
|
+
return await self._async_request("GET", url, headers, None, retries, backoff_factor)
|
|
2567
|
+
|
|
2568
|
+
    async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
        """
        Handle errors from async API responses with detailed error messages.

        Args:
            response (aiohttp.ClientResponse): The response object from the failed request
            action (str): Description of the action that was being attempted

        Raises:
            aiohttp.ClientError: With a detailed error message based on the response status:
                - 402: Payment Required
                - 408: Request Timeout
                - 409: Conflict
                - 500: Internal Server Error
                - Other: Unexpected error with status code
        """
        try:
            error_data = await response.json()
            error_message = error_data.get('error', 'No error message provided.')
            error_details = error_data.get('details', 'No additional error details provided.')
        except:
            raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')

        message = await self._get_async_error_message(response.status, action, error_message, error_details)

        raise aiohttp.ClientError(message)

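Because _handle_error re-raises API failures as aiohttp.ClientError with a status-specific message, a caller-side sketch for catching them might look like this (illustrative only; it reuses the private helpers defined above and assumes an already constructed client instance):

import aiohttp

async def safe_get(app, url: str):
    # app is assumed to be an instance exposing the async helpers above.
    try:
        return await app._async_get_request(url, app._prepare_headers())
    except aiohttp.ClientError as err:
        # Message was built by _get_async_error_message (402/408/409/500/other).
        print(f"Request failed: {err}")
        return None
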
    async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
        """
        Generate a standardized error message based on HTTP status code for async operations.

        Args:
            status_code (int): The HTTP status code from the response
            action (str): Description of the action that was being performed
            error_message (str): The error message from the API response
            error_details (str): Additional error details from the API response

        Returns:
            str: A formatted error message
        """
        return self._get_error_message(status_code, action, error_message, error_details)

    async def crawl_url_and_watch(
        self,
        url: str,
        params: Optional[CrawlParams] = None,
        idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
        """
        Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket.

        Args:
            url (str): Target URL to start crawling from
            params (Optional[CrawlParams]): See CrawlParams model for configuration:
                URL Discovery:
                * includePaths - Patterns of URLs to include
                * excludePaths - Patterns of URLs to exclude
                * maxDepth - Maximum crawl depth
                * maxDiscoveryDepth - Maximum depth for finding new URLs
                * limit - Maximum pages to crawl

                Link Following:
                * allowBackwardLinks - Follow parent directory links
                * allowExternalLinks - Follow external domain links
                * ignoreSitemap - Skip sitemap.xml processing

                Advanced:
                * scrapeOptions - Page scraping configuration
                * webhook - Notification webhook settings
                * deduplicateSimilarURLs - Remove similar URLs
                * ignoreQueryParameters - Ignore URL parameters
                * regexOnFullURL - Apply regex to full URLs
            idempotency_key (Optional[str]): Unique key to prevent duplicate requests

        Returns:
            AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket

        Raises:
            Exception: If crawl job fails to start
        """
        crawl_response = await self.async_crawl_url(url, params, idempotency_key)
        if crawl_response.get('success') and 'id' in crawl_response:
            return AsyncCrawlWatcher(crawl_response['id'], self)
        else:
            raise Exception("Crawl job failed to start")

    async def batch_scrape_urls_and_watch(
        self,
        urls: List[str],
        params: Optional[ScrapeParams] = None,
        idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher':
        """
        Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress.

        Args:
            urls (List[str]): List of URLs to scrape
            params (Optional[ScrapeParams]): See ScrapeParams model for configuration:

                Content Options:
                * formats - Content formats to retrieve
                * includeTags - HTML tags to include
                * excludeTags - HTML tags to exclude
                * onlyMainContent - Extract main content only

                Request Options:
                * headers - Custom HTTP headers
                * timeout - Request timeout (ms)
                * mobile - Use mobile user agent
                * proxy - Proxy type

                Extraction Options:
                * extract - Content extraction config
                * jsonOptions - JSON extraction config
                * actions - Actions to perform
            idempotency_key (Optional[str]): Unique key to prevent duplicate requests

        Returns:
            AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket

        Raises:
            Exception: If batch scrape job fails to start
        """
        batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
        if batch_response.get('success') and 'id' in batch_response:
            return AsyncCrawlWatcher(batch_response['id'], self)
        else:
            raise Exception("Batch scrape job failed to start")

    async def scrape_url(
        self,
        url: str,
        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
        only_main_content: Optional[bool] = None,
        wait_for: Optional[int] = None,
        timeout: Optional[int] = None,
        location: Optional[LocationConfig] = None,
        mobile: Optional[bool] = None,
        skip_tls_verification: Optional[bool] = None,
        remove_base64_images: Optional[bool] = None,
        block_ads: Optional[bool] = None,
        proxy: Optional[Literal["basic", "stealth"]] = None,
        extract: Optional[ExtractConfig] = None,
        json_options: Optional[ExtractConfig] = None,
        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]:
        """
        Scrape and extract content from a URL asynchronously.

        Args:
            url (str): Target URL to scrape
            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
            include_tags (Optional[List[str]]): HTML tags to include
            exclude_tags (Optional[List[str]]): HTML tags to exclude
            only_main_content (Optional[bool]): Extract main content only
            wait_for (Optional[int]): Wait for a specific element to appear
            timeout (Optional[int]): Request timeout (ms)
            location (Optional[LocationConfig]): Location configuration
            mobile (Optional[bool]): Use mobile user agent
            skip_tls_verification (Optional[bool]): Skip TLS verification
            remove_base64_images (Optional[bool]): Remove base64 images
            block_ads (Optional[bool]): Block ads
            proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
            extract (Optional[ExtractConfig]): Content extraction settings
            json_options (Optional[ExtractConfig]): JSON extraction settings
            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform

        Returns:
            ScrapeResponse with:
            * Requested content formats
            * Page metadata
            * Extraction results
            * Success/error status

        Raises:
            Exception: If scraping fails
        """
        headers = self._prepare_headers()

        # Build scrape parameters
        scrape_params = {
            'url': url,
            'origin': f"python-sdk@{version}"
        }

        # Add optional parameters if provided and not None
        if formats:
            scrape_params['formats'] = formats
        if include_tags:
            scrape_params['includeTags'] = include_tags
        if exclude_tags:
            scrape_params['excludeTags'] = exclude_tags
        if only_main_content is not None:
            scrape_params['onlyMainContent'] = only_main_content
        if wait_for:
            scrape_params['waitFor'] = wait_for
        if timeout:
            scrape_params['timeout'] = timeout
        if location:
            scrape_params['location'] = location.dict(exclude_none=True)
        if mobile is not None:
            scrape_params['mobile'] = mobile
        if skip_tls_verification is not None:
            scrape_params['skipTlsVerification'] = skip_tls_verification
        if remove_base64_images is not None:
            scrape_params['removeBase64Images'] = remove_base64_images
        if block_ads is not None:
            scrape_params['blockAds'] = block_ads
        if proxy:
            scrape_params['proxy'] = proxy
        if extract:
            extract_dict = extract.dict(exclude_none=True)
            if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
                extract_dict['schema'] = extract.schema.schema()  # Ensure pydantic model schema is converted
            scrape_params['extract'] = extract_dict
        if json_options:
            json_options_dict = json_options.dict(exclude_none=True)
            if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
                json_options_dict['schema'] = json_options.schema.schema()  # Ensure pydantic model schema is converted
            scrape_params['jsonOptions'] = json_options_dict
        if actions:
            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]

        # Make async request
        endpoint = f'/v1/scrape'
        response = await self._async_post_request(
            f'{self.api_url}{endpoint}',
            scrape_params,
            headers
        )

        if response.get('success') and 'data' in response:
            return ScrapeResponse(**response['data'])
        elif "error" in response:
            raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
        else:
            # Use the response content directly if possible, otherwise a generic message
            error_content = response.get('error', str(response))
            raise Exception(f'Failed to scrape URL. Error: {error_content}')

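A minimal usage sketch for the async scrape_url method above. It assumes the async client is exported as AsyncFirecrawlApp and that the API key placeholder is replaced with a real key:

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    doc = await app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,  # milliseconds
    )
    # doc is a ScrapeResponse; markdown is populated when requested in formats
    print(doc.markdown[:200] if doc.markdown else doc)

asyncio.run(main())
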
    async def batch_scrape_urls(
        self,
        urls: List[str],
        *,
        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
        headers: Optional[Dict[str, str]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
        only_main_content: Optional[bool] = None,
        wait_for: Optional[int] = None,
        timeout: Optional[int] = None,
        location: Optional[LocationConfig] = None,
        mobile: Optional[bool] = None,
        skip_tls_verification: Optional[bool] = None,
        remove_base64_images: Optional[bool] = None,
        block_ads: Optional[bool] = None,
        proxy: Optional[Literal["basic", "stealth"]] = None,
        extract: Optional[ExtractConfig] = None,
        json_options: Optional[ExtractConfig] = None,
        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
        agent: Optional[AgentOptions] = None,
        poll_interval: Optional[int] = 2,
        idempotency_key: Optional[str] = None,
        **kwargs
    ) -> BatchScrapeStatusResponse:
        """
        Asynchronously scrape multiple URLs and monitor until completion.

        Args:
            urls (List[str]): URLs to scrape
            formats (Optional[List[Literal]]): Content formats to retrieve
            headers (Optional[Dict[str, str]]): Custom HTTP headers
            include_tags (Optional[List[str]]): HTML tags to include
            exclude_tags (Optional[List[str]]): HTML tags to exclude
            only_main_content (Optional[bool]): Extract main content only
            wait_for (Optional[int]): Wait time in milliseconds
            timeout (Optional[int]): Request timeout in milliseconds
            location (Optional[LocationConfig]): Location configuration
            mobile (Optional[bool]): Use mobile user agent
            skip_tls_verification (Optional[bool]): Skip TLS verification
            remove_base64_images (Optional[bool]): Remove base64 encoded images
            block_ads (Optional[bool]): Block advertisements
            proxy (Optional[Literal]): Proxy type to use
            extract (Optional[ExtractConfig]): Content extraction config
            json_options (Optional[ExtractConfig]): JSON extraction config
            actions (Optional[List[Union]]): Actions to perform
            agent (Optional[AgentOptions]): Agent configuration
            poll_interval (Optional[int]): Seconds between status checks (default: 2)
            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
            **kwargs: Additional parameters to pass to the API

        Returns:
            BatchScrapeStatusResponse with:
            * Scraping status and progress
            * Scraped content for each URL
            * Success/error information

        Raises:
            Exception: If batch scrape fails
        """
        scrape_params = {}

        # Add individual parameters
        if formats is not None:
            scrape_params['formats'] = formats
        if headers is not None:
            scrape_params['headers'] = headers
        if include_tags is not None:
            scrape_params['includeTags'] = include_tags
        if exclude_tags is not None:
            scrape_params['excludeTags'] = exclude_tags
        if only_main_content is not None:
            scrape_params['onlyMainContent'] = only_main_content
        if wait_for is not None:
            scrape_params['waitFor'] = wait_for
        if timeout is not None:
            scrape_params['timeout'] = timeout
        if location is not None:
            scrape_params['location'] = location.dict(exclude_none=True)
        if mobile is not None:
            scrape_params['mobile'] = mobile
        if skip_tls_verification is not None:
            scrape_params['skipTlsVerification'] = skip_tls_verification
        if remove_base64_images is not None:
            scrape_params['removeBase64Images'] = remove_base64_images
        if block_ads is not None:
            scrape_params['blockAds'] = block_ads
        if proxy is not None:
            scrape_params['proxy'] = proxy
        if extract is not None:
            if hasattr(extract.schema, 'schema'):
                extract.schema = extract.schema.schema()
            scrape_params['extract'] = extract.dict(exclude_none=True)
        if json_options is not None:
            if hasattr(json_options.schema, 'schema'):
                json_options.schema = json_options.schema.schema()
            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
        if actions is not None:
            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
        if agent is not None:
            scrape_params['agent'] = agent.dict(exclude_none=True)

        # Add any additional kwargs
        scrape_params.update(kwargs)

        # Create final params object
        final_params = ScrapeParams(**scrape_params)
        params_dict = final_params.dict(exclude_none=True)
        params_dict['urls'] = urls
        params_dict['origin'] = f"python-sdk@{version}"

        # Make request
        headers = self._prepare_headers(idempotency_key)
        response = await self._async_post_request(
            f'{self.api_url}/v1/batch/scrape',
            params_dict,
            headers
        )

        # _async_post_request returns the parsed JSON body as a dict
        if response.get('success') and 'id' in response:
            return await self._async_monitor_job_status(response['id'], headers, poll_interval)
        else:
            raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')

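A short usage sketch for batch_scrape_urls, which blocks until the job finishes and returns the aggregated status payload. Again, the AsyncFirecrawlApp export name and the key are assumptions:

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    status = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        poll_interval=2,
    )
    # status mirrors BatchScrapeStatusResponse: status/completed/total/data
    print(status.get("completed"), "of", status.get("total"), "pages scraped")

asyncio.run(main())
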
    async def async_batch_scrape_urls(
        self,
        urls: List[str],
        *,
        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
        headers: Optional[Dict[str, str]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
        only_main_content: Optional[bool] = None,
        wait_for: Optional[int] = None,
        timeout: Optional[int] = None,
        location: Optional[LocationConfig] = None,
        mobile: Optional[bool] = None,
        skip_tls_verification: Optional[bool] = None,
        remove_base64_images: Optional[bool] = None,
        block_ads: Optional[bool] = None,
        proxy: Optional[Literal["basic", "stealth"]] = None,
        extract: Optional[ExtractConfig] = None,
        json_options: Optional[ExtractConfig] = None,
        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
        agent: Optional[AgentOptions] = None,
        idempotency_key: Optional[str] = None,
        **kwargs
    ) -> BatchScrapeResponse:
        """
        Initiate a batch scrape job asynchronously.

        Args:
            urls (List[str]): URLs to scrape
            formats (Optional[List[Literal]]): Content formats to retrieve
            headers (Optional[Dict[str, str]]): Custom HTTP headers
            include_tags (Optional[List[str]]): HTML tags to include
            exclude_tags (Optional[List[str]]): HTML tags to exclude
            only_main_content (Optional[bool]): Extract main content only
            wait_for (Optional[int]): Wait time in milliseconds
            timeout (Optional[int]): Request timeout in milliseconds
            location (Optional[LocationConfig]): Location configuration
            mobile (Optional[bool]): Use mobile user agent
            skip_tls_verification (Optional[bool]): Skip TLS verification
            remove_base64_images (Optional[bool]): Remove base64 encoded images
            block_ads (Optional[bool]): Block advertisements
            proxy (Optional[Literal]): Proxy type to use
            extract (Optional[ExtractConfig]): Content extraction config
            json_options (Optional[ExtractConfig]): JSON extraction config
            actions (Optional[List[Union]]): Actions to perform
            agent (Optional[AgentOptions]): Agent configuration
            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
            **kwargs: Additional parameters to pass to the API

        Returns:
            BatchScrapeResponse with:
            * success - Whether job started successfully
            * id - Unique identifier for the job
            * url - Status check URL
            * error - Error message if start failed

        Raises:
            Exception: If job initiation fails
        """
        scrape_params = {}

        # Add individual parameters
        if formats is not None:
            scrape_params['formats'] = formats
        if headers is not None:
            scrape_params['headers'] = headers
        if include_tags is not None:
            scrape_params['includeTags'] = include_tags
        if exclude_tags is not None:
            scrape_params['excludeTags'] = exclude_tags
        if only_main_content is not None:
            scrape_params['onlyMainContent'] = only_main_content
        if wait_for is not None:
            scrape_params['waitFor'] = wait_for
        if timeout is not None:
            scrape_params['timeout'] = timeout
        if location is not None:
            scrape_params['location'] = location.dict(exclude_none=True)
        if mobile is not None:
            scrape_params['mobile'] = mobile
        if skip_tls_verification is not None:
            scrape_params['skipTlsVerification'] = skip_tls_verification
        if remove_base64_images is not None:
            scrape_params['removeBase64Images'] = remove_base64_images
        if block_ads is not None:
            scrape_params['blockAds'] = block_ads
        if proxy is not None:
            scrape_params['proxy'] = proxy
        if extract is not None:
            if hasattr(extract.schema, 'schema'):
                extract.schema = extract.schema.schema()
            scrape_params['extract'] = extract.dict(exclude_none=True)
        if json_options is not None:
            if hasattr(json_options.schema, 'schema'):
                json_options.schema = json_options.schema.schema()
            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
        if actions is not None:
            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
        if agent is not None:
            scrape_params['agent'] = agent.dict(exclude_none=True)

        # Add any additional kwargs
        scrape_params.update(kwargs)

        # Create final params object
        final_params = ScrapeParams(**scrape_params)
        params_dict = final_params.dict(exclude_none=True)
        params_dict['urls'] = urls
        params_dict['origin'] = f"python-sdk@{version}"

        # Make request
        headers = self._prepare_headers(idempotency_key)
        response = await self._async_post_request(
            f'{self.api_url}/v1/batch/scrape',
            params_dict,
            headers
        )

        # _async_post_request returns the parsed JSON body as a dict
        if response.get('success'):
            return BatchScrapeResponse(**response)
        else:
            raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')

    async def crawl_url(
        self,
        url: str,
        *,
        include_paths: Optional[List[str]] = None,
        exclude_paths: Optional[List[str]] = None,
        max_depth: Optional[int] = None,
        max_discovery_depth: Optional[int] = None,
        limit: Optional[int] = None,
        allow_backward_links: Optional[bool] = None,
        allow_external_links: Optional[bool] = None,
        ignore_sitemap: Optional[bool] = None,
        scrape_options: Optional[CommonOptions] = None,
        webhook: Optional[Union[str, WebhookConfig]] = None,
        deduplicate_similar_urls: Optional[bool] = None,
        ignore_query_parameters: Optional[bool] = None,
        regex_on_full_url: Optional[bool] = None,
        poll_interval: Optional[int] = 2,
        idempotency_key: Optional[str] = None,
        **kwargs
    ) -> CrawlStatusResponse:
        """
        Crawl a website starting from a URL.

        Args:
            url (str): Target URL to start crawling from
            include_paths (Optional[List[str]]): Patterns of URLs to include
            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
            max_depth (Optional[int]): Maximum crawl depth
            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
            limit (Optional[int]): Maximum pages to crawl
            allow_backward_links (Optional[bool]): Follow parent directory links
            allow_external_links (Optional[bool]): Follow external domain links
            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
            scrape_options (Optional[CommonOptions]): Page scraping configuration
            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
            ignore_query_parameters (Optional[bool]): Ignore URL parameters
            regex_on_full_url (Optional[bool]): Apply regex to full URLs
            poll_interval (Optional[int]): Seconds between status checks (default: 2)
            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
            **kwargs: Additional parameters to pass to the API

        Returns:
            CrawlStatusResponse with:
            * Crawling status and progress
            * Crawled page contents
            * Success/error information

        Raises:
            Exception: If crawl fails
        """
        crawl_params = {}

        # Add individual parameters
        if include_paths is not None:
            crawl_params['includePaths'] = include_paths
        if exclude_paths is not None:
            crawl_params['excludePaths'] = exclude_paths
        if max_depth is not None:
            crawl_params['maxDepth'] = max_depth
        if max_discovery_depth is not None:
            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
        if limit is not None:
            crawl_params['limit'] = limit
        if allow_backward_links is not None:
            crawl_params['allowBackwardLinks'] = allow_backward_links
        if allow_external_links is not None:
            crawl_params['allowExternalLinks'] = allow_external_links
        if ignore_sitemap is not None:
            crawl_params['ignoreSitemap'] = ignore_sitemap
        if scrape_options is not None:
            crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
        if webhook is not None:
            crawl_params['webhook'] = webhook
        if deduplicate_similar_urls is not None:
            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
        if ignore_query_parameters is not None:
            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
        if regex_on_full_url is not None:
            crawl_params['regexOnFullURL'] = regex_on_full_url

        # Add any additional kwargs
        crawl_params.update(kwargs)

        # Create final params object
        final_params = CrawlParams(**crawl_params)
        params_dict = final_params.dict(exclude_none=True)
        params_dict['url'] = url
        params_dict['origin'] = f"python-sdk@{version}"

        # Make request
        headers = self._prepare_headers(idempotency_key)
        response = await self._async_post_request(
            f'{self.api_url}/v1/crawl', params_dict, headers)

        # _async_post_request returns the parsed JSON body as a dict
        if response.get('success') and 'id' in response:
            return await self._async_monitor_job_status(response['id'], headers, poll_interval)
        else:
            raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')

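A brief usage sketch for the blocking crawl_url method above; export name, key, and the response field names are assumptions based on the v1 crawl payload described in the docstrings:

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.crawl_url(
        "https://example.com",
        include_paths=["/blog/.*"],
        limit=10,
        poll_interval=2,
    )
    # result mirrors CrawlStatusResponse; each entry in data is a crawled document
    for page in result.get("data", []):
        print(page.get("metadata", {}).get("sourceURL"))

asyncio.run(main())
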
    async def async_crawl_url(
        self,
        url: str,
        *,
        include_paths: Optional[List[str]] = None,
        exclude_paths: Optional[List[str]] = None,
        max_depth: Optional[int] = None,
        max_discovery_depth: Optional[int] = None,
        limit: Optional[int] = None,
        allow_backward_links: Optional[bool] = None,
        allow_external_links: Optional[bool] = None,
        ignore_sitemap: Optional[bool] = None,
        scrape_options: Optional[CommonOptions] = None,
        webhook: Optional[Union[str, WebhookConfig]] = None,
        deduplicate_similar_urls: Optional[bool] = None,
        ignore_query_parameters: Optional[bool] = None,
        regex_on_full_url: Optional[bool] = None,
        idempotency_key: Optional[str] = None,
        **kwargs
    ) -> CrawlResponse:
        """
        Start an asynchronous crawl job.

        Args:
            url (str): Target URL to start crawling from
            include_paths (Optional[List[str]]): Patterns of URLs to include
            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
            max_depth (Optional[int]): Maximum crawl depth
            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
            limit (Optional[int]): Maximum pages to crawl
            allow_backward_links (Optional[bool]): Follow parent directory links
            allow_external_links (Optional[bool]): Follow external domain links
            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
            scrape_options (Optional[CommonOptions]): Page scraping configuration
            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
            ignore_query_parameters (Optional[bool]): Ignore URL parameters
            regex_on_full_url (Optional[bool]): Apply regex to full URLs
            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
            **kwargs: Additional parameters to pass to the API

        Returns:
            CrawlResponse with:
            * success - Whether crawl started successfully
            * id - Unique identifier for the crawl job
            * url - Status check URL for the crawl
            * error - Error message if start failed

        Raises:
            Exception: If crawl initiation fails
        """
        crawl_params = {}

        # Add individual parameters
        if include_paths is not None:
            crawl_params['includePaths'] = include_paths
        if exclude_paths is not None:
            crawl_params['excludePaths'] = exclude_paths
        if max_depth is not None:
            crawl_params['maxDepth'] = max_depth
        if max_discovery_depth is not None:
            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
        if limit is not None:
            crawl_params['limit'] = limit
        if allow_backward_links is not None:
            crawl_params['allowBackwardLinks'] = allow_backward_links
        if allow_external_links is not None:
            crawl_params['allowExternalLinks'] = allow_external_links
        if ignore_sitemap is not None:
            crawl_params['ignoreSitemap'] = ignore_sitemap
        if scrape_options is not None:
            crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
        if webhook is not None:
            crawl_params['webhook'] = webhook
        if deduplicate_similar_urls is not None:
            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
        if ignore_query_parameters is not None:
            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
        if regex_on_full_url is not None:
            crawl_params['regexOnFullURL'] = regex_on_full_url

        # Add any additional kwargs
        crawl_params.update(kwargs)

        # Create final params object
        final_params = CrawlParams(**crawl_params)
        params_dict = final_params.dict(exclude_none=True)
        params_dict['url'] = url
        params_dict['origin'] = f"python-sdk@{version}"

        # Make request
        headers = self._prepare_headers(idempotency_key)
        response = await self._async_post_request(
            f'{self.api_url}/v1/crawl',
            params_dict,
            headers
        )

        # _async_post_request returns the parsed JSON body as a dict
        if response.get('success'):
            return CrawlResponse(**response)
        else:
            raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')

    async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
        """
        Check the status and results of an asynchronous crawl job.

        Args:
            id (str): Unique identifier for the crawl job

        Returns:
            CrawlStatusResponse containing:
                Status Information:
                * status - Current state (scraping/completed/failed/cancelled)
                * completed - Number of pages crawled
                * total - Total pages to crawl
                * creditsUsed - API credits consumed
                * expiresAt - Data expiration timestamp

                Results:
                * data - List of crawled documents
                * next - URL for next page of results (if paginated)
                * success - Whether status check succeeded
                * error - Error message if failed

        Raises:
            Exception: If status check fails
        """
        headers = self._prepare_headers()
        endpoint = f'/v1/crawl/{id}'

        status_data = await self._async_get_request(
            f'{self.api_url}{endpoint}',
            headers
        )

        if status_data['status'] == 'completed':
            if 'data' in status_data:
                data = status_data['data']
                while 'next' in status_data:
                    if len(status_data['data']) == 0:
                        break
                    next_url = status_data.get('next')
                    if not next_url:
                        logger.warning("Expected 'next' URL is missing.")
                        break
                    next_data = await self._async_get_request(next_url, headers)
                    data.extend(next_data.get('data', []))
                    status_data = next_data
                status_data['data'] = data

        response = {
            'status': status_data.get('status'),
            'total': status_data.get('total'),
            'completed': status_data.get('completed'),
            'creditsUsed': status_data.get('creditsUsed'),
            'expiresAt': status_data.get('expiresAt'),
            'data': status_data.get('data')
        }

        if 'error' in status_data:
            response['error'] = status_data['error']

        if 'next' in status_data:
            response['next'] = status_data['next']

        return {
            'success': False if 'error' in status_data else True,
            **response
        }

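For callers that prefer not to block in crawl_url, the async_crawl_url/check_crawl_status pair supports a fire-and-poll pattern. A minimal sketch, with the export name and key assumed as before:

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    job = await app.async_crawl_url("https://example.com", limit=5)
    # Poll the job ourselves instead of blocking in crawl_url()
    while True:
        status = await app.check_crawl_status(job.id)
        if status.get("status") in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(5)
    print(len(status.get("data") or []), "pages crawled")

asyncio.run(main())
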
    async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
        """
        Monitor the status of an asynchronous job until completion.

        Args:
            id (str): The ID of the job to monitor
            headers (Dict[str, str]): Headers to include in status check requests
            poll_interval (int): Seconds between status checks (default: 2)

        Returns:
            CrawlStatusResponse: The job results if completed successfully

        Raises:
            Exception: If the job fails or an error occurs during status checks
        """
        while True:
            status_data = await self._async_get_request(
                f'{self.api_url}/v1/crawl/{id}',
                headers
            )

            if status_data['status'] == 'completed':
                if 'data' in status_data:
                    data = status_data['data']
                    while 'next' in status_data:
                        if len(status_data['data']) == 0:
                            break
                        next_url = status_data.get('next')
                        if not next_url:
                            logger.warning("Expected 'next' URL is missing.")
                            break
                        next_data = await self._async_get_request(next_url, headers)
                        data.extend(next_data.get('data', []))
                        status_data = next_data
                    status_data['data'] = data
                    return status_data
                else:
                    raise Exception('Job completed but no data was returned')
            elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
                await asyncio.sleep(max(poll_interval, 2))
            else:
                raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')

    async def map_url(
        self,
        url: str,
        params: Optional[MapParams] = None) -> MapResponse:
        """
        Asynchronously map and discover links from a URL.

        Args:
            url (str): Target URL to map
            params (Optional[MapParams]): See MapParams model:
                Discovery Options:
                * search - Filter pattern for URLs
                * ignoreSitemap - Skip sitemap.xml
                * includeSubdomains - Include subdomain links
                * sitemapOnly - Only use sitemap.xml

                Limits:
                * limit - Max URLs to return
                * timeout - Request timeout (ms)

        Returns:
            MapResponse with:
            * Discovered URLs
            * Success/error status

        Raises:
            Exception: If mapping fails
        """
        headers = self._prepare_headers()
        json_data = {'url': url}
        if params:
            json_data.update(params)
        json_data['origin'] = f"python-sdk@{version}"

        endpoint = f'/v1/map'
        response = await self._async_post_request(
            f'{self.api_url}{endpoint}',
            json_data,
            headers
        )

        if response.get('success') and 'links' in response:
            return response
        elif 'error' in response:
            raise Exception(f'Failed to map URL. Error: {response["error"]}')
        else:
            raise Exception(f'Failed to map URL. Error: {response}')

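Since map_url merges the params mapping directly into the request body, a plain dict of MapParams-style keys works as a usage sketch (export name and key assumed):

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.map_url(
        "https://example.com",
        params={"search": "docs", "limit": 50},
    )
    # result carries the discovered URLs under 'links'
    for link in result.get("links", []):
        print(link)

asyncio.run(main())
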
    async def extract(
        self,
        urls: List[str],
        params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
        """
        Asynchronously extract structured information from URLs.

        Args:
            urls (List[str]): URLs to extract from
            params (Optional[ExtractParams]): See ExtractParams model:
                Extraction Config:
                * prompt - Custom extraction prompt
                * schema - JSON schema/Pydantic model
                * systemPrompt - System context

                Behavior Options:
                * allowExternalLinks - Follow external links
                * enableWebSearch - Enable web search
                * includeSubdomains - Include subdomains
                * showSources - Include source URLs

                Scraping Options:
                * scrapeOptions - Page scraping config

        Returns:
            ExtractResponse with:
            * Structured data matching schema
            * Source information if requested
            * Success/error status

        Raises:
            ValueError: If prompt/schema missing or extraction fails
        """
        headers = self._prepare_headers()

        if not params or (not params.get('prompt') and not params.get('schema')):
            raise ValueError("Either prompt or schema is required")

        schema = params.get('schema')
        if schema:
            if hasattr(schema, 'model_json_schema'):
                schema = schema.model_json_schema()

        request_data = {
            'urls': urls,
            'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)),
            'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)),
            'showSources': params.get('show_sources', params.get('showSources', False)),
            'schema': schema,
            'origin': f'python-sdk@{version}'
        }

        if params.get('prompt'):
            request_data['prompt'] = params['prompt']
        if params.get('system_prompt'):
            request_data['systemPrompt'] = params['system_prompt']
        elif params.get('systemPrompt'):
            request_data['systemPrompt'] = params['systemPrompt']

        response = await self._async_post_request(
            f'{self.api_url}/v1/extract',
            request_data,
            headers
        )

        if response.get('success'):
            job_id = response.get('id')
            if not job_id:
                raise Exception('Job ID not returned from extract request.')

            while True:
                status_data = await self._async_get_request(
                    f'{self.api_url}/v1/extract/{job_id}',
                    headers
                )

                if status_data['status'] == 'completed':
                    return status_data
                elif status_data['status'] in ['failed', 'cancelled']:
                    raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')

                await asyncio.sleep(2)
        else:
            raise Exception(f'Failed to extract. Error: {response.get("error")}')

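Because extract converts a Pydantic model via model_json_schema() before sending it, a schema can be passed as a model class. A usage sketch under the same export-name and key assumptions:

import asyncio
from pydantic import BaseModel
from firecrawl import AsyncFirecrawlApp  # assumed export name

class ArticleInfo(BaseModel):
    title: str
    author: str

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.extract(
        ["https://example.com/article"],
        params={
            "prompt": "Extract the article title and author",
            "schema": ArticleInfo,  # converted to JSON schema by the SDK
        },
    )
    print(result.get("data"))

asyncio.run(main())
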
    async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse:
        """
        Check the status of an asynchronous batch scrape job.

        Args:
            id (str): The ID of the batch scrape job

        Returns:
            BatchScrapeStatusResponse containing:
                Status Information:
                * status - Current state (scraping/completed/failed/cancelled)
                * completed - Number of URLs scraped
                * total - Total URLs to scrape
                * creditsUsed - API credits consumed
                * expiresAt - Data expiration timestamp

                Results:
                * data - List of scraped documents
                * next - URL for next page of results (if paginated)
                * success - Whether status check succeeded
                * error - Error message if failed

        Raises:
            Exception: If status check fails
        """
        headers = self._prepare_headers()
        endpoint = f'/v1/batch/scrape/{id}'

        status_data = await self._async_get_request(
            f'{self.api_url}{endpoint}',
            headers
        )

        if status_data['status'] == 'completed':
            if 'data' in status_data:
                data = status_data['data']
                while 'next' in status_data:
                    if len(status_data['data']) == 0:
                        break
                    next_url = status_data.get('next')
                    if not next_url:
                        logger.warning("Expected 'next' URL is missing.")
                        break
                    next_data = await self._async_get_request(next_url, headers)
                    data.extend(next_data.get('data', []))
                    status_data = next_data
                status_data['data'] = data

        response = {
            'status': status_data.get('status'),
            'total': status_data.get('total'),
            'completed': status_data.get('completed'),
            'creditsUsed': status_data.get('creditsUsed'),
            'expiresAt': status_data.get('expiresAt'),
            'data': status_data.get('data')
        }

        if 'error' in status_data:
            response['error'] = status_data['error']

        if 'next' in status_data:
            response['next'] = status_data['next']

        return {
            'success': False if 'error' in status_data else True,
            **response
        }

    async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
        """
        Get information about errors from an asynchronous batch scrape job.

        Args:
            id (str): The ID of the batch scrape job

        Returns:
            CrawlErrorsResponse containing:
                errors (List[Dict[str, str]]): List of errors with fields:
                * id (str): Error ID
                * timestamp (str): When the error occurred
                * url (str): URL that caused the error
                * error (str): Error message
                robotsBlocked (List[str]): List of URLs blocked by robots.txt

        Raises:
            Exception: If error check fails
        """
        headers = self._prepare_headers()
        return await self._async_get_request(
            f'{self.api_url}/v1/batch/scrape/{id}/errors',
            headers
        )

    async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse:
        """
        Get information about errors from an asynchronous crawl job.

        Args:
            id (str): The ID of the crawl job

        Returns:
            CrawlErrorsResponse containing:
                * errors (List[Dict[str, str]]): List of errors with fields:
                    - id (str): Error ID
                    - timestamp (str): When the error occurred
                    - url (str): URL that caused the error
                    - error (str): Error message
                * robotsBlocked (List[str]): List of URLs blocked by robots.txt

        Raises:
            Exception: If error check fails
        """
        headers = self._prepare_headers()
        return await self._async_get_request(
            f'{self.api_url}/v1/crawl/{id}/errors',
            headers
        )

    async def cancel_crawl(self, id: str) -> Dict[str, Any]:
        """
        Cancel an asynchronous crawl job.

        Args:
            id (str): The ID of the crawl job to cancel

        Returns:
            Dict[str, Any] containing:
            * success (bool): Whether cancellation was successful
            * error (str, optional): Error message if cancellation failed

        Raises:
            Exception: If cancellation fails
        """
        headers = self._prepare_headers()
        async with aiohttp.ClientSession() as session:
            async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
                return await response.json()

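A small sketch combining async_crawl_url with cancel_crawl; the exact shape of the cancellation payload is whatever DELETE /v1/crawl/{id} returns, so it is just printed here (export name and key assumed):

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    job = await app.async_crawl_url("https://example.com", limit=100)
    cancelled = await app.cancel_crawl(job.id)
    print(cancelled)  # response body from DELETE /v1/crawl/{id}

asyncio.run(main())
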
    async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
        """
        Check the status of an asynchronous extraction job.

        Args:
            job_id (str): The ID of the extraction job

        Returns:
            ExtractResponse[Any] with:
            * success (bool): Whether request succeeded
            * data (Optional[Any]): Extracted data matching schema
            * error (Optional[str]): Error message if any
            * warning (Optional[str]): Warning message if any
            * sources (Optional[List[str]]): Source URLs if requested

        Raises:
            ValueError: If status check fails
        """
        headers = self._prepare_headers()
        try:
            return await self._async_get_request(
                f'{self.api_url}/v1/extract/{job_id}',
                headers
            )
        except Exception as e:
            raise ValueError(str(e))

    async def async_extract(
        self,
        urls: Optional[List[str]] = None,
        *,
        prompt: Optional[str] = None,
        schema: Optional[Any] = None,
        system_prompt: Optional[str] = None,
        allow_external_links: Optional[bool] = False,
        enable_web_search: Optional[bool] = False,
        show_sources: Optional[bool] = False,
        agent: Optional[Dict[str, Any]] = None,
        idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
        """
        Initiate an asynchronous extraction job without waiting for completion.

        Args:
            urls (Optional[List[str]]): URLs to extract from
            prompt (Optional[str]): Custom extraction prompt
            schema (Optional[Any]): JSON schema/Pydantic model
            system_prompt (Optional[str]): System context
            allow_external_links (Optional[bool]): Follow external links
            enable_web_search (Optional[bool]): Enable web search
            show_sources (Optional[bool]): Include source URLs
            agent (Optional[Dict[str, Any]]): Agent configuration
            idempotency_key (Optional[str]): Unique key to prevent duplicate requests

        Returns:
            ExtractResponse[Any] with:
            * success (bool): Whether request succeeded
            * data (Optional[Any]): Extracted data matching schema
            * error (Optional[str]): Error message if any

        Raises:
            ValueError: If job initiation fails
        """
        headers = self._prepare_headers(idempotency_key)

        if not prompt and not schema:
            raise ValueError("Either prompt or schema is required")

        if not urls and not prompt:
            raise ValueError("Either urls or prompt is required")

        if schema:
            if hasattr(schema, 'model_json_schema'):
                schema = schema.model_json_schema()

        request_data = {
            'urls': urls or [],
            'allowExternalLinks': allow_external_links,
            'enableWebSearch': enable_web_search,
            'showSources': show_sources,
            'schema': schema,
            'origin': f'python-sdk@{version}'
        }

        if prompt:
            request_data['prompt'] = prompt
        if system_prompt:
            request_data['systemPrompt'] = system_prompt
        if agent:
            request_data['agent'] = agent

        try:
            return await self._async_post_request(
                f'{self.api_url}/v1/extract',
                request_data,
                headers
            )
        except Exception as e:
            raise ValueError(str(e))

    async def generate_llms_text(
        self,
        url: str,
        *,
        max_urls: Optional[int] = None,
        show_full_text: Optional[bool] = None,
        experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
        """
        Generate LLMs.txt for a given URL and monitor until completion.

        Args:
            url (str): Target URL to generate LLMs.txt from
            max_urls (Optional[int]): Maximum URLs to process (default: 10)
            show_full_text (Optional[bool]): Include full text in output (default: False)
            experimental_stream (Optional[bool]): Enable experimental streaming

        Returns:
            GenerateLLMsTextStatusResponse containing:
            * success (bool): Whether generation completed successfully
            * status (str): Status of generation (processing/completed/failed)
            * data (Dict[str, str], optional): Generated text with fields:
                - llmstxt (str): Generated LLMs.txt content
                - llmsfulltxt (str, optional): Full version if requested
            * error (str, optional): Error message if generation failed
            * expiresAt (str): When the generated data expires

        Raises:
            Exception: If generation fails
        """
        params = {}
        if max_urls is not None:
            params['maxUrls'] = max_urls
        if show_full_text is not None:
            params['showFullText'] = show_full_text
        if experimental_stream is not None:
            params['__experimental_stream'] = experimental_stream

        response = await self.async_generate_llms_text(
            url,
            max_urls=max_urls,
            show_full_text=show_full_text,
            experimental_stream=experimental_stream
        )
        if not response.get('success') or 'id' not in response:
            return response

        job_id = response['id']
        while True:
            status = await self.check_generate_llms_text_status(job_id)

            if status['status'] == 'completed':
                return status
            elif status['status'] == 'failed':
                raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
            elif status['status'] != 'processing':
                break

            await asyncio.sleep(2)

        return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}

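A short usage sketch for generate_llms_text, which starts the job and polls check_generate_llms_text_status until it completes (export name and key assumed):

import asyncio
from firecrawl import AsyncFirecrawlApp  # assumed export name

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    result = await app.generate_llms_text(
        "https://example.com",
        max_urls=5,
        show_full_text=True,
    )
    if result.get("success"):
        # data.llmstxt holds the generated LLMs.txt content per the docstring above
        print(result["data"]["llmstxt"][:500])

asyncio.run(main())
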
    async def async_generate_llms_text(
        self,
        url: str,
        *,
        max_urls: Optional[int] = None,
        show_full_text: Optional[bool] = None,
        experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
        """
        Initiate an asynchronous LLMs.txt generation job without waiting for completion.

        Args:
            url (str): Target URL to generate LLMs.txt from
            max_urls (Optional[int]): Maximum URLs to process (default: 10)
            show_full_text (Optional[bool]): Include full text in output (default: False)
            experimental_stream (Optional[bool]): Enable experimental streaming

        Returns:
            GenerateLLMsTextResponse containing:
            * success (bool): Whether job started successfully
            * id (str): Unique identifier for the job
            * error (str, optional): Error message if start failed

        Raises:
            ValueError: If job initiation fails
        """
        params = {}
        if max_urls is not None:
            params['maxUrls'] = max_urls
        if show_full_text is not None:
            params['showFullText'] = show_full_text
        if experimental_stream is not None:
            params['__experimental_stream'] = experimental_stream

        headers = self._prepare_headers()
        # params is a plain dict here, so it can be merged directly
        json_data = {'url': url, **params}
        json_data['origin'] = f"python-sdk@{version}"

        try:
            return await self._async_post_request(
                f'{self.api_url}/v1/llmstxt',
                json_data,
                headers
            )
        except Exception as e:
            raise ValueError(str(e))

+    async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
+        """
+        Check the status of an asynchronous LLMs.txt generation job.
+
+        Args:
+            id (str): The ID of the generation job
+
+        Returns:
+            GenerateLLMsTextStatusResponse containing:
+            * success (bool): Whether generation completed successfully
+            * status (str): Status of generation (processing/completed/failed)
+            * data (Dict[str, str], optional): Generated text with fields:
+                - llmstxt (str): Generated LLMs.txt content
+                - llmsfulltxt (str, optional): Full version if requested
+            * error (str, optional): Error message if generation failed
+            * expiresAt (str): When the generated data expires
+
+        Raises:
+            ValueError: If status check fails
+        """
+        headers = self._prepare_headers()
+        try:
+            return await self._async_get_request(
+                f'{self.api_url}/v1/llmstxt/{id}',
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
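Callers that want to control polling themselves can pair the two methods above. A sketch, reusing an `app` instance created as in the earlier example:

import asyncio

async def poll_llms_text(app, url: str):
    # Start the job without waiting for it to finish.
    job = await app.async_generate_llms_text(url, show_full_text=True)
    if not job.get("success"):
        raise RuntimeError(job.get("error", "failed to start LLMs.txt generation"))

    # Poll at a caller-chosen interval instead of the built-in 2-second loop.
    while True:
        status = await app.check_generate_llms_text_status(job["id"])
        if status["status"] != "processing":
            break
        await asyncio.sleep(5)
    return status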
+    async def deep_research(
+        self,
+        query: str,
+        *,
+        max_depth: Optional[int] = None,
+        time_limit: Optional[int] = None,
+        max_urls: Optional[int] = None,
+        analysis_prompt: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        __experimental_stream_steps: Optional[bool] = None,
+        on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
+        on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
+        """
+        Initiates a deep research operation on a given query and polls until completion.
+
+        Args:
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+            on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
+            on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
+
+        Returns:
+            DeepResearchStatusResponse containing:
+            * success (bool): Whether research completed successfully
+            * status (str): Current state (processing/completed/failed)
+            * error (Optional[str]): Error message if failed
+            * id (str): Unique identifier for the research job
+            * data (Any): Research findings and analysis
+            * sources (List[Dict]): List of discovered sources
+            * activities (List[Dict]): Research progress log
+            * summaries (List[str]): Generated research summaries
+
+        Raises:
+            Exception: If research fails
+        """
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)
+
+        response = await self.async_deep_research(
+            query,
+            max_depth=max_depth,
+            time_limit=time_limit,
+            max_urls=max_urls,
+            analysis_prompt=analysis_prompt,
+            system_prompt=system_prompt
+        )
+        if not response.get('success') or 'id' not in response:
+            return response
+
+        job_id = response['id']
+        last_activity_count = 0
+        last_source_count = 0
+
+        while True:
+            status = await self.check_deep_research_status(job_id)
+
+            if on_activity and 'activities' in status:
+                new_activities = status['activities'][last_activity_count:]
+                for activity in new_activities:
+                    on_activity(activity)
+                last_activity_count = len(status['activities'])
+
+            if on_source and 'sources' in status:
+                new_sources = status['sources'][last_source_count:]
+                for source in new_sources:
+                    on_source(source)
+                last_source_count = len(status['sources'])
+
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise Exception(f'Deep research failed. Error: {status.get("error")}')
+            elif status['status'] != 'processing':
+                break
+
+            await asyncio.sleep(2)
+
+        return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
+
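A sketch of driving deep_research with the progress callbacks documented above, again assuming an AsyncFirecrawlApp instance named `app`:

async def run_research(app):
    def on_activity(activity):
        # Activity dicts carry type/status/message/timestamp/depth per the docstring.
        print(f"[{activity.get('type')}] {activity.get('message')}")

    def on_source(source):
        print("source:", source.get("url"))

    status = await app.deep_research(
        "how do solid-state batteries compare to lithium-ion?",
        max_depth=3,
        time_limit=120,
        on_activity=on_activity,
        on_source=on_source,
    )
    if status.get("success"):
        print(status.get("data"))                      # findings and analysis
        print(len(status.get("sources", [])), "sources")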
+    async def async_deep_research(
+        self,
+        query: str,
+        *,
+        max_depth: Optional[int] = None,
+        time_limit: Optional[int] = None,
+        max_urls: Optional[int] = None,
+        analysis_prompt: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
+        """
+        Initiates an asynchronous deep research operation.
+
+        Args:
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+
+        Returns:
+            Dict[str, Any]: A response containing:
+            * success (bool): Whether the research initiation was successful
+            * id (str): The unique identifier for the research job
+            * error (str, optional): Error message if initiation failed
+
+        Raises:
+            Exception: If the research initiation fails.
+        """
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)
+
+        headers = self._prepare_headers()
+
+        json_data = {'query': query, **research_params.dict(exclude_none=True)}
+        json_data['origin'] = f"python-sdk@{version}"
+
+        try:
+            return await self._async_post_request(
+                f'{self.api_url}/v1/deep-research',
+                json_data,
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
+    async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
+        """
+        Check the status of a deep research operation.
+
+        Args:
+            id (str): The ID of the deep research operation.
+
+        Returns:
+            DeepResearchResponse containing:
+
+            Status:
+            * success - Whether research completed successfully
+            * status - Current state (processing/completed/failed)
+            * error - Error message if failed
+
+            Results:
+            * id - Unique identifier for the research job
+            * data - Research findings and analysis
+            * sources - List of discovered sources
+            * activities - Research progress log
+            * summaries - Generated research summaries
+
+        Raises:
+            Exception: If the status check fails.
+        """
+        headers = self._prepare_headers()
+        try:
+            return await self._async_get_request(
+                f'{self.api_url}/v1/deep-research/{id}',
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
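The fire-and-forget variant pairs with the status check above when the built-in 2-second polling loop is not wanted. A brief sketch, with the same `app` assumption as before:

async def start_and_check(app):
    job = await app.async_deep_research("EU AI Act compliance timeline", max_urls=20)
    if not job.get("success"):
        raise RuntimeError(job.get("error"))
    # Check once now; the caller decides when (or whether) to poll again.
    status = await app.check_deep_research_status(job["id"])
    print(status["status"])  # processing / completed / failed
    return status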
+    async def search(
+        self,
+        query: str,
+        *,
+        limit: Optional[int] = None,
+        tbs: Optional[str] = None,
+        filter: Optional[str] = None,
+        lang: Optional[str] = None,
+        country: Optional[str] = None,
+        location: Optional[str] = None,
+        timeout: Optional[int] = None,
+        scrape_options: Optional[CommonOptions] = None,
+        params: Optional[Union[Dict[str, Any], SearchParams]] = None,
+        **kwargs) -> SearchResponse:
+        """
+        Asynchronously search for content using Firecrawl.
+
+        Args:
+            query (str): Search query string
+            limit (Optional[int]): Max results (default: 5)
+            tbs (Optional[str]): Time filter (e.g. "qdr:d")
+            filter (Optional[str]): Custom result filter
+            lang (Optional[str]): Language code (default: "en")
+            country (Optional[str]): Country code (default: "us")
+            location (Optional[str]): Geo-targeting
+            timeout (Optional[int]): Request timeout in milliseconds
+            scrape_options (Optional[CommonOptions]): Result scraping configuration
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
+            **kwargs: Additional keyword arguments for future compatibility
+
+        Returns:
+            SearchResponse: Response containing:
+            * success (bool): Whether request succeeded
+            * data (List[FirecrawlDocument]): Search results
+            * warning (Optional[str]): Warning message if any
+            * error (Optional[str]): Error message if any
+
+        Raises:
+            Exception: If search fails or response cannot be parsed
+        """
+        # Build search parameters
+        search_params = {}
+        if params:
+            if isinstance(params, dict):
+                search_params.update(params)
+            else:
+                search_params.update(params.dict(exclude_none=True))
+
+        # Add individual parameters
+        if limit is not None:
+            search_params['limit'] = limit
+        if tbs is not None:
+            search_params['tbs'] = tbs
+        if filter is not None:
+            search_params['filter'] = filter
+        if lang is not None:
+            search_params['lang'] = lang
+        if country is not None:
+            search_params['country'] = country
+        if location is not None:
+            search_params['location'] = location
+        if timeout is not None:
+            search_params['timeout'] = timeout
+        if scrape_options is not None:
+            search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+
+        # Add any additional kwargs
+        search_params.update(kwargs)
+
+        # Create final params object
+        final_params = SearchParams(query=query, **search_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        return await self._async_post_request(
+            f"{self.api_url}/v1/search",
+            params_dict,
+            {"Authorization": f"Bearer {self.api_key}"}
+        )
+
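A sketch of the async search call; the keyword arguments map onto the camelCase fields built above, and the results are assumed to come back as plain dicts:

async def run_search(app):
    results = await app.search(
        "firecrawl python sdk",
        limit=3,
        lang="en",
        country="us",
    )
    if results.get("success"):
        for doc in results.get("data", []):
            print(doc)
    else:
        print("search failed:", results.get("error"))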
+class AsyncCrawlWatcher(CrawlWatcher):
+    """
+    Async version of CrawlWatcher that properly handles async operations.
+    """
+    def __init__(self, id: str, app: AsyncFirecrawlApp):
+        super().__init__(id, app)
+
+    async def connect(self) -> None:
+        """
+        Establishes async WebSocket connection and starts listening for messages.
+        """
+        async with websockets.connect(
+            self.ws_url,
+            additional_headers=[("Authorization", f"Bearer {self.app.api_key}")]
+        ) as websocket:
+            await self._listen(websocket)
+
+    async def _listen(self, websocket) -> None:
+        """
+        Listens for incoming WebSocket messages and handles them asynchronously.
+
+        Args:
+            websocket: The WebSocket connection object
+        """
+        async for message in websocket:
+            msg = json.loads(message)
+            await self._handle_message(msg)
+
+    async def _handle_message(self, msg: Dict[str, Any]) -> None:
+        """
+        Handles incoming WebSocket messages based on their type asynchronously.
+
+        Args:
+            msg (Dict[str, Any]): The message to handle
+        """
+        if msg['type'] == 'done':
+            self.status = 'completed'
+            self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
+        elif msg['type'] == 'error':
+            self.status = 'failed'
+            self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
+        elif msg['type'] == 'catchup':
+            self.status = msg['data']['status']
+            self.data.extend(msg['data'].get('data', []))
+            for doc in self.data:
+                self.dispatch_event('document', {'data': doc, 'id': self.id})
+        elif msg['type'] == 'document':
+            self.data.append(msg['data'])
+            self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
+
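A sketch of watching an existing crawl job with the class above. It assumes the parent CrawlWatcher exposes add_event_listener (as the synchronous watcher does), that AsyncCrawlWatcher is importable from this module, and that the crawl id is a placeholder:

from firecrawl.firecrawl import AsyncCrawlWatcher  # assumed import path

async def watch_crawl(app, crawl_id: str):
    watcher = AsyncCrawlWatcher(crawl_id, app)
    # Handlers receive the detail dicts dispatched in _handle_message above.
    watcher.add_event_listener("document", lambda detail: print("doc for job", detail["id"]))
    watcher.add_event_listener("done", lambda detail: print("finished with status", detail["status"]))
    await watcher.connect()  # returns once the WebSocket closes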
+    async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
+        """
+        Handle errors from async API responses.
+        """
+        try:
+            error_data = await response.json()
+            error_message = error_data.get('error', 'No error message provided.')
+            error_details = error_data.get('details', 'No additional error details provided.')
+        except Exception:
+            raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}')
+
+        # Use the app's method to get the error message
+        message = await self.app._get_async_error_message(response.status, action, error_message, error_details)
+
+        raise aiohttp.ClientError(message)
+
+    async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
+        """
+        Generate a standardized error message based on HTTP status code for async operations.
+
+        Args:
+            status_code (int): The HTTP status code from the response
+            action (str): Description of the action that was being performed
+            error_message (str): The error message from the API response
+            error_details (str): Additional error details from the API response
+
+        Returns:
+            str: A formatted error message
+        """
+        return self._get_error_message(status_code, action, error_message, error_details)
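For callers, the error path above surfaces as exceptions rather than error payloads. A sketch of defensive handling around any of the async calls:

import aiohttp

async def safe_status(app, research_id: str):
    try:
        return await app.check_deep_research_status(research_id)
    except (aiohttp.ClientError, ValueError) as exc:
        # ClientError carries the standardized message built by
        # _get_async_error_message; ValueError comes from the wrapper methods.
        print("Firecrawl request failed:", exc)
        return None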