firecrawl-py 2.16.3__py3-none-any.whl → 3.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +27 -19
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- firecrawl/__tests__/e2e/v2/test_search.py +265 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +241 -0
- build/lib/firecrawl/firecrawl.py → firecrawl/firecrawl.backup.py +108 -92
- firecrawl/types.py +157 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/{firecrawl.py → v1/client.py} +405 -371
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +802 -0
- firecrawl/v2/client_async.py +250 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/batch.py +85 -0
- firecrawl/v2/methods/aio/crawl.py +174 -0
- firecrawl/v2/methods/aio/extract.py +126 -0
- firecrawl/v2/methods/aio/map.py +59 -0
- firecrawl/v2/methods/aio/scrape.py +36 -0
- firecrawl/v2/methods/aio/search.py +58 -0
- firecrawl/v2/methods/aio/usage.py +42 -0
- firecrawl/v2/methods/batch.py +420 -0
- firecrawl/v2/methods/crawl.py +468 -0
- firecrawl/v2/methods/extract.py +131 -0
- firecrawl/v2/methods/map.py +77 -0
- firecrawl/v2/methods/scrape.py +68 -0
- firecrawl/v2/methods/search.py +173 -0
- firecrawl/v2/methods/usage.py +41 -0
- firecrawl/v2/types.py +546 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +153 -0
- firecrawl/v2/utils/http_client_async.py +64 -0
- firecrawl/v2/utils/validation.py +324 -0
- firecrawl/v2/watcher.py +312 -0
- firecrawl/v2/watcher_async.py +245 -0
- {firecrawl_py-2.16.3.dist-info → firecrawl_py-3.0.2.dist-info}/LICENSE +0 -0
- {firecrawl_py-2.16.3.dist-info → firecrawl_py-3.0.2.dist-info}/METADATA +49 -32
- firecrawl_py-3.0.2.dist-info/RECORD +78 -0
- {firecrawl_py-2.16.3.dist-info → firecrawl_py-3.0.2.dist-info}/top_level.txt +0 -2
- tests/test_timeout_conversion.py +117 -0
- build/lib/firecrawl/__init__.py +0 -79
- build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- build/lib/tests/test_change_tracking.py +0 -98
- firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- firecrawl_py-2.16.3.dist-info/RECORD +0 -19
- {firecrawl_py-2.16.3.dist-info → firecrawl_py-3.0.2.dist-info}/WHEEL +0 -0
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
Firecrawl v1 API Client - Legacy Implementation
|
|
3
3
|
|
|
4
|
-
This module provides
|
|
5
|
-
It
|
|
6
|
-
|
|
7
|
-
|
|
4
|
+
This module provides the legacy v1 implementation of the Firecrawl SDK.
|
|
5
|
+
It contains the complete `V1FirecrawlApp` class with all v1 API methods and types
|
|
6
|
+
for backward compatibility. This is used by the unified client to provide
|
|
7
|
+
version-specific access patterns like app.v1.scrape_url().
|
|
8
8
|
|
|
9
9
|
Classes:
|
|
10
|
-
-
|
|
10
|
+
- V1FirecrawlApp: Legacy v1 client for interacting with the Firecrawl API.
|
|
11
|
+
- AsyncV1FirecrawlApp: Async version of the v1 client.
|
|
12
|
+
- CrawlWatcher: WebSocket-based crawl monitoring for v1.
|
|
11
13
|
"""
|
|
12
14
|
import logging
|
|
13
15
|
import os
|
|
@@ -16,20 +18,13 @@ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar,
|
|
|
16
18
|
import json
|
|
17
19
|
from datetime import datetime
|
|
18
20
|
import re
|
|
19
|
-
import warnings
|
|
20
21
|
import requests
|
|
21
22
|
import pydantic
|
|
22
23
|
import websockets
|
|
23
24
|
import aiohttp
|
|
24
25
|
import asyncio
|
|
25
|
-
from pydantic import Field
|
|
26
26
|
|
|
27
|
-
|
|
28
|
-
warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
|
|
29
|
-
warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
|
|
30
|
-
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
|
|
31
|
-
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
|
|
32
|
-
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
|
|
27
|
+
logger : logging.Logger = logging.getLogger("firecrawl")
|
|
33
28
|
|
|
34
29
|
def get_version():
|
|
35
30
|
try:
|
|
@@ -45,11 +40,9 @@ def get_version():
|
|
|
45
40
|
|
|
46
41
|
version = get_version()
|
|
47
42
|
|
|
48
|
-
logger : logging.Logger = logging.getLogger("firecrawl")
|
|
49
|
-
|
|
50
43
|
T = TypeVar('T')
|
|
51
44
|
|
|
52
|
-
# class
|
|
45
|
+
# class V1FirecrawlDocumentMetadata(pydantic.BaseModel):
|
|
53
46
|
# """Metadata for a Firecrawl document."""
|
|
54
47
|
# title: Optional[str] = None
|
|
55
48
|
# description: Optional[str] = None
|
|
@@ -84,21 +77,21 @@ T = TypeVar('T')
|
|
|
84
77
|
# statusCode: Optional[int] = None
|
|
85
78
|
# error: Optional[str] = None
|
|
86
79
|
|
|
87
|
-
class
|
|
80
|
+
class V1AgentOptions(pydantic.BaseModel):
|
|
88
81
|
"""Configuration for the agent."""
|
|
89
82
|
model: Literal["FIRE-1"] = "FIRE-1"
|
|
90
83
|
prompt: Optional[str] = None
|
|
91
84
|
|
|
92
|
-
class
|
|
85
|
+
class V1AgentOptionsExtract(pydantic.BaseModel):
|
|
93
86
|
"""Configuration for the agent in extract operations."""
|
|
94
87
|
model: Literal["FIRE-1"] = "FIRE-1"
|
|
95
88
|
|
|
96
|
-
class
|
|
89
|
+
class V1ActionsResult(pydantic.BaseModel):
|
|
97
90
|
"""Result of actions performed during scraping."""
|
|
98
91
|
screenshots: List[str]
|
|
99
92
|
pdfs: List[str]
|
|
100
93
|
|
|
101
|
-
class
|
|
94
|
+
class V1ChangeTrackingData(pydantic.BaseModel):
|
|
102
95
|
"""
|
|
103
96
|
Data for the change tracking format.
|
|
104
97
|
"""
|
|
@@ -106,9 +99,9 @@ class ChangeTrackingData(pydantic.BaseModel):
|
|
|
106
99
|
changeStatus: str # "new" | "same" | "changed" | "removed"
|
|
107
100
|
visibility: str # "visible" | "hidden"
|
|
108
101
|
diff: Optional[Dict[str, Any]] = None
|
|
109
|
-
|
|
102
|
+
json_field: Optional[Any] = pydantic.Field(None, alias='json')
|
|
110
103
|
|
|
111
|
-
class
|
|
104
|
+
class V1FirecrawlDocument(pydantic.BaseModel, Generic[T]):
|
|
112
105
|
"""Document retrieved or processed by Firecrawl."""
|
|
113
106
|
url: Optional[str] = None
|
|
114
107
|
markdown: Optional[str] = None
|
|
@@ -116,34 +109,34 @@ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
|
|
|
116
109
|
rawHtml: Optional[str] = None
|
|
117
110
|
links: Optional[List[str]] = None
|
|
118
111
|
extract: Optional[T] = None
|
|
119
|
-
|
|
112
|
+
json_field: Optional[T] = pydantic.Field(None, alias='json')
|
|
120
113
|
screenshot: Optional[str] = None
|
|
121
114
|
metadata: Optional[Any] = None
|
|
122
|
-
actions: Optional[
|
|
115
|
+
actions: Optional[V1ActionsResult] = None
|
|
123
116
|
title: Optional[str] = None # v1 search only
|
|
124
117
|
description: Optional[str] = None # v1 search only
|
|
125
|
-
changeTracking: Optional[
|
|
118
|
+
changeTracking: Optional[V1ChangeTrackingData] = None
|
|
126
119
|
|
|
127
|
-
class
|
|
120
|
+
class V1LocationConfig(pydantic.BaseModel):
|
|
128
121
|
"""Location configuration for scraping."""
|
|
129
122
|
country: Optional[str] = None
|
|
130
123
|
languages: Optional[List[str]] = None
|
|
131
124
|
|
|
132
|
-
class
|
|
125
|
+
class V1WebhookConfig(pydantic.BaseModel):
|
|
133
126
|
"""Configuration for webhooks."""
|
|
134
127
|
url: str
|
|
135
128
|
headers: Optional[Dict[str, str]] = None
|
|
136
129
|
metadata: Optional[Dict[str, str]] = None
|
|
137
130
|
events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
|
|
138
131
|
|
|
139
|
-
class
|
|
132
|
+
class V1ChangeTrackingOptions(pydantic.BaseModel):
|
|
140
133
|
"""Configuration for change tracking."""
|
|
141
134
|
modes: Optional[List[Literal["git-diff", "json"]]] = None
|
|
142
|
-
|
|
135
|
+
schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
|
|
143
136
|
prompt: Optional[str] = None
|
|
144
137
|
tag: Optional[str] = None
|
|
145
138
|
|
|
146
|
-
class
|
|
139
|
+
class V1ScrapeOptions(pydantic.BaseModel):
|
|
147
140
|
"""Parameters for scraping operations."""
|
|
148
141
|
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
|
|
149
142
|
headers: Optional[Dict[str, str]] = None
|
|
@@ -151,93 +144,93 @@ class ScrapeOptions(pydantic.BaseModel):
|
|
|
151
144
|
excludeTags: Optional[List[str]] = None
|
|
152
145
|
onlyMainContent: Optional[bool] = None
|
|
153
146
|
waitFor: Optional[int] = None
|
|
154
|
-
timeout: Optional[int] =
|
|
155
|
-
location: Optional[
|
|
147
|
+
timeout: Optional[int] = 30000
|
|
148
|
+
location: Optional[V1LocationConfig] = None
|
|
156
149
|
mobile: Optional[bool] = None
|
|
157
150
|
skipTlsVerification: Optional[bool] = None
|
|
158
151
|
removeBase64Images: Optional[bool] = None
|
|
159
152
|
blockAds: Optional[bool] = None
|
|
160
153
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None
|
|
161
|
-
changeTrackingOptions: Optional[
|
|
154
|
+
changeTrackingOptions: Optional[V1ChangeTrackingOptions] = None
|
|
162
155
|
maxAge: Optional[int] = None
|
|
163
156
|
storeInCache: Optional[bool] = None
|
|
164
157
|
parsePDF: Optional[bool] = None
|
|
165
158
|
|
|
166
|
-
class
|
|
159
|
+
class V1WaitAction(pydantic.BaseModel):
|
|
167
160
|
"""Wait action to perform during scraping."""
|
|
168
161
|
type: Literal["wait"]
|
|
169
162
|
milliseconds: Optional[int] = None
|
|
170
163
|
selector: Optional[str] = None
|
|
171
164
|
|
|
172
|
-
class
|
|
165
|
+
class V1ScreenshotAction(pydantic.BaseModel):
|
|
173
166
|
"""Screenshot action to perform during scraping."""
|
|
174
167
|
type: Literal["screenshot"]
|
|
175
168
|
fullPage: Optional[bool] = None
|
|
176
169
|
quality: Optional[int] = None
|
|
177
170
|
|
|
178
|
-
class
|
|
171
|
+
class V1ClickAction(pydantic.BaseModel):
|
|
179
172
|
"""Click action to perform during scraping."""
|
|
180
173
|
type: Literal["click"]
|
|
181
174
|
selector: str
|
|
182
175
|
|
|
183
|
-
class
|
|
176
|
+
class V1WriteAction(pydantic.BaseModel):
|
|
184
177
|
"""Write action to perform during scraping."""
|
|
185
178
|
type: Literal["write"]
|
|
186
179
|
text: str
|
|
187
180
|
|
|
188
|
-
class
|
|
181
|
+
class V1PressAction(pydantic.BaseModel):
|
|
189
182
|
"""Press action to perform during scraping."""
|
|
190
183
|
type: Literal["press"]
|
|
191
184
|
key: str
|
|
192
185
|
|
|
193
|
-
class
|
|
186
|
+
class V1ScrollAction(pydantic.BaseModel):
|
|
194
187
|
"""Scroll action to perform during scraping."""
|
|
195
188
|
type: Literal["scroll"]
|
|
196
189
|
direction: Literal["up", "down"]
|
|
197
190
|
selector: Optional[str] = None
|
|
198
191
|
|
|
199
|
-
class
|
|
192
|
+
class V1ScrapeAction(pydantic.BaseModel):
|
|
200
193
|
"""Scrape action to perform during scraping."""
|
|
201
194
|
type: Literal["scrape"]
|
|
202
195
|
|
|
203
|
-
class
|
|
196
|
+
class V1ExecuteJavascriptAction(pydantic.BaseModel):
|
|
204
197
|
"""Execute javascript action to perform during scraping."""
|
|
205
198
|
type: Literal["executeJavascript"]
|
|
206
199
|
script: str
|
|
207
200
|
|
|
208
|
-
class
|
|
201
|
+
class V1PDFAction(pydantic.BaseModel):
|
|
209
202
|
"""PDF action to perform during scraping."""
|
|
210
203
|
type: Literal["pdf"]
|
|
211
204
|
format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
|
|
212
205
|
landscape: Optional[bool] = None
|
|
213
206
|
scale: Optional[float] = None
|
|
214
207
|
|
|
215
|
-
class
|
|
208
|
+
class V1ExtractAgent(pydantic.BaseModel):
|
|
216
209
|
"""Configuration for the agent in extract operations."""
|
|
217
210
|
model: Literal["FIRE-1"] = "FIRE-1"
|
|
218
211
|
|
|
219
|
-
class
|
|
212
|
+
class V1JsonConfig(pydantic.BaseModel):
|
|
220
213
|
"""Configuration for extraction."""
|
|
221
214
|
prompt: Optional[str] = None
|
|
222
|
-
|
|
215
|
+
schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
|
|
223
216
|
systemPrompt: Optional[str] = None
|
|
224
|
-
agent: Optional[
|
|
217
|
+
agent: Optional[V1ExtractAgent] = None
|
|
225
218
|
|
|
226
|
-
class
|
|
219
|
+
class V1ScrapeParams(V1ScrapeOptions):
|
|
227
220
|
"""Parameters for scraping operations."""
|
|
228
|
-
extract: Optional[
|
|
229
|
-
jsonOptions: Optional[
|
|
230
|
-
actions: Optional[List[Union[
|
|
231
|
-
agent: Optional[
|
|
232
|
-
webhook: Optional[
|
|
221
|
+
extract: Optional[V1JsonConfig] = None
|
|
222
|
+
jsonOptions: Optional[V1JsonConfig] = None
|
|
223
|
+
actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None
|
|
224
|
+
agent: Optional[V1AgentOptions] = None
|
|
225
|
+
webhook: Optional[V1WebhookConfig] = None
|
|
233
226
|
|
|
234
|
-
class
|
|
227
|
+
class V1ScrapeResponse(V1FirecrawlDocument[T], Generic[T]):
|
|
235
228
|
"""Response from scraping operations."""
|
|
236
229
|
success: bool = True
|
|
237
230
|
warning: Optional[str] = None
|
|
238
231
|
error: Optional[str] = None
|
|
239
232
|
|
|
240
|
-
class
|
|
233
|
+
class V1BatchScrapeResponse(pydantic.BaseModel):
|
|
241
234
|
"""Response from batch scrape operations."""
|
|
242
235
|
id: Optional[str] = None
|
|
243
236
|
url: Optional[str] = None
|
|
@@ -245,7 +238,7 @@ class BatchScrapeResponse(pydantic.BaseModel):
|
|
|
245
238
|
error: Optional[str] = None
|
|
246
239
|
invalidURLs: Optional[List[str]] = None
|
|
247
240
|
|
|
248
|
-
class
|
|
241
|
+
class V1BatchScrapeStatusResponse(pydantic.BaseModel):
|
|
249
242
|
"""Response from batch scrape status checks."""
|
|
250
243
|
success: bool = True
|
|
251
244
|
status: Literal["scraping", "completed", "failed", "cancelled"]
|
|
@@ -254,9 +247,9 @@ class BatchScrapeStatusResponse(pydantic.BaseModel):
|
|
|
254
247
|
creditsUsed: int
|
|
255
248
|
expiresAt: datetime
|
|
256
249
|
next: Optional[str] = None
|
|
257
|
-
data: List[
|
|
250
|
+
data: List[V1FirecrawlDocument]
|
|
258
251
|
|
|
259
|
-
class
|
|
252
|
+
class V1CrawlParams(pydantic.BaseModel):
|
|
260
253
|
"""Parameters for crawling operations."""
|
|
261
254
|
includePaths: Optional[List[str]] = None
|
|
262
255
|
excludePaths: Optional[List[str]] = None
|
|
@@ -264,10 +257,11 @@ class CrawlParams(pydantic.BaseModel):
|
|
|
264
257
|
maxDiscoveryDepth: Optional[int] = None
|
|
265
258
|
limit: Optional[int] = None
|
|
266
259
|
allowBackwardLinks: Optional[bool] = None
|
|
260
|
+
crawlEntireDomain: Optional[bool] = None
|
|
267
261
|
allowExternalLinks: Optional[bool] = None
|
|
268
262
|
ignoreSitemap: Optional[bool] = None
|
|
269
|
-
scrapeOptions: Optional[
|
|
270
|
-
webhook: Optional[Union[str,
|
|
263
|
+
scrapeOptions: Optional[V1ScrapeOptions] = None
|
|
264
|
+
webhook: Optional[Union[str, V1WebhookConfig]] = None
|
|
271
265
|
deduplicateSimilarURLs: Optional[bool] = None
|
|
272
266
|
ignoreQueryParameters: Optional[bool] = None
|
|
273
267
|
regexOnFullURL: Optional[bool] = None
|
|
@@ -275,14 +269,14 @@ class CrawlParams(pydantic.BaseModel):
|
|
|
275
269
|
maxConcurrency: Optional[int] = None
|
|
276
270
|
allowSubdomains: Optional[bool] = None
|
|
277
271
|
|
|
278
|
-
class
|
|
272
|
+
class V1CrawlResponse(pydantic.BaseModel):
|
|
279
273
|
"""Response from crawling operations."""
|
|
280
274
|
id: Optional[str] = None
|
|
281
275
|
url: Optional[str] = None
|
|
282
276
|
success: bool = True
|
|
283
277
|
error: Optional[str] = None
|
|
284
278
|
|
|
285
|
-
class
|
|
279
|
+
class V1CrawlStatusResponse(pydantic.BaseModel):
|
|
286
280
|
"""Response from crawl status checks."""
|
|
287
281
|
success: bool = True
|
|
288
282
|
status: Literal["scraping", "completed", "failed", "cancelled"]
|
|
@@ -291,42 +285,50 @@ class CrawlStatusResponse(pydantic.BaseModel):
|
|
|
291
285
|
creditsUsed: int
|
|
292
286
|
expiresAt: datetime
|
|
293
287
|
next: Optional[str] = None
|
|
294
|
-
data: List[
|
|
288
|
+
data: List[V1FirecrawlDocument]
|
|
289
|
+
|
|
290
|
+
class V1CrawlError(pydantic.BaseModel):
|
|
291
|
+
"""A crawl error."""
|
|
292
|
+
id: str
|
|
293
|
+
timestamp: Optional[datetime] = None
|
|
294
|
+
url: str
|
|
295
|
+
code: Optional[str] = None
|
|
296
|
+
error: str
|
|
295
297
|
|
|
296
|
-
class
|
|
298
|
+
class V1CrawlErrorsResponse(pydantic.BaseModel):
|
|
297
299
|
"""Response from crawl/batch scrape error monitoring."""
|
|
298
|
-
errors: List[
|
|
300
|
+
errors: List[V1CrawlError]
|
|
299
301
|
robotsBlocked: List[str]
|
|
300
302
|
|
|
301
|
-
class
|
|
303
|
+
class V1MapParams(pydantic.BaseModel):
|
|
302
304
|
"""Parameters for mapping operations."""
|
|
303
305
|
search: Optional[str] = None
|
|
304
306
|
ignoreSitemap: Optional[bool] = None
|
|
305
307
|
includeSubdomains: Optional[bool] = None
|
|
306
308
|
sitemapOnly: Optional[bool] = None
|
|
307
309
|
limit: Optional[int] = None
|
|
308
|
-
timeout: Optional[int] =
|
|
310
|
+
timeout: Optional[int] = 30000
|
|
309
311
|
useIndex: Optional[bool] = None
|
|
310
312
|
|
|
311
|
-
class
|
|
313
|
+
class V1MapResponse(pydantic.BaseModel):
|
|
312
314
|
"""Response from mapping operations."""
|
|
313
315
|
success: bool = True
|
|
314
316
|
links: Optional[List[str]] = None
|
|
315
317
|
error: Optional[str] = None
|
|
316
318
|
|
|
317
|
-
class
|
|
319
|
+
class V1ExtractParams(pydantic.BaseModel):
|
|
318
320
|
"""Parameters for extracting information from URLs."""
|
|
319
321
|
prompt: Optional[str] = None
|
|
320
|
-
|
|
322
|
+
schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
|
|
321
323
|
systemPrompt: Optional[str] = None
|
|
322
324
|
allowExternalLinks: Optional[bool] = None
|
|
323
325
|
enableWebSearch: Optional[bool] = None
|
|
324
326
|
includeSubdomains: Optional[bool] = None
|
|
325
327
|
origin: Optional[str] = None
|
|
326
328
|
showSources: Optional[bool] = None
|
|
327
|
-
scrapeOptions: Optional[
|
|
329
|
+
scrapeOptions: Optional[V1ScrapeOptions] = None
|
|
328
330
|
|
|
329
|
-
class
|
|
331
|
+
class V1ExtractResponse(pydantic.BaseModel, Generic[T]):
|
|
330
332
|
"""Response from extract operations."""
|
|
331
333
|
id: Optional[str] = None
|
|
332
334
|
status: Optional[Literal["processing", "completed", "failed"]] = None
|
|
@@ -337,7 +339,7 @@ class ExtractResponse(pydantic.BaseModel, Generic[T]):
|
|
|
337
339
|
warning: Optional[str] = None
|
|
338
340
|
sources: Optional[Dict[Any, Any]] = None
|
|
339
341
|
|
|
340
|
-
class
|
|
342
|
+
class V1SearchParams(pydantic.BaseModel):
|
|
341
343
|
query: str
|
|
342
344
|
limit: Optional[int] = 5
|
|
343
345
|
tbs: Optional[str] = None
|
|
@@ -347,16 +349,16 @@ class SearchParams(pydantic.BaseModel):
|
|
|
347
349
|
location: Optional[str] = None
|
|
348
350
|
origin: Optional[str] = "api"
|
|
349
351
|
timeout: Optional[int] = 60000
|
|
350
|
-
scrapeOptions: Optional[
|
|
352
|
+
scrapeOptions: Optional[V1ScrapeOptions] = None
|
|
351
353
|
|
|
352
|
-
class
|
|
354
|
+
class V1SearchResponse(pydantic.BaseModel):
|
|
353
355
|
"""Response from search operations."""
|
|
354
356
|
success: bool = True
|
|
355
|
-
data: List[
|
|
357
|
+
data: List[V1FirecrawlDocument]
|
|
356
358
|
warning: Optional[str] = None
|
|
357
359
|
error: Optional[str] = None
|
|
358
360
|
|
|
359
|
-
class
|
|
361
|
+
class V1GenerateLLMsTextParams(pydantic.BaseModel):
|
|
360
362
|
"""
|
|
361
363
|
Parameters for the LLMs.txt generation operation.
|
|
362
364
|
"""
|
|
@@ -365,7 +367,7 @@ class GenerateLLMsTextParams(pydantic.BaseModel):
|
|
|
365
367
|
cache: Optional[bool] = True
|
|
366
368
|
__experimental_stream: Optional[bool] = None
|
|
367
369
|
|
|
368
|
-
class
|
|
370
|
+
class V1DeepResearchParams(pydantic.BaseModel):
|
|
369
371
|
"""
|
|
370
372
|
Parameters for the deep research operation.
|
|
371
373
|
"""
|
|
@@ -376,7 +378,7 @@ class DeepResearchParams(pydantic.BaseModel):
|
|
|
376
378
|
systemPrompt: Optional[str] = None
|
|
377
379
|
__experimental_streamSteps: Optional[bool] = None
|
|
378
380
|
|
|
379
|
-
class
|
|
381
|
+
class V1DeepResearchResponse(pydantic.BaseModel):
|
|
380
382
|
"""
|
|
381
383
|
Response from the deep research operation.
|
|
382
384
|
"""
|
|
@@ -384,7 +386,7 @@ class DeepResearchResponse(pydantic.BaseModel):
|
|
|
384
386
|
id: str
|
|
385
387
|
error: Optional[str] = None
|
|
386
388
|
|
|
387
|
-
class
|
|
389
|
+
class V1DeepResearchStatusResponse(pydantic.BaseModel):
|
|
388
390
|
"""
|
|
389
391
|
Status response from the deep research operation.
|
|
390
392
|
"""
|
|
@@ -399,25 +401,25 @@ class DeepResearchStatusResponse(pydantic.BaseModel):
|
|
|
399
401
|
sources: List[Dict[str, Any]]
|
|
400
402
|
summaries: List[str]
|
|
401
403
|
|
|
402
|
-
class
|
|
404
|
+
class V1GenerateLLMsTextResponse(pydantic.BaseModel):
|
|
403
405
|
"""Response from LLMs.txt generation operations."""
|
|
404
406
|
success: bool = True
|
|
405
407
|
id: str
|
|
406
408
|
error: Optional[str] = None
|
|
407
409
|
|
|
408
|
-
class
|
|
410
|
+
class V1GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
|
|
409
411
|
llmstxt: str
|
|
410
412
|
llmsfulltxt: Optional[str] = None
|
|
411
413
|
|
|
412
|
-
class
|
|
414
|
+
class V1GenerateLLMsTextStatusResponse(pydantic.BaseModel):
|
|
413
415
|
"""Status response from LLMs.txt generation operations."""
|
|
414
416
|
success: bool = True
|
|
415
|
-
data: Optional[
|
|
417
|
+
data: Optional[V1GenerateLLMsTextStatusResponseData] = None
|
|
416
418
|
status: Literal["processing", "completed", "failed"]
|
|
417
419
|
error: Optional[str] = None
|
|
418
420
|
expiresAt: str
|
|
419
421
|
|
|
420
|
-
class
|
|
422
|
+
class V1SearchResponse(pydantic.BaseModel):
|
|
421
423
|
"""
|
|
422
424
|
Response from the search operation.
|
|
423
425
|
"""
|
|
@@ -426,12 +428,12 @@ class SearchResponse(pydantic.BaseModel):
|
|
|
426
428
|
warning: Optional[str] = None
|
|
427
429
|
error: Optional[str] = None
|
|
428
430
|
|
|
429
|
-
class
|
|
431
|
+
class V1ExtractParams(pydantic.BaseModel):
|
|
430
432
|
"""
|
|
431
433
|
Parameters for the extract operation.
|
|
432
434
|
"""
|
|
433
435
|
prompt: Optional[str] = None
|
|
434
|
-
|
|
436
|
+
schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
|
|
435
437
|
system_prompt: Optional[str] = None
|
|
436
438
|
allow_external_links: Optional[bool] = False
|
|
437
439
|
enable_web_search: Optional[bool] = False
|
|
@@ -440,10 +442,25 @@ class ExtractParams(pydantic.BaseModel):
|
|
|
440
442
|
show_sources: Optional[bool] = False
|
|
441
443
|
agent: Optional[Dict[str, Any]] = None
|
|
442
444
|
|
|
443
|
-
class
|
|
445
|
+
class V1FirecrawlApp:
|
|
446
|
+
"""
|
|
447
|
+
Legacy v1 Firecrawl client for backward compatibility.
|
|
448
|
+
|
|
449
|
+
This class provides the complete v1 API implementation including:
|
|
450
|
+
- URL scraping with various formats and options
|
|
451
|
+
- Website crawling with monitoring capabilities
|
|
452
|
+
- Batch scraping operations
|
|
453
|
+
- Search functionality
|
|
454
|
+
- Data extraction with LLM integration
|
|
455
|
+
- Deep research capabilities
|
|
456
|
+
- LLMs.txt generation
|
|
457
|
+
|
|
458
|
+
This is used by the unified client to provide version-specific access
|
|
459
|
+
through app.v1.method_name() patterns.
|
|
460
|
+
"""
|
|
444
461
|
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
|
445
462
|
"""
|
|
446
|
-
Initialize the
|
|
463
|
+
Initialize the V1FirecrawlApp instance with API key, API URL.
|
|
447
464
|
|
|
448
465
|
Args:
|
|
449
466
|
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
|
|
@@ -457,7 +474,7 @@ class FirecrawlApp:
|
|
|
457
474
|
logger.warning("No API key provided for cloud service")
|
|
458
475
|
raise ValueError('No API key provided')
|
|
459
476
|
|
|
460
|
-
logger.debug(f"Initialized
|
|
477
|
+
logger.debug(f"Initialized V1FirecrawlApp with API URL: {self.api_url}")
|
|
461
478
|
|
|
462
479
|
def scrape_url(
|
|
463
480
|
self,
|
|
@@ -469,22 +486,22 @@ class FirecrawlApp:
|
|
|
469
486
|
exclude_tags: Optional[List[str]] = None,
|
|
470
487
|
only_main_content: Optional[bool] = None,
|
|
471
488
|
wait_for: Optional[int] = None,
|
|
472
|
-
timeout: Optional[int] =
|
|
473
|
-
location: Optional[
|
|
489
|
+
timeout: Optional[int] = 30000,
|
|
490
|
+
location: Optional[V1LocationConfig] = None,
|
|
474
491
|
mobile: Optional[bool] = None,
|
|
475
492
|
skip_tls_verification: Optional[bool] = None,
|
|
476
493
|
remove_base64_images: Optional[bool] = None,
|
|
477
494
|
block_ads: Optional[bool] = None,
|
|
478
495
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
479
496
|
parse_pdf: Optional[bool] = None,
|
|
480
|
-
extract: Optional[
|
|
481
|
-
json_options: Optional[
|
|
482
|
-
actions: Optional[List[Union[
|
|
483
|
-
change_tracking_options: Optional[
|
|
497
|
+
extract: Optional[V1JsonConfig] = None,
|
|
498
|
+
json_options: Optional[V1JsonConfig] = None,
|
|
499
|
+
actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
|
|
500
|
+
change_tracking_options: Optional[V1ChangeTrackingOptions] = None,
|
|
484
501
|
max_age: Optional[int] = None,
|
|
485
502
|
store_in_cache: Optional[bool] = None,
|
|
486
503
|
zero_data_retention: Optional[bool] = None,
|
|
487
|
-
**kwargs) ->
|
|
504
|
+
**kwargs) -> V1ScrapeResponse[Any]:
|
|
488
505
|
"""
|
|
489
506
|
Scrape and extract content from a URL.
|
|
490
507
|
|
|
@@ -547,7 +564,7 @@ class FirecrawlApp:
|
|
|
547
564
|
if timeout:
|
|
548
565
|
scrape_params['timeout'] = timeout
|
|
549
566
|
if location:
|
|
550
|
-
scrape_params['location'] = location.dict(exclude_none=True)
|
|
567
|
+
scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
|
|
551
568
|
if mobile is not None:
|
|
552
569
|
scrape_params['mobile'] = mobile
|
|
553
570
|
if skip_tls_verification is not None:
|
|
@@ -564,16 +581,16 @@ class FirecrawlApp:
|
|
|
564
581
|
extract = self._ensure_schema_dict(extract)
|
|
565
582
|
if isinstance(extract, dict) and "schema" in extract:
|
|
566
583
|
extract["schema"] = self._ensure_schema_dict(extract["schema"])
|
|
567
|
-
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
|
|
584
|
+
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
|
|
568
585
|
if json_options is not None:
|
|
569
586
|
json_options = self._ensure_schema_dict(json_options)
|
|
570
587
|
if isinstance(json_options, dict) and "schema" in json_options:
|
|
571
588
|
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
|
|
572
|
-
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
|
|
589
|
+
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
|
|
573
590
|
if actions:
|
|
574
|
-
scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
|
|
591
|
+
scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
|
|
575
592
|
if change_tracking_options:
|
|
576
|
-
scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
|
|
593
|
+
scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(by_alias=True, exclude_none=True)
|
|
577
594
|
if max_age is not None:
|
|
578
595
|
scrape_params['maxAge'] = max_age
|
|
579
596
|
if store_in_cache is not None:
|
|
@@ -593,14 +610,14 @@ class FirecrawlApp:
|
|
|
593
610
|
f'{self.api_url}/v1/scrape',
|
|
594
611
|
headers=_headers,
|
|
595
612
|
json=scrape_params,
|
|
596
|
-
timeout=(timeout +
|
|
613
|
+
timeout=(timeout / 1000.0 + 5 if timeout is not None else None)
|
|
597
614
|
)
|
|
598
615
|
|
|
599
616
|
if response.status_code == 200:
|
|
600
617
|
try:
|
|
601
618
|
response_json = response.json()
|
|
602
619
|
if response_json.get('success') and 'data' in response_json:
|
|
603
|
-
return
|
|
620
|
+
return V1ScrapeResponse(**response_json['data'])
|
|
604
621
|
elif "error" in response_json:
|
|
605
622
|
raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
|
|
606
623
|
else:
|
|
@@ -620,9 +637,9 @@ class FirecrawlApp:
|
|
|
620
637
|
lang: Optional[str] = None,
|
|
621
638
|
country: Optional[str] = None,
|
|
622
639
|
location: Optional[str] = None,
|
|
623
|
-
timeout: Optional[int] =
|
|
624
|
-
scrape_options: Optional[
|
|
625
|
-
**kwargs) ->
|
|
640
|
+
timeout: Optional[int] = 30000,
|
|
641
|
+
scrape_options: Optional[V1ScrapeOptions] = None,
|
|
642
|
+
**kwargs) -> V1SearchResponse:
|
|
626
643
|
"""
|
|
627
644
|
Search for content using Firecrawl.
|
|
628
645
|
|
|
@@ -670,15 +687,15 @@ class FirecrawlApp:
|
|
|
670
687
|
if timeout is not None:
|
|
671
688
|
search_params['timeout'] = timeout
|
|
672
689
|
if scrape_options is not None:
|
|
673
|
-
search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
|
|
690
|
+
search_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
|
|
674
691
|
|
|
675
692
|
# Add any additional kwargs
|
|
676
693
|
search_params.update(kwargs)
|
|
677
694
|
_integration = search_params.get('integration')
|
|
678
695
|
|
|
679
696
|
# Create final params object
|
|
680
|
-
final_params =
|
|
681
|
-
params_dict = final_params.dict(exclude_none=True)
|
|
697
|
+
final_params = V1SearchParams(query=query, **search_params)
|
|
698
|
+
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
682
699
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
683
700
|
|
|
684
701
|
if _integration:
|
|
@@ -695,7 +712,7 @@ class FirecrawlApp:
|
|
|
695
712
|
try:
|
|
696
713
|
response_json = response.json()
|
|
697
714
|
if response_json.get('success') and 'data' in response_json:
|
|
698
|
-
return
|
|
715
|
+
return V1SearchResponse(**response_json)
|
|
699
716
|
elif "error" in response_json:
|
|
700
717
|
raise Exception(f'Search failed. Error: {response_json["error"]}')
|
|
701
718
|
else:
|
|
@@ -718,8 +735,8 @@ class FirecrawlApp:
|
|
|
718
735
|
crawl_entire_domain: Optional[bool] = None,
|
|
719
736
|
allow_external_links: Optional[bool] = None,
|
|
720
737
|
ignore_sitemap: Optional[bool] = None,
|
|
721
|
-
scrape_options: Optional[
|
|
722
|
-
webhook: Optional[Union[str,
|
|
738
|
+
scrape_options: Optional[V1ScrapeOptions] = None,
|
|
739
|
+
webhook: Optional[Union[str, V1WebhookConfig]] = None,
|
|
723
740
|
deduplicate_similar_urls: Optional[bool] = None,
|
|
724
741
|
ignore_query_parameters: Optional[bool] = None,
|
|
725
742
|
regex_on_full_url: Optional[bool] = None,
|
|
@@ -730,7 +747,7 @@ class FirecrawlApp:
|
|
|
730
747
|
poll_interval: Optional[int] = 2,
|
|
731
748
|
idempotency_key: Optional[str] = None,
|
|
732
749
|
**kwargs
|
|
733
|
-
) ->
|
|
750
|
+
) -> V1CrawlStatusResponse:
|
|
734
751
|
"""
|
|
735
752
|
Crawl a website starting from a URL.
|
|
736
753
|
|
|
@@ -792,7 +809,7 @@ class FirecrawlApp:
|
|
|
792
809
|
if ignore_sitemap is not None:
|
|
793
810
|
crawl_params['ignoreSitemap'] = ignore_sitemap
|
|
794
811
|
if scrape_options is not None:
|
|
795
|
-
crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
|
|
812
|
+
crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
|
|
796
813
|
if webhook is not None:
|
|
797
814
|
crawl_params['webhook'] = webhook
|
|
798
815
|
if deduplicate_similar_urls is not None:
|
|
@@ -814,8 +831,8 @@ class FirecrawlApp:
|
|
|
814
831
|
_integration = crawl_params.get('integration')
|
|
815
832
|
|
|
816
833
|
# Create final params object
|
|
817
|
-
final_params =
|
|
818
|
-
params_dict = final_params.dict(exclude_none=True)
|
|
834
|
+
final_params = V1CrawlParams(**crawl_params)
|
|
835
|
+
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
819
836
|
params_dict['url'] = url
|
|
820
837
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
821
838
|
|
|
@@ -848,8 +865,8 @@ class FirecrawlApp:
|
|
|
848
865
|
crawl_entire_domain: Optional[bool] = None,
|
|
849
866
|
allow_external_links: Optional[bool] = None,
|
|
850
867
|
ignore_sitemap: Optional[bool] = None,
|
|
851
|
-
scrape_options: Optional[
|
|
852
|
-
webhook: Optional[Union[str,
|
|
868
|
+
scrape_options: Optional[V1ScrapeOptions] = None,
|
|
869
|
+
webhook: Optional[Union[str, V1WebhookConfig]] = None,
|
|
853
870
|
deduplicate_similar_urls: Optional[bool] = None,
|
|
854
871
|
ignore_query_parameters: Optional[bool] = None,
|
|
855
872
|
regex_on_full_url: Optional[bool] = None,
|
|
@@ -859,7 +876,7 @@ class FirecrawlApp:
|
|
|
859
876
|
zero_data_retention: Optional[bool] = None,
|
|
860
877
|
idempotency_key: Optional[str] = None,
|
|
861
878
|
**kwargs
|
|
862
|
-
) ->
|
|
879
|
+
) -> V1CrawlResponse:
|
|
863
880
|
"""
|
|
864
881
|
Start an asynchronous crawl job.
|
|
865
882
|
|
|
@@ -874,8 +891,8 @@ class FirecrawlApp:
|
|
|
874
891
|
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
875
892
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
876
893
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
877
|
-
scrape_options (Optional[
|
|
878
|
-
webhook (Optional[Union[str,
|
|
894
|
+
scrape_options (Optional[V1ScrapeOptions]): Page scraping configuration
|
|
895
|
+
webhook (Optional[Union[str, V1WebhookConfig]]): Notification webhook settings
|
|
879
896
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
880
897
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
881
898
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
@@ -887,7 +904,7 @@ class FirecrawlApp:
|
|
|
887
904
|
**kwargs: Additional parameters to pass to the API
|
|
888
905
|
|
|
889
906
|
Returns:
|
|
890
|
-
|
|
907
|
+
V1CrawlResponse with:
|
|
891
908
|
* success - Whether crawl started successfully
|
|
892
909
|
* id - Unique identifier for the crawl job
|
|
893
910
|
* url - Status check URL for the crawl
|
|
@@ -921,7 +938,7 @@ class FirecrawlApp:
|
|
|
921
938
|
if ignore_sitemap is not None:
|
|
922
939
|
crawl_params['ignoreSitemap'] = ignore_sitemap
|
|
923
940
|
if scrape_options is not None:
|
|
924
|
-
crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
|
|
941
|
+
crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
|
|
925
942
|
if webhook is not None:
|
|
926
943
|
crawl_params['webhook'] = webhook
|
|
927
944
|
if deduplicate_similar_urls is not None:
|
|
@@ -942,8 +959,8 @@ class FirecrawlApp:
|
|
|
942
959
|
crawl_params.update(kwargs)
|
|
943
960
|
|
|
944
961
|
# Create final params object
|
|
945
|
-
final_params =
|
|
946
|
-
params_dict = final_params.dict(exclude_none=True)
|
|
962
|
+
final_params = V1CrawlParams(**crawl_params)
|
|
963
|
+
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
947
964
|
params_dict['url'] = url
|
|
948
965
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
949
966
|
|
|
@@ -953,13 +970,13 @@ class FirecrawlApp:
|
|
|
953
970
|
|
|
954
971
|
if response.status_code == 200:
|
|
955
972
|
try:
|
|
956
|
-
return
|
|
973
|
+
return V1CrawlResponse(**response.json())
|
|
957
974
|
except:
|
|
958
975
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
959
976
|
else:
|
|
960
977
|
self._handle_error(response, 'start crawl job')
|
|
961
978
|
|
|
962
|
-
def check_crawl_status(self, id: str) ->
|
|
979
|
+
def check_crawl_status(self, id: str) -> V1CrawlStatusResponse:
|
|
963
980
|
"""
|
|
964
981
|
Check the status and results of a crawl job.
|
|
965
982
|
|
|
@@ -967,7 +984,7 @@ class FirecrawlApp:
|
|
|
967
984
|
id: Unique identifier for the crawl job
|
|
968
985
|
|
|
969
986
|
Returns:
|
|
970
|
-
|
|
987
|
+
V1CrawlStatusResponse containing:
|
|
971
988
|
|
|
972
989
|
Status Information:
|
|
973
990
|
* status - Current state (scraping/completed/failed/cancelled)
|
|
@@ -1035,14 +1052,14 @@ class FirecrawlApp:
|
|
|
1035
1052
|
if 'next' in status_data:
|
|
1036
1053
|
response['next'] = status_data['next']
|
|
1037
1054
|
|
|
1038
|
-
return
|
|
1055
|
+
return V1CrawlStatusResponse(
|
|
1039
1056
|
success=False if 'error' in status_data else True,
|
|
1040
1057
|
**response
|
|
1041
1058
|
)
|
|
1042
1059
|
else:
|
|
1043
1060
|
self._handle_error(response, 'check crawl status')
|
|
1044
1061
|
|
|
1045
|
-
def check_crawl_errors(self, id: str) ->
|
|
1062
|
+
def check_crawl_errors(self, id: str) -> V1CrawlErrorsResponse:
|
|
1046
1063
|
"""
|
|
1047
1064
|
Returns information about crawl errors.
|
|
1048
1065
|
|
|
@@ -1050,7 +1067,7 @@ class FirecrawlApp:
|
|
|
1050
1067
|
id (str): The ID of the crawl job
|
|
1051
1068
|
|
|
1052
1069
|
Returns:
|
|
1053
|
-
|
|
1070
|
+
V1CrawlErrorsResponse containing:
|
|
1054
1071
|
* errors (List[Dict[str, str]]): List of errors with fields:
|
|
1055
1072
|
- id (str): Error ID
|
|
1056
1073
|
- timestamp (str): When the error occurred
|
|
@@ -1065,7 +1082,7 @@ class FirecrawlApp:
|
|
|
1065
1082
|
response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
|
|
1066
1083
|
if response.status_code == 200:
|
|
1067
1084
|
try:
|
|
1068
|
-
return
|
|
1085
|
+
return V1CrawlErrorsResponse(**response.json())
|
|
1069
1086
|
except:
|
|
1070
1087
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
1071
1088
|
else:
|
|
@@ -1109,8 +1126,8 @@ class FirecrawlApp:
|
|
|
1109
1126
|
crawl_entire_domain: Optional[bool] = None,
|
|
1110
1127
|
allow_external_links: Optional[bool] = None,
|
|
1111
1128
|
ignore_sitemap: Optional[bool] = None,
|
|
1112
|
-
scrape_options: Optional[
|
|
1113
|
-
webhook: Optional[Union[str,
|
|
1129
|
+
scrape_options: Optional[V1ScrapeOptions] = None,
|
|
1130
|
+
webhook: Optional[Union[str, V1WebhookConfig]] = None,
|
|
1114
1131
|
deduplicate_similar_urls: Optional[bool] = None,
|
|
1115
1132
|
ignore_query_parameters: Optional[bool] = None,
|
|
1116
1133
|
regex_on_full_url: Optional[bool] = None,
|
|
@@ -1120,7 +1137,7 @@ class FirecrawlApp:
|
|
|
1120
1137
|
zero_data_retention: Optional[bool] = None,
|
|
1121
1138
|
idempotency_key: Optional[str] = None,
|
|
1122
1139
|
**kwargs
|
|
1123
|
-
) -> '
|
|
1140
|
+
) -> 'V1CrawlWatcher':
|
|
1124
1141
|
"""
|
|
1125
1142
|
Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
|
|
1126
1143
|
|
|
@@ -1135,8 +1152,8 @@ class FirecrawlApp:
|
|
|
1135
1152
|
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
1136
1153
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
1137
1154
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
1138
|
-
scrape_options (Optional[
|
|
1139
|
-
webhook (Optional[Union[str,
|
|
1155
|
+
scrape_options (Optional[V1ScrapeOptions]): Page scraping configuration
|
|
1156
|
+
webhook (Optional[Union[str, V1WebhookConfig]]): Notification webhook settings
|
|
1140
1157
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
1141
1158
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
1142
1159
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
@@ -1148,7 +1165,7 @@ class FirecrawlApp:
|
|
|
1148
1165
|
**kwargs: Additional parameters to pass to the API
|
|
1149
1166
|
|
|
1150
1167
|
Returns:
|
|
1151
|
-
|
|
1168
|
+
V1CrawlWatcher: An instance to monitor the crawl job via WebSocket
|
|
1152
1169
|
|
|
1153
1170
|
Raises:
|
|
1154
1171
|
Exception: If crawl job fails to start
|
|
@@ -1161,6 +1178,7 @@ class FirecrawlApp:
|
|
|
1161
1178
|
max_discovery_depth=max_discovery_depth,
|
|
1162
1179
|
limit=limit,
|
|
1163
1180
|
allow_backward_links=allow_backward_links,
|
|
1181
|
+
crawl_entire_domain=crawl_entire_domain,
|
|
1164
1182
|
allow_external_links=allow_external_links,
|
|
1165
1183
|
ignore_sitemap=ignore_sitemap,
|
|
1166
1184
|
scrape_options=scrape_options,
|
|
@@ -1176,7 +1194,7 @@ class FirecrawlApp:
|
|
|
1176
1194
|
**kwargs
|
|
1177
1195
|
)
|
|
1178
1196
|
if crawl_response.success and crawl_response.id:
|
|
1179
|
-
return
|
|
1197
|
+
return V1CrawlWatcher(crawl_response.id, self)
|
|
1180
1198
|
else:
|
|
1181
1199
|
raise Exception("Crawl job failed to start")
|
|
1182
1200
|
|
|
@@ -1189,9 +1207,9 @@ class FirecrawlApp:
|
|
|
1189
1207
|
include_subdomains: Optional[bool] = None,
|
|
1190
1208
|
sitemap_only: Optional[bool] = None,
|
|
1191
1209
|
limit: Optional[int] = None,
|
|
1192
|
-
timeout: Optional[int] =
|
|
1210
|
+
timeout: Optional[int] = 30000,
|
|
1193
1211
|
use_index: Optional[bool] = None,
|
|
1194
|
-
**kwargs) ->
|
|
1212
|
+
**kwargs) -> V1MapResponse:
|
|
1195
1213
|
"""
|
|
1196
1214
|
Map and discover links from a URL.
|
|
1197
1215
|
|
|
@@ -1206,7 +1224,7 @@ class FirecrawlApp:
|
|
|
1206
1224
|
**kwargs: Additional parameters to pass to the API
|
|
1207
1225
|
|
|
1208
1226
|
Returns:
|
|
1209
|
-
|
|
1227
|
+
V1MapResponse: Response containing:
|
|
1210
1228
|
* success (bool): Whether request succeeded
|
|
1211
1229
|
* links (List[str]): Discovered URLs
|
|
1212
1230
|
* error (Optional[str]): Error message if any
|
|
@@ -1241,8 +1259,8 @@ class FirecrawlApp:
|
|
|
1241
1259
|
_integration = map_params.get('integration')
|
|
1242
1260
|
|
|
1243
1261
|
# Create final params object
|
|
1244
|
-
final_params =
|
|
1245
|
-
params_dict = final_params.dict(exclude_none=True)
|
|
1262
|
+
final_params = V1MapParams(**map_params)
|
|
1263
|
+
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
1246
1264
|
params_dict['url'] = url
|
|
1247
1265
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
1248
1266
|
|
|
@@ -1260,7 +1278,7 @@ class FirecrawlApp:
|
|
|
1260
1278
|
try:
|
|
1261
1279
|
response_json = response.json()
|
|
1262
1280
|
if response_json.get('success') and 'links' in response_json:
|
|
1263
|
-
return
|
|
1281
|
+
return V1MapResponse(**response_json)
|
|
1264
1282
|
elif "error" in response_json:
|
|
1265
1283
|
raise Exception(f'Map failed. Error: {response_json["error"]}')
|
|
1266
1284
|
else:
|
|
@@ -1280,23 +1298,23 @@ class FirecrawlApp:
|
|
|
1280
1298
|
exclude_tags: Optional[List[str]] = None,
|
|
1281
1299
|
only_main_content: Optional[bool] = None,
|
|
1282
1300
|
wait_for: Optional[int] = None,
|
|
1283
|
-
timeout: Optional[int] =
|
|
1284
|
-
location: Optional[
|
|
1301
|
+
timeout: Optional[int] = 30000,
|
|
1302
|
+
location: Optional[V1LocationConfig] = None,
|
|
1285
1303
|
mobile: Optional[bool] = None,
|
|
1286
1304
|
skip_tls_verification: Optional[bool] = None,
|
|
1287
1305
|
remove_base64_images: Optional[bool] = None,
|
|
1288
1306
|
block_ads: Optional[bool] = None,
|
|
1289
1307
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
1290
|
-
extract: Optional[
|
|
1291
|
-
json_options: Optional[
|
|
1292
|
-
actions: Optional[List[Union[
|
|
1293
|
-
agent: Optional[
|
|
1308
|
+
extract: Optional[V1JsonConfig] = None,
|
|
1309
|
+
json_options: Optional[V1JsonConfig] = None,
|
|
1310
|
+
actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
|
|
1311
|
+
agent: Optional[V1AgentOptions] = None,
|
|
1294
1312
|
poll_interval: Optional[int] = 2,
|
|
1295
1313
|
max_concurrency: Optional[int] = None,
|
|
1296
1314
|
zero_data_retention: Optional[bool] = None,
|
|
1297
1315
|
idempotency_key: Optional[str] = None,
|
|
1298
1316
|
**kwargs
|
|
1299
|
-
) ->
|
|
1317
|
+
) -> V1BatchScrapeStatusResponse:
|
|
1300
1318
|
"""
|
|
1301
1319
|
Batch scrape multiple URLs and monitor until completion.
|
|
1302
1320
|
|
|
@@ -1325,7 +1343,7 @@ class FirecrawlApp:
|
|
|
1325
1343
|
**kwargs: Additional parameters to pass to the API
|
|
1326
1344
|
|
|
1327
1345
|
Returns:
|
|
1328
|
-
|
|
1346
|
+
V1BatchScrapeStatusResponse with:
|
|
1329
1347
|
* Scraping status and progress
|
|
1330
1348
|
* Scraped content for each URL
|
|
1331
1349
|
* Success/error information
|
|
@@ -1354,7 +1372,7 @@ class FirecrawlApp:
|
|
|
1354
1372
|
if timeout is not None:
|
|
1355
1373
|
scrape_params['timeout'] = timeout
|
|
1356
1374
|
if location is not None:
|
|
1357
|
-
scrape_params['location'] = location.dict(exclude_none=True)
|
|
1375
|
+
scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
|
|
1358
1376
|
if mobile is not None:
|
|
1359
1377
|
scrape_params['mobile'] = mobile
|
|
1360
1378
|
if skip_tls_verification is not None:
|
|
@@ -1369,16 +1387,16 @@ class FirecrawlApp:
|
|
|
1369
1387
|
extract = self._ensure_schema_dict(extract)
|
|
1370
1388
|
if isinstance(extract, dict) and "schema" in extract:
|
|
1371
1389
|
extract["schema"] = self._ensure_schema_dict(extract["schema"])
|
|
1372
|
-
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
|
|
1390
|
+
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
|
|
1373
1391
|
if json_options is not None:
|
|
1374
1392
|
json_options = self._ensure_schema_dict(json_options)
|
|
1375
1393
|
if isinstance(json_options, dict) and "schema" in json_options:
|
|
1376
1394
|
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
|
|
1377
|
-
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
|
|
1395
|
+
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
|
|
1378
1396
|
if actions:
|
|
1379
|
-
scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
|
|
1397
|
+
scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
|
|
1380
1398
|
if agent is not None:
|
|
1381
|
-
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1399
|
+
scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
|
|
1382
1400
|
if max_concurrency is not None:
|
|
1383
1401
|
scrape_params['maxConcurrency'] = max_concurrency
|
|
1384
1402
|
if zero_data_retention is not None:
|
|
@@ -1388,8 +1406,8 @@ class FirecrawlApp:
|
|
|
1388
1406
|
scrape_params.update(kwargs)
|
|
1389
1407
|
|
|
1390
1408
|
# Create final params object
|
|
1391
|
-
final_params =
|
|
1392
|
-
params_dict = final_params.dict(exclude_none=True)
|
|
1409
|
+
final_params = V1ScrapeParams(**scrape_params)
|
|
1410
|
+
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
1393
1411
|
params_dict['urls'] = urls
|
|
1394
1412
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
1395
1413
|
|
|
@@ -1421,22 +1439,22 @@ class FirecrawlApp:
|
|
|
1421
1439
|
exclude_tags: Optional[List[str]] = None,
|
|
1422
1440
|
only_main_content: Optional[bool] = None,
|
|
1423
1441
|
wait_for: Optional[int] = None,
|
|
1424
|
-
timeout: Optional[int] =
|
|
1425
|
-
location: Optional[
|
|
1442
|
+
timeout: Optional[int] = 30000,
|
|
1443
|
+
location: Optional[V1LocationConfig] = None,
|
|
1426
1444
|
mobile: Optional[bool] = None,
|
|
1427
1445
|
skip_tls_verification: Optional[bool] = None,
|
|
1428
1446
|
remove_base64_images: Optional[bool] = None,
|
|
1429
1447
|
block_ads: Optional[bool] = None,
|
|
1430
1448
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
1431
|
-
extract: Optional[
|
|
1432
|
-
json_options: Optional[
|
|
1433
|
-
actions: Optional[List[Union[
|
|
1434
|
-
agent: Optional[
|
|
1449
|
+
extract: Optional[V1JsonConfig] = None,
|
|
1450
|
+
json_options: Optional[V1JsonConfig] = None,
|
|
1451
|
+
actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
|
|
1452
|
+
agent: Optional[V1AgentOptions] = None,
|
|
1435
1453
|
max_concurrency: Optional[int] = None,
|
|
1436
1454
|
idempotency_key: Optional[str] = None,
|
|
1437
1455
|
zero_data_retention: Optional[bool] = None,
|
|
1438
1456
|
**kwargs
|
|
1439
|
-
) ->
|
|
1457
|
+
) -> V1BatchScrapeResponse:
|
|
1440
1458
|
"""
|
|
1441
1459
|
Initiate a batch scrape job asynchronously.
|
|
1442
1460
|
|
|
@@ -1465,7 +1483,7 @@ class FirecrawlApp:
|
|
|
1465
1483
|
**kwargs: Additional parameters to pass to the API
|
|
1466
1484
|
|
|
1467
1485
|
Returns:
|
|
1468
|
-
|
|
1486
|
+
V1BatchScrapeResponse with:
|
|
1469
1487
|
* success - Whether job started successfully
|
|
1470
1488
|
* id - Unique identifier for the job
|
|
1471
1489
|
* url - Status check URL
|
|
@@ -1495,7 +1513,7 @@ class FirecrawlApp:
|
|
|
1495
1513
|
if timeout is not None:
|
|
1496
1514
|
scrape_params['timeout'] = timeout
|
|
1497
1515
|
if location is not None:
|
|
1498
|
-
scrape_params['location'] = location.dict(exclude_none=True)
|
|
1516
|
+
scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
|
|
1499
1517
|
if mobile is not None:
|
|
1500
1518
|
scrape_params['mobile'] = mobile
|
|
1501
1519
|
if skip_tls_verification is not None:
|
|
@@ -1510,16 +1528,16 @@ class FirecrawlApp:
|
|
|
1510
1528
|
extract = self._ensure_schema_dict(extract)
|
|
1511
1529
|
if isinstance(extract, dict) and "schema" in extract:
|
|
1512
1530
|
extract["schema"] = self._ensure_schema_dict(extract["schema"])
|
|
1513
|
-
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
|
|
1531
|
+
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
|
|
1514
1532
|
if json_options is not None:
|
|
1515
1533
|
json_options = self._ensure_schema_dict(json_options)
|
|
1516
1534
|
if isinstance(json_options, dict) and "schema" in json_options:
|
|
1517
1535
|
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
|
|
1518
|
-
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
|
|
1536
|
+
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
|
|
1519
1537
|
if actions:
|
|
1520
|
-
scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
|
|
1538
|
+
scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
|
|
1521
1539
|
if agent is not None:
|
|
1522
|
-
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1540
|
+
scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
|
|
1523
1541
|
if max_concurrency is not None:
|
|
1524
1542
|
scrape_params['maxConcurrency'] = max_concurrency
|
|
1525
1543
|
if zero_data_retention is not None:
|
|
@@ -1529,8 +1547,8 @@ class FirecrawlApp:
|
|
|
1529
1547
|
scrape_params.update(kwargs)
|
|
1530
1548
|
|
|
1531
1549
|
# Create final params object
|
|
1532
|
-
final_params =
|
|
1533
|
-
params_dict = final_params.dict(exclude_none=True)
|
|
1550
|
+
final_params = V1ScrapeParams(**scrape_params)
|
|
1551
|
+
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
1534
1552
|
params_dict['urls'] = urls
|
|
1535
1553
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
1536
1554
|
|
|
@@ -1545,7 +1563,7 @@ class FirecrawlApp:
|
|
|
1545
1563
|
|
|
1546
1564
|
if response.status_code == 200:
|
|
1547
1565
|
try:
|
|
1548
|
-
return
|
|
1566
|
+
return V1BatchScrapeResponse(**response.json())
|
|
1549
1567
|
except:
|
|
1550
1568
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
1551
1569
|
else:
|
|
@@ -1561,22 +1579,22 @@ class FirecrawlApp:
 exclude_tags: Optional[List[str]] = None,
 only_main_content: Optional[bool] = None,
 wait_for: Optional[int] = None,
-timeout: Optional[int] =
-location: Optional[
+timeout: Optional[int] = 30000,
+location: Optional[V1LocationConfig] = None,
 mobile: Optional[bool] = None,
 skip_tls_verification: Optional[bool] = None,
 remove_base64_images: Optional[bool] = None,
 block_ads: Optional[bool] = None,
 proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
-extract: Optional[
-json_options: Optional[
-actions: Optional[List[Union[
-agent: Optional[
+extract: Optional[V1JsonConfig] = None,
+json_options: Optional[V1JsonConfig] = None,
+actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
+agent: Optional[V1AgentOptions] = None,
 max_concurrency: Optional[int] = None,
 zero_data_retention: Optional[bool] = None,
 idempotency_key: Optional[str] = None,
 **kwargs
-) -> '
+) -> 'V1CrawlWatcher':
 """
 Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.

@@ -1605,7 +1623,7 @@ class FirecrawlApp:
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1CrawlWatcher: An instance to monitor the batch scrape job via WebSocket

 Raises:
 Exception: If batch scrape job fails to start
@@ -1631,7 +1649,7 @@ class FirecrawlApp:
 if timeout is not None:
 scrape_params['timeout'] = timeout
 if location is not None:
-scrape_params['location'] = location.dict(exclude_none=True)
+scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
 if mobile is not None:
 scrape_params['mobile'] = mobile
 if skip_tls_verification is not None:
@@ -1646,16 +1664,16 @@ class FirecrawlApp:
 extract = self._ensure_schema_dict(extract)
 if isinstance(extract, dict) and "schema" in extract:
 extract["schema"] = self._ensure_schema_dict(extract["schema"])
-scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
+scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
 if json_options is not None:
 json_options = self._ensure_schema_dict(json_options)
 if isinstance(json_options, dict) and "schema" in json_options:
 json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
-scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
+scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
 if actions:
-scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
+scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
 if agent is not None:
-scrape_params['agent'] = agent.dict(exclude_none=True)
+scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
 if max_concurrency is not None:
 scrape_params['maxConcurrency'] = max_concurrency
 if zero_data_retention is not None:
@@ -1665,8 +1683,8 @@ class FirecrawlApp:
 scrape_params.update(kwargs)

 # Create final params object
-final_params =
-params_dict = final_params.dict(exclude_none=True)
+final_params = V1ScrapeParams(**scrape_params)
+params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['urls'] = urls
 params_dict['origin'] = f"python-sdk@{version}"

@@ -1681,9 +1699,9 @@ class FirecrawlApp:

 if response.status_code == 200:
 try:
-crawl_response =
+crawl_response = V1BatchScrapeResponse(**response.json())
 if crawl_response.success and crawl_response.id:
-return
+return V1CrawlWatcher(crawl_response.id, self)
 else:
 raise Exception("Batch scrape job failed to start")
 except:
@@ -1691,7 +1709,7 @@ class FirecrawlApp:
 else:
 self._handle_error(response, 'start batch scrape job')

-def check_batch_scrape_status(self, id: str) ->
+def check_batch_scrape_status(self, id: str) -> V1BatchScrapeStatusResponse:
 """
 Check the status of a batch scrape job using the Firecrawl API.

@@ -1699,7 +1717,7 @@ class FirecrawlApp:
 id (str): The ID of the batch scrape job.

 Returns:
-
+V1BatchScrapeStatusResponse: The status of the batch scrape job.

 Raises:
 Exception: If the status check request fails.
@@ -1739,7 +1757,7 @@ class FirecrawlApp:
 break
 status_data['data'] = data

-return
+return V1BatchScrapeStatusResponse(**{
 'success': False if 'error' in status_data else True,
 'status': status_data.get('status'),
 'total': status_data.get('total'),
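Because `check_batch_scrape_status` now returns a typed `V1BatchScrapeStatusResponse` instead of a raw dict, callers read attributes rather than dictionary keys. A hedged usage sketch; the `firecrawl.v1` import path and the job id are assumptions based on this diff, not quoted from the package docs:

```python
# Hedged usage sketch: attribute access on the typed status response.
# The import path and the job id below are assumptions, not documented API.
import os
from firecrawl.v1 import V1FirecrawlApp  # assumed export location

app = V1FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
status = app.check_batch_scrape_status("batch-job-id")  # hypothetical job id
if status.success and status.status == "completed":
    print(f"{status.completed}/{status.total} pages scraped")
```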
@@ -1753,7 +1771,7 @@ class FirecrawlApp:
 else:
 self._handle_error(response, 'check batch scrape status')

-def check_batch_scrape_errors(self, id: str) ->
+def check_batch_scrape_errors(self, id: str) -> V1CrawlErrorsResponse:
 """
 Returns information about batch scrape errors.

@@ -1761,7 +1779,7 @@ class FirecrawlApp:
 id (str): The ID of the crawl job.

 Returns:
-
+V1CrawlErrorsResponse containing:
 * errors (List[Dict[str, str]]): List of errors with fields:
 * id (str): Error ID
 * timestamp (str): When the error occurred
@@ -1776,7 +1794,7 @@ class FirecrawlApp:
 response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
 if response.status_code == 200:
 try:
-return
+return V1CrawlErrorsResponse(**response.json())
 except:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 else:
@@ -1793,7 +1811,7 @@ class FirecrawlApp:
 enable_web_search: Optional[bool] = False,
 show_sources: Optional[bool] = False,
 agent: Optional[Dict[str, Any]] = None,
-**kwargs) ->
+**kwargs) -> V1ExtractResponse[Any]:
 """
 Extract structured information from URLs.

@@ -1809,7 +1827,7 @@ class FirecrawlApp:
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1ExtractResponse[Any] with:
 * success (bool): Whether request succeeded
 * data (Optional[Any]): Extracted data matching schema
 * error (Optional[str]): Error message if any
@@ -1881,7 +1899,7 @@ class FirecrawlApp:
 except:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 if status_data['status'] == 'completed':
-return
+return V1ExtractResponse(**status_data)
 elif status_data['status'] in ['failed', 'cancelled']:
 raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
 else:
@@ -1895,9 +1913,9 @@ class FirecrawlApp:
 except Exception as e:
 raise ValueError(str(e), 500)

-return
+return V1ExtractResponse(success=False, error="Internal server error.")

-def get_extract_status(self, job_id: str) ->
+def get_extract_status(self, job_id: str) -> V1ExtractResponse[Any]:
 """
 Retrieve the status of an extract job.

@@ -1915,7 +1933,7 @@ class FirecrawlApp:
 response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
 if response.status_code == 200:
 try:
-return
+return V1ExtractResponse(**response.json())
 except:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 else:
@@ -1933,7 +1951,7 @@ class FirecrawlApp:
 allow_external_links: Optional[bool] = False,
 enable_web_search: Optional[bool] = False,
 show_sources: Optional[bool] = False,
-agent: Optional[Dict[str, Any]] = None) ->
+agent: Optional[Dict[str, Any]] = None) -> V1ExtractResponse[Any]:
 """
 Initiate an asynchronous extract job.

@@ -1983,7 +2001,7 @@ class FirecrawlApp:
 response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
 if response.status_code == 200:
 try:
-return
+return V1ExtractResponse(**response.json())
 except:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 else:
@@ -1998,7 +2016,7 @@ class FirecrawlApp:
 max_urls: Optional[int] = None,
 show_full_text: Optional[bool] = None,
 cache: Optional[bool] = None,
-experimental_stream: Optional[bool] = None) ->
+experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextStatusResponse:
 """
 Generate LLMs.txt for a given URL and poll until completion.

@@ -2019,7 +2037,7 @@ class FirecrawlApp:
 Raises:
 Exception: If generation fails
 """
-params =
+params = V1GenerateLLMsTextParams(
 maxUrls=max_urls,
 showFullText=show_full_text,
 cache=cache,
@@ -2035,7 +2053,7 @@ class FirecrawlApp:
 )

 if not response.success or not response.id:
-return
+return V1GenerateLLMsTextStatusResponse(
 success=False,
 error='Failed to start LLMs.txt generation',
 status='failed',
@@ -2051,7 +2069,7 @@ class FirecrawlApp:
 elif status.status == 'failed':
 return status
 elif status.status != 'processing':
-return
+return V1GenerateLLMsTextStatusResponse(
 success=False,
 error='LLMs.txt generation job terminated unexpectedly',
 status='failed',
@@ -2067,7 +2085,7 @@ class FirecrawlApp:
 max_urls: Optional[int] = None,
 show_full_text: Optional[bool] = None,
 cache: Optional[bool] = None,
-experimental_stream: Optional[bool] = None) ->
+experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextResponse:
 """
 Initiate an asynchronous LLMs.txt generation operation.

@@ -2087,7 +2105,7 @@ class FirecrawlApp:
 Raises:
 Exception: If the generation job initiation fails.
 """
-params =
+params = V1GenerateLLMsTextParams(
 maxUrls=max_urls,
 showFullText=show_full_text,
 cache=cache,
@@ -2095,7 +2113,7 @@ class FirecrawlApp:
 )

 headers = self._prepare_headers()
-json_data = {'url': url, **params.dict(exclude_none=True)}
+json_data = {'url': url, **params.dict(by_alias=True, exclude_none=True)}
 json_data['origin'] = f"python-sdk@{version}"

 try:
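The LLMs.txt request body is assembled by dumping the params model with `by_alias=True, exclude_none=True` and merging in the target URL, so only explicitly set options reach the API. A rough sketch of that assembly with a stand-in model (the real `V1GenerateLLMsTextParams` may define more fields and aliases):

```python
# Stand-in model mirroring the payload assembly above; the field set here is
# illustrative, not the SDK's actual V1GenerateLLMsTextParams definition.
from typing import Optional
from pydantic import BaseModel

class GenerateLLMsTextParamsSketch(BaseModel):
    maxUrls: Optional[int] = None
    showFullText: Optional[bool] = None
    cache: Optional[bool] = None

params = GenerateLLMsTextParamsSketch(maxUrls=10, showFullText=True)
json_data = {"url": "https://example.com", **params.dict(by_alias=True, exclude_none=True)}
print(json_data)  # {'url': 'https://example.com', 'maxUrls': 10, 'showFullText': True}
```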
@@ -2105,7 +2123,7 @@ class FirecrawlApp:
 print("response", response)
 if response.get('success'):
 try:
-return
+return V1GenerateLLMsTextResponse(**response)
 except:
 raise Exception('Failed to parse Firecrawl response as JSON.')
 else:
@@ -2113,12 +2131,12 @@ class FirecrawlApp:
 except Exception as e:
 raise ValueError(str(e))

-return
+return V1GenerateLLMsTextResponse(
 success=False,
 error='Internal server error'
 )

-def check_generate_llms_text_status(self, id: str) ->
+def check_generate_llms_text_status(self, id: str) -> V1GenerateLLMsTextStatusResponse:
 """
 Check the status of a LLMs.txt generation operation.

@@ -2144,7 +2162,7 @@ class FirecrawlApp:
 if response.status_code == 200:
 try:
 json_data = response.json()
-return
+return V1GenerateLLMsTextStatusResponse(**json_data)
 except Exception as e:
 raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
 elif response.status_code == 404:
@@ -2154,7 +2172,7 @@ class FirecrawlApp:
 except Exception as e:
 raise ValueError(str(e))

-return
+return V1GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')

 def _prepare_headers(
 self,
@@ -2204,7 +2222,7 @@ class FirecrawlApp:
 requests.RequestException: If the request fails after the specified retries.
 """
 for attempt in range(retries):
-response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] +
+response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] / 1000.0 + 5) if "timeout" in data and data["timeout"] is not None else None))
 if response.status_code == 502:
 time.sleep(backoff_factor * (2 ** attempt))
 else:
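The retry loop now converts the SDK-level `timeout`, which is expressed in milliseconds, into the seconds that `requests.post(..., timeout=...)` expects, adding a five-second cushion for network overhead; when no timeout is supplied the request stays unbounded. A small worked example of the same arithmetic:

```python
# Worked example of the timeout conversion used above (values are illustrative).
from typing import Optional

def request_timeout_seconds(data: dict) -> Optional[float]:
    """Convert an API-level millisecond timeout to a requests-level second timeout."""
    if "timeout" in data and data["timeout"] is not None:
        return data["timeout"] / 1000.0 + 5  # 5 s cushion on top of the scrape budget
    return None  # no client-side timeout

print(request_timeout_seconds({"timeout": 30000}))  # 35.0
print(request_timeout_seconds({}))                  # None
```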
@@ -2273,7 +2291,7 @@ class FirecrawlApp:
 self,
 id: str,
 headers: Dict[str, str],
-poll_interval: int) ->
+poll_interval: int) -> V1CrawlStatusResponse:
 """
 Monitor the status of a crawl job until completion.

@@ -2310,7 +2328,7 @@ class FirecrawlApp:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 data.extend(status_data.get('data', []))
 status_data['data'] = data
-return
+return V1CrawlStatusResponse(**status_data)
 else:
 raise Exception('Crawl job completed but no data was returned')
 elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
@@ -2336,10 +2354,22 @@ class FirecrawlApp:
 Exception: An exception with a message containing the status code and error details from the response.
 """
 try:
-
-
+response_json = response.json()
+error_message = response_json.get('error', 'No error message provided.')
+error_details = response_json.get('details', 'No additional error details provided.')
 except:
-
+# If we can't parse JSON, provide a helpful error message with response content
+try:
+response_text = response.text[:500]  # Limit to first 500 chars
+if response_text.strip():
+error_message = f"Server returned non-JSON response: {response_text}"
+error_details = f"Full response status: {response.status_code}"
+else:
+error_message = f"Server returned empty response with status {response.status_code}"
+error_details = "No additional details available"
+except ValueError:
+error_message = f"Server returned unreadable response with status {response.status_code}"
+error_details = "No additional details available"

 message = self._get_error_message(response.status_code, action, error_message, error_details)

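`_handle_error` no longer assumes the failure body is JSON: it tries `response.json()` first, then falls back to a truncated text preview, and finally to a status-code-only message. A condensed sketch of that fallback ladder, written against a duck-typed `response` object:

```python
# Condensed sketch of the fallback ladder used by _handle_error above.
def extract_error(response) -> tuple:
    """Return (error_message, error_details) from a failed HTTP response."""
    try:
        body = response.json()
        return (body.get("error", "No error message provided."),
                body.get("details", "No additional error details provided."))
    except Exception:
        text = response.text[:500]  # preview only, avoid dumping huge bodies
        if text.strip():
            return (f"Server returned non-JSON response: {text}",
                    f"Full response status: {response.status_code}")
        return (f"Server returned empty response with status {response.status_code}",
                "No additional details available")
```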
@@ -2362,7 +2392,7 @@ class FirecrawlApp:
 if status_code == 402:
 return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
 elif status_code == 403:
-
+return f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
 elif status_code == 408:
 return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
 elif status_code == 409:
@@ -2383,7 +2413,7 @@ class FirecrawlApp:
 system_prompt: Optional[str] = None,
 __experimental_stream_steps: Optional[bool] = None,
 on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
-on_source: Optional[Callable[[Dict[str, Any]], None]] = None) ->
+on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> V1DeepResearchStatusResponse:
 """
 Initiates a deep research operation on a given query and polls until completion.

@@ -2425,7 +2455,7 @@ class FirecrawlApp:
 research_params['systemPrompt'] = system_prompt
 if __experimental_stream_steps is not None:
 research_params['__experimental_streamSteps'] = __experimental_stream_steps
-research_params =
+research_params = V1DeepResearchParams(**research_params)

 response = self.async_deep_research(
 query,
@@ -2512,11 +2542,11 @@ class FirecrawlApp:
 research_params['systemPrompt'] = system_prompt
 if __experimental_stream_steps is not None:
 research_params['__experimental_streamSteps'] = __experimental_stream_steps
-research_params =
+research_params = V1DeepResearchParams(**research_params)

 headers = self._prepare_headers()

-json_data = {'query': query, **research_params.dict(exclude_none=True)}
+json_data = {'query': query, **research_params.dict(by_alias=True, exclude_none=True)}
 json_data['origin'] = f"python-sdk@{version}"

 # Handle json options schema if present
@@ -2539,7 +2569,7 @@ class FirecrawlApp:

 return {'success': False, 'error': 'Internal server error'}

-def check_deep_research_status(self, id: str) ->
+def check_deep_research_status(self, id: str) -> V1DeepResearchStatusResponse:
 """
 Check the status of a deep research operation.

@@ -2650,19 +2680,19 @@ class FirecrawlApp:
 return [self._ensure_schema_dict(v) for v in schema]
 return schema

-class
+class V1CrawlWatcher:
 """
 A class to watch and handle crawl job events via WebSocket connection.

 Attributes:
 id (str): The ID of the crawl job to watch
-app (
+app (V1FirecrawlApp): The V1FirecrawlApp instance
 data (List[Dict[str, Any]]): List of crawled documents/data
 status (str): Current status of the crawl job
 ws_url (str): WebSocket URL for the crawl job
 event_handlers (dict): Dictionary of event type to list of handler functions
 """
-def __init__(self, id: str, app:
+def __init__(self, id: str, app: V1FirecrawlApp):
 self.id = id
 self.app = app
 self.data: List[Dict[str, Any]] = []
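Only the name changes here (`CrawlWatcher` becomes `V1CrawlWatcher`); the event model stays the same, with handlers kept per event type in `event_handlers` and fired through `dispatch_event`. A hedged sketch of reacting to `document` events; registering directly on `event_handlers` is inferred from the attributes documented above, not from a documented public method:

```python
# Hedged sketch: reacting to 'document' events from a V1CrawlWatcher.
# `app` is assumed to be a V1FirecrawlApp instance (see the earlier sketch);
# appending to event_handlers is inferred from the dict documented above.
def on_document(event: dict) -> None:
    print("scraped a document for job", event["id"])

watcher = app.batch_scrape_urls_and_watch(["https://example.com"])  # returns V1CrawlWatcher
watcher.event_handlers.setdefault("document", []).append(on_document)
```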
@@ -2741,12 +2771,16 @@ class CrawlWatcher:
 self.data.append(msg['data'])
 self.dispatch_event('document', {'data': msg['data'], 'id': self.id})

-class
+class AsyncV1FirecrawlApp(V1FirecrawlApp):
 """
-Asynchronous version of
-Provides non-blocking alternatives to all
+Asynchronous version of V1FirecrawlApp that implements async methods using aiohttp.
+Provides non-blocking alternatives to all V1FirecrawlApp operations.
 """

+def __init__(self, api_key: str, api_url: str = "https://api.firecrawl.dev"):
+# Reuse V1 helpers (_prepare_headers, _validate_kwargs, _ensure_schema_dict, _get_error_message)
+super().__init__(api_key=api_key, api_url=api_url)
+
 async def _async_request(
 self,
 method: str,
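The async client now gets an explicit `__init__` that simply defers to the V1 base class, so header preparation, kwargs validation, and schema coercion are shared with the synchronous client. A hedged usage sketch; the import path and the `formats` keyword are assumptions, while the `markdown` attribute follows the docstrings below:

```python
# Hedged usage sketch of the async v1 client; import path and formats kwarg assumed.
import asyncio
import os
from firecrawl.v1 import AsyncV1FirecrawlApp  # assumed export location

async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
    result = await app.scrape_url("https://example.com", formats=["markdown"])
    print((result.markdown or "")[:200])

asyncio.run(main())
```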
@@ -2882,14 +2916,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
 async def crawl_url_and_watch(
 self,
 url: str,
-params: Optional[
-idempotency_key: Optional[str] = None) -> '
+params: Optional[V1CrawlParams] = None,
+idempotency_key: Optional[str] = None) -> 'AsyncV1CrawlWatcher':
 """
-Initiate an async crawl job and return an
+Initiate an async crawl job and return an AsyncV1CrawlWatcher to monitor progress via WebSocket.

 Args:
 url (str): Target URL to start crawling from
-params (Optional[
+params (Optional[V1CrawlParams]): See V1CrawlParams model for configuration:
 URL Discovery:
 * includePaths - Patterns of URLs to include
 * excludePaths - Patterns of URLs to exclude
@@ -2912,28 +2946,28 @@ class AsyncFirecrawlApp(FirecrawlApp):
 idempotency_key (Optional[str]): Unique key to prevent duplicate requests

 Returns:
-
+AsyncV1CrawlWatcher: An instance to monitor the crawl job via WebSocket

 Raises:
 Exception: If crawl job fails to start
 """
 crawl_response = await self.async_crawl_url(url, params, idempotency_key)
 if crawl_response.get('success') and 'id' in crawl_response:
-return
+return AsyncV1CrawlWatcher(crawl_response['id'], self)
 else:
 raise Exception("Crawl job failed to start")

 async def batch_scrape_urls_and_watch(
 self,
 urls: List[str],
-params: Optional[
-idempotency_key: Optional[str] = None) -> '
+params: Optional[V1ScrapeParams] = None,
+idempotency_key: Optional[str] = None) -> 'AsyncV1CrawlWatcher':
 """
-Initiate an async batch scrape job and return an
+Initiate an async batch scrape job and return an AsyncV1CrawlWatcher to monitor progress.

 Args:
 urls (List[str]): List of URLs to scrape
-params (Optional[
+params (Optional[V1ScrapeParams]): See V1ScrapeParams model for configuration:

 Content Options:
 * formats - Content formats to retrieve
@@ -2954,14 +2988,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
 idempotency_key (Optional[str]): Unique key to prevent duplicate requests

 Returns:
-
+AsyncV1CrawlWatcher: An instance to monitor the batch scrape job via WebSocket

 Raises:
 Exception: If batch scrape job fails to start
 """
 batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
 if batch_response.get('success') and 'id' in batch_response:
-return
+return AsyncV1CrawlWatcher(batch_response['id'], self)
 else:
 raise Exception("Batch scrape job failed to start")

@@ -2975,18 +3009,18 @@ class AsyncFirecrawlApp(FirecrawlApp):
 exclude_tags: Optional[List[str]] = None,
 only_main_content: Optional[bool] = None,
 wait_for: Optional[int] = None,
-timeout: Optional[int] =
-location: Optional[
+timeout: Optional[int] = 30000,
+location: Optional[V1LocationConfig] = None,
 mobile: Optional[bool] = None,
 skip_tls_verification: Optional[bool] = None,
 remove_base64_images: Optional[bool] = None,
 block_ads: Optional[bool] = None,
 proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
 parse_pdf: Optional[bool] = None,
-extract: Optional[
-json_options: Optional[
-actions: Optional[List[Union[
-**kwargs) ->
+extract: Optional[V1JsonConfig] = None,
+json_options: Optional[V1JsonConfig] = None,
+actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
+**kwargs) -> V1ScrapeResponse[Any]:
 """
 Scrape a single URL asynchronously.

@@ -2999,19 +3033,19 @@ class AsyncFirecrawlApp(FirecrawlApp):
 only_main_content (Optional[bool]): Extract main content only
 wait_for (Optional[int]): Wait for a specific element to appear
 timeout (Optional[int]): Request timeout (ms)
-location (Optional[
+location (Optional[V1LocationConfig]): Location configuration
 mobile (Optional[bool]): Use mobile user agent
 skip_tls_verification (Optional[bool]): Skip TLS verification
 remove_base64_images (Optional[bool]): Remove base64 images
 block_ads (Optional[bool]): Block ads
 proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
-extract (Optional[
-json_options (Optional[
-actions (Optional[List[Union[
+extract (Optional[V1JsonConfig]): Content extraction settings
+json_options (Optional[V1JsonConfig]): JSON extraction settings
+actions (Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]]): Actions to perform
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1ScrapeResponse with:
 * success - Whether scrape was successful
 * markdown - Markdown content if requested
 * html - HTML content if requested
@@ -3052,7 +3086,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 if timeout:
 scrape_params['timeout'] = timeout
 if location:
-scrape_params['location'] = location.dict(exclude_none=True)
+scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
 if mobile is not None:
 scrape_params['mobile'] = mobile
 if skip_tls_verification is not None:
@@ -3069,14 +3103,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
 extract = self._ensure_schema_dict(extract)
 if isinstance(extract, dict) and "schema" in extract:
 extract["schema"] = self._ensure_schema_dict(extract["schema"])
-scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
+scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
 if json_options is not None:
 json_options = self._ensure_schema_dict(json_options)
 if isinstance(json_options, dict) and "schema" in json_options:
 json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
-scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
+scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
 if actions:
-scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
+scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
 if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
 scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
 if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
@@ -3091,7 +3125,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 )

 if response.get('success') and 'data' in response:
-return
+return V1ScrapeResponse(**response['data'])
 elif "error" in response:
 raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
 else:
@@ -3109,21 +3143,21 @@ class AsyncFirecrawlApp(FirecrawlApp):
 exclude_tags: Optional[List[str]] = None,
 only_main_content: Optional[bool] = None,
 wait_for: Optional[int] = None,
-timeout: Optional[int] =
-location: Optional[
+timeout: Optional[int] = 30000,
+location: Optional[V1LocationConfig] = None,
 mobile: Optional[bool] = None,
 skip_tls_verification: Optional[bool] = None,
 remove_base64_images: Optional[bool] = None,
 block_ads: Optional[bool] = None,
 proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
-extract: Optional[
-json_options: Optional[
-actions: Optional[List[Union[
-agent: Optional[
+extract: Optional[V1JsonConfig] = None,
+json_options: Optional[V1JsonConfig] = None,
+actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
+agent: Optional[V1AgentOptions] = None,
 poll_interval: Optional[int] = 2,
 idempotency_key: Optional[str] = None,
 **kwargs
-) ->
+) -> V1BatchScrapeStatusResponse:
 """
 Asynchronously scrape multiple URLs and monitor until completion.

@@ -3151,7 +3185,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1BatchScrapeStatusResponse with:
 * Scraping status and progress
 * Scraped content for each URL
 * Success/error information
@@ -3180,7 +3214,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 if timeout is not None:
 scrape_params['timeout'] = timeout
 if location is not None:
-scrape_params['location'] = location.dict(exclude_none=True)
+scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
 if mobile is not None:
 scrape_params['mobile'] = mobile
 if skip_tls_verification is not None:
@@ -3195,22 +3229,23 @@ class AsyncFirecrawlApp(FirecrawlApp):
 extract = self._ensure_schema_dict(extract)
 if isinstance(extract, dict) and "schema" in extract:
 extract["schema"] = self._ensure_schema_dict(extract["schema"])
-scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
+scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
 if json_options is not None:
 json_options = self._ensure_schema_dict(json_options)
 if isinstance(json_options, dict) and "schema" in json_options:
 json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
-scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
-
+scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
+if actions is not None:
+scrape_params['actions'] = [action.dict(by_alias=True, exclude_none=True) for action in actions]
 if agent is not None:
-scrape_params['agent'] = agent.dict(exclude_none=True)
+scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)

 # Add any additional kwargs
 scrape_params.update(kwargs)

 # Create final params object
-final_params =
-params_dict = final_params.dict(exclude_none=True)
+final_params = V1ScrapeParams(**scrape_params)
+params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['urls'] = urls
 params_dict['origin'] = f"python-sdk@{version}"

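In the async batch path, page `actions` are now serialized explicitly before the params model is built, each dumped with the same `by_alias`/`exclude_none` pair. A small sketch with a stand-in action model (the real `V1WaitAction` may carry more fields and aliases):

```python
# Stand-in sketch of the action serialization added above; the field set is
# illustrative, not the SDK's actual V1WaitAction definition.
from typing import List, Optional
from pydantic import BaseModel

class WaitActionSketch(BaseModel):
    type: str = "wait"
    milliseconds: Optional[int] = None
    selector: Optional[str] = None

actions: List[WaitActionSketch] = [WaitActionSketch(milliseconds=1500)]
serialized = [a.dict(by_alias=True, exclude_none=True) for a in actions]
print(serialized)  # [{'type': 'wait', 'milliseconds': 1500}]
```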
@@ -3247,21 +3282,21 @@ class AsyncFirecrawlApp(FirecrawlApp):
 exclude_tags: Optional[List[str]] = None,
 only_main_content: Optional[bool] = None,
 wait_for: Optional[int] = None,
-timeout: Optional[int] =
-location: Optional[
+timeout: Optional[int] = 30000,
+location: Optional[V1LocationConfig] = None,
 mobile: Optional[bool] = None,
 skip_tls_verification: Optional[bool] = None,
 remove_base64_images: Optional[bool] = None,
 block_ads: Optional[bool] = None,
 proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
-extract: Optional[
-json_options: Optional[
-actions: Optional[List[Union[
-agent: Optional[
+extract: Optional[V1JsonConfig] = None,
+json_options: Optional[V1JsonConfig] = None,
+actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
+agent: Optional[V1AgentOptions] = None,
 zero_data_retention: Optional[bool] = None,
 idempotency_key: Optional[str] = None,
 **kwargs
-) ->
+) -> V1BatchScrapeResponse:
 """
 Initiate a batch scrape job asynchronously.

@@ -3289,7 +3324,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1BatchScrapeResponse with:
 * success - Whether job started successfully
 * id - Unique identifier for the job
 * url - Status check URL
@@ -3319,7 +3354,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 if timeout is not None:
 scrape_params['timeout'] = timeout
 if location is not None:
-scrape_params['location'] = location.dict(exclude_none=True)
+scrape_params['location'] = location.dict(by_alias=True, exclude_none=True)
 if mobile is not None:
 scrape_params['mobile'] = mobile
 if skip_tls_verification is not None:
@@ -3334,16 +3369,16 @@ class AsyncFirecrawlApp(FirecrawlApp):
 extract = self._ensure_schema_dict(extract)
 if isinstance(extract, dict) and "schema" in extract:
 extract["schema"] = self._ensure_schema_dict(extract["schema"])
-scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
+scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(by_alias=True, exclude_none=True)
 if json_options is not None:
 json_options = self._ensure_schema_dict(json_options)
 if isinstance(json_options, dict) and "schema" in json_options:
 json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
-scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
+scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
 if actions:
-scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
+scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
 if agent is not None:
-scrape_params['agent'] = agent.dict(exclude_none=True)
+scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
 if zero_data_retention is not None:
 scrape_params['zeroDataRetention'] = zero_data_retention

@@ -3351,8 +3386,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 scrape_params.update(kwargs)

 # Create final params object
-final_params =
-params_dict = final_params.dict(exclude_none=True)
+final_params = V1ScrapeParams(**scrape_params)
+params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['urls'] = urls
 params_dict['origin'] = f"python-sdk@{version}"

@@ -3371,7 +3406,7 @@ class AsyncFirecrawlApp(FirecrawlApp):

 if response.get('status_code') == 200:
 try:
-return
+return V1BatchScrapeResponse(**response.json())
 except:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 else:
@@ -3390,8 +3425,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 crawl_entire_domain: Optional[bool] = None,
 allow_external_links: Optional[bool] = None,
 ignore_sitemap: Optional[bool] = None,
-scrape_options: Optional[
-webhook: Optional[Union[str,
+scrape_options: Optional[V1ScrapeOptions] = None,
+webhook: Optional[Union[str, V1WebhookConfig]] = None,
 deduplicate_similar_urls: Optional[bool] = None,
 ignore_query_parameters: Optional[bool] = None,
 regex_on_full_url: Optional[bool] = None,
@@ -3400,7 +3435,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 poll_interval: Optional[int] = 2,
 idempotency_key: Optional[str] = None,
 **kwargs
-) ->
+) -> V1CrawlStatusResponse:
 """
 Crawl a website starting from a URL.

@@ -3415,8 +3450,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 crawl_entire_domain (Optional[bool]): Follow parent directory links
 allow_external_links (Optional[bool]): Follow external domain links
 ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-scrape_options (Optional[
-webhook (Optional[Union[str,
+scrape_options (Optional[V1ScrapeOptions]): Page scraping configuration
+webhook (Optional[Union[str, V1WebhookConfig]]): Notification webhook settings
 deduplicate_similar_urls (Optional[bool]): Remove similar URLs
 ignore_query_parameters (Optional[bool]): Ignore URL parameters
 regex_on_full_url (Optional[bool]): Apply regex to full URLs
@@ -3427,7 +3462,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1CrawlStatusResponse with:
 * Crawling status and progress
 * Crawled page contents
 * Success/error information
@@ -3460,7 +3495,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 if ignore_sitemap is not None:
 crawl_params['ignoreSitemap'] = ignore_sitemap
 if scrape_options is not None:
-crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
 if webhook is not None:
 crawl_params['webhook'] = webhook
 if deduplicate_similar_urls is not None:
@@ -3478,8 +3513,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 crawl_params.update(kwargs)

 # Create final params object
-final_params =
-params_dict = final_params.dict(exclude_none=True)
+final_params = V1CrawlParams(**crawl_params)
+params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['url'] = url
 params_dict['origin'] = f"python-sdk@{version}"
 # Make request
@@ -3510,8 +3545,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 crawl_entire_domain: Optional[bool] = None,
 allow_external_links: Optional[bool] = None,
 ignore_sitemap: Optional[bool] = None,
-scrape_options: Optional[
-webhook: Optional[Union[str,
+scrape_options: Optional[V1ScrapeOptions] = None,
+webhook: Optional[Union[str, V1WebhookConfig]] = None,
 deduplicate_similar_urls: Optional[bool] = None,
 ignore_query_parameters: Optional[bool] = None,
 regex_on_full_url: Optional[bool] = None,
@@ -3520,7 +3555,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 poll_interval: Optional[int] = 2,
 idempotency_key: Optional[str] = None,
 **kwargs
-) ->
+) -> V1CrawlResponse:
 """
 Start an asynchronous crawl job.

@@ -3544,7 +3579,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1CrawlResponse with:
 * success - Whether crawl started successfully
 * id - Unique identifier for the crawl job
 * url - Status check URL for the crawl
@@ -3575,7 +3610,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 if ignore_sitemap is not None:
 crawl_params['ignoreSitemap'] = ignore_sitemap
 if scrape_options is not None:
-crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+crawl_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)
 if webhook is not None:
 crawl_params['webhook'] = webhook
 if deduplicate_similar_urls is not None:
@@ -3593,8 +3628,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 crawl_params.update(kwargs)

 # Create final params object
-final_params =
-params_dict = final_params.dict(exclude_none=True)
+final_params = V1CrawlParams(**crawl_params)
+params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['url'] = url
 params_dict['origin'] = f"python-sdk@{version}"

@@ -3608,13 +3643,13 @@ class AsyncFirecrawlApp(FirecrawlApp):

 if response.get('success'):
 try:
-return
+return V1CrawlResponse(**response)
 except:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 else:
 await self._handle_error(response, 'start crawl job')

-async def check_crawl_status(self, id: str) ->
+async def check_crawl_status(self, id: str) -> V1CrawlStatusResponse:
 """
 Check the status and results of an asynchronous crawl job.

@@ -3622,7 +3657,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 id (str): Unique identifier for the crawl job

 Returns:
-
+V1CrawlStatusResponse containing:
 Status Information:
 * status - Current state (scraping/completed/failed/cancelled)
 * completed - Number of pages crawled
@@ -3661,8 +3696,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 data.extend(next_data.get('data', []))
 status_data = next_data
 status_data['data'] = data
-# Create
-response =
+# Create V1CrawlStatusResponse object from status data
+response = V1CrawlStatusResponse(
 status=status_data.get('status'),
 total=status_data.get('total'),
 completed=status_data.get('completed'),
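Large crawls come back in pages: the status call follows the pagination link, concatenates each page's `data`, and only then builds a single typed `V1CrawlStatusResponse`. A condensed sketch of that accumulation; the `next` key name is an assumption, while the `data` merge mirrors the code above:

```python
# Condensed sketch of the page-merging loop that feeds V1CrawlStatusResponse.
# `fetch_json(url)` stands in for the SDK's authenticated GET; the 'next' key
# name is an assumption, the 'data' merge follows the diff above.
def collect_all_pages(fetch_json, first_page: dict) -> dict:
    status_data = first_page
    data = list(first_page.get("data", []))
    while status_data.get("next"):
        status_data = fetch_json(status_data["next"])
        data.extend(status_data.get("data", []))
    status_data["data"] = data
    return status_data
```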
@@ -3680,7 +3715,7 @@ class AsyncFirecrawlApp(FirecrawlApp):

 return response

-async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) ->
+async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> V1CrawlStatusResponse:
 """
 Monitor the status of an asynchronous job until completion.

@@ -3690,7 +3725,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 poll_interval (int): Seconds between status checks (default: 2)

 Returns:
-
+V1CrawlStatusResponse: The job results if completed successfully

 Raises:
 Exception: If the job fails or an error occurs during status checks
@@ -3715,7 +3750,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 data.extend(next_data.get('data', []))
 status_data = next_data
 status_data['data'] = data
-return
+return V1CrawlStatusResponse(**status_data)
 else:
 raise Exception('Job completed but no data was returned')
 elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
@@ -3732,14 +3767,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
 include_subdomains: Optional[bool] = None,
 sitemap_only: Optional[bool] = None,
 limit: Optional[int] = None,
-timeout: Optional[int] =
-params: Optional[
+timeout: Optional[int] = 30000,
+params: Optional[V1MapParams] = None) -> V1MapResponse:
 """
 Asynchronously map and discover links from a URL.

 Args:
 url (str): Target URL to map
-params (Optional[
+params (Optional[V1MapParams]): See V1MapParams model:
 Discovery Options:
 * search - Filter pattern for URLs
 * ignoreSitemap - Skip sitemap.xml
@@ -3751,7 +3786,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 * timeout - Request timeout (ms)

 Returns:
-
+V1MapResponse with:
 * Discovered URLs
 * Success/error status

@@ -3760,7 +3795,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 """
 map_params = {}
 if params:
-map_params.update(params.dict(exclude_none=True))
+map_params.update(params.dict(by_alias=True, exclude_none=True))

 # Add individual parameters
 if search is not None:
@@ -3777,8 +3812,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 map_params['timeout'] = timeout

 # Create final params object
-final_params =
-params_dict = final_params.dict(exclude_none=True)
+final_params = V1MapParams(**map_params)
+params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['url'] = url
 params_dict['origin'] = f"python-sdk@{version}"

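Map requests follow the same pattern: keyword options are folded into a dict, validated through `V1MapParams`, dumped with aliases, and the reply is parsed into `V1MapResponse` when it carries `links`. A hedged usage sketch; the `map_url` method name and import path are assumptions based on this diff:

```python
# Hedged usage sketch of async link discovery; method name and import path assumed.
import asyncio
import os
from firecrawl.v1 import AsyncV1FirecrawlApp  # assumed export location

async def main() -> None:
    app = AsyncV1FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
    mapped = await app.map_url("https://example.com", limit=50)  # method name assumed
    if mapped.success:
        print(len(mapped.links or []), "links discovered")

asyncio.run(main())
```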
@@ -3791,7 +3826,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3791
3826
|
)
|
|
3792
3827
|
|
|
3793
3828
|
if response.get('success') and 'links' in response:
|
|
3794
|
-
return
|
|
3829
|
+
return V1MapResponse(**response)
|
|
3795
3830
|
elif 'error' in response:
|
|
3796
3831
|
raise Exception(f'Failed to map URL. Error: {response["error"]}')
|
|
3797
3832
|
else:
|
|
@@ -3807,7 +3842,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3807
3842
|
allow_external_links: Optional[bool] = False,
|
|
3808
3843
|
enable_web_search: Optional[bool] = False,
|
|
3809
3844
|
show_sources: Optional[bool] = False,
|
|
3810
|
-
agent: Optional[Dict[str, Any]] = None) ->
|
|
3845
|
+
agent: Optional[Dict[str, Any]] = None) -> V1ExtractResponse[Any]:
|
|
3811
3846
|
|
|
3812
3847
|
"""
|
|
3813
3848
|
Asynchronously extract structured information from URLs.
|
|
@@ -3823,7 +3858,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3823
3858
|
agent (Optional[Dict[str, Any]]): Agent configuration
|
|
3824
3859
|
|
|
3825
3860
|
Returns:
|
|
3826
|
-
|
|
3861
|
+
V1ExtractResponse with:
|
|
3827
3862
|
* Structured data matching schema
|
|
3828
3863
|
* Source information if requested
|
|
3829
3864
|
* Success/error status
|
|
@@ -3878,7 +3913,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3878
3913
|
)
|
|
3879
3914
|
|
|
3880
3915
|
if status_data['status'] == 'completed':
|
|
3881
|
-
return
|
|
3916
|
+
return V1ExtractResponse(**status_data)
|
|
3882
3917
|
elif status_data['status'] in ['failed', 'cancelled']:
|
|
3883
3918
|
raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
|
|
3884
3919
|
|
|
@@ -3886,7 +3921,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3886
3921
|
else:
|
|
3887
3922
|
raise Exception(f'Failed to extract. Error: {response.get("error")}')
|
|
3888
3923
|
|
|
3889
|
-
async def check_batch_scrape_status(self, id: str) ->
|
|
3924
|
+
async def check_batch_scrape_status(self, id: str) -> V1BatchScrapeStatusResponse:
|
|
3890
3925
|
"""
|
|
3891
3926
|
Check the status of an asynchronous batch scrape job.
|
|
3892
3927
|
|
|
@@ -3894,7 +3929,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3894
3929
|
id (str): The ID of the batch scrape job
|
|
3895
3930
|
|
|
3896
3931
|
Returns:
|
|
3897
|
-
|
|
3932
|
+
V1BatchScrapeStatusResponse containing:
|
|
3898
3933
|
Status Information:
|
|
3899
3934
|
* status - Current state (scraping/completed/failed/cancelled)
|
|
3900
3935
|
* completed - Number of URLs scraped
|
|
@@ -3934,7 +3969,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3934
3969
|
status_data = next_data
|
|
3935
3970
|
status_data['data'] = data
|
|
3936
3971
|
|
|
3937
|
-
response =
|
|
3972
|
+
response = V1BatchScrapeStatusResponse(
|
|
3938
3973
|
status=status_data.get('status'),
|
|
3939
3974
|
total=status_data.get('total'),
|
|
3940
3975
|
completed=status_data.get('completed'),
|
|
@@ -3954,7 +3989,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3954
3989
|
**response
|
|
3955
3990
|
}
|
|
3956
3991
|
|
|
3957
|
-
async def check_batch_scrape_errors(self, id: str) ->
|
|
3992
|
+
async def check_batch_scrape_errors(self, id: str) -> V1CrawlErrorsResponse:
|
|
3958
3993
|
"""
|
|
3959
3994
|
Get information about errors from an asynchronous batch scrape job.
|
|
3960
3995
|
|
|
@@ -3962,7 +3997,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3962
3997
|
id (str): The ID of the batch scrape job
|
|
3963
3998
|
|
|
3964
3999
|
Returns:
|
|
3965
|
-
|
|
4000
|
+
V1CrawlErrorsResponse containing:
|
|
3966
4001
|
errors (List[Dict[str, str]]): List of errors with fields:
|
|
3967
4002
|
* id (str): Error ID
|
|
3968
4003
|
* timestamp (str): When the error occurred
|
|
@@ -3979,7 +4014,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3979
4014
|
headers
|
|
3980
4015
|
)
|
|
3981
4016
|
|
|
3982
|
-
async def check_crawl_errors(self, id: str) ->
|
|
4017
|
+
async def check_crawl_errors(self, id: str) -> V1CrawlErrorsResponse:
|
|
3983
4018
|
"""
|
|
3984
4019
|
Get information about errors from an asynchronous crawl job.
|
|
3985
4020
|
|
|
@@ -3987,7 +4022,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3987
4022
|
id (str): The ID of the crawl job
|
|
3988
4023
|
|
|
3989
4024
|
Returns:
|
|
3990
|
-
|
|
4025
|
+
V1CrawlErrorsResponse containing:
|
|
3991
4026
|
* errors (List[Dict[str, str]]): List of errors with fields:
|
|
3992
4027
|
- id (str): Error ID
|
|
3993
4028
|
- timestamp (str): When the error occurred
|
|
@@ -4024,7 +4059,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
4024
4059
|
async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
|
|
4025
4060
|
return await response.json()
|
|
4026
4061
|
|
|
4027
|
-
async def get_extract_status(self, job_id: str) ->
|
|
4062
|
+
async def get_extract_status(self, job_id: str) -> V1ExtractResponse[Any]:
|
|
4028
4063
|
"""
|
|
4029
4064
|
Check the status of an asynchronous extraction job.
|
|
4030
4065
|
|
|
@@ -4032,7 +4067,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
4032
4067
|
job_id (str): The ID of the extraction job
|
|
4033
4068
|
|
|
4034
4069
|
Returns:
|
|
4035
|
-
|
|
4070
|
+
V1ExtractResponse[Any] with:
|
|
4036
4071
|
* success (bool): Whether request succeeded
|
|
4037
4072
|
* data (Optional[Any]): Extracted data matching schema
|
|
4038
4073
|
* error (Optional[str]): Error message if any
|
|
@@ -4061,7 +4096,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 allow_external_links: Optional[bool] = False,
 enable_web_search: Optional[bool] = False,
 show_sources: Optional[bool] = False,
-agent: Optional[Dict[str, Any]] = None) ->
+agent: Optional[Dict[str, Any]] = None) -> V1ExtractResponse[Any]:
 """
 Initiate an asynchronous extraction job without waiting for completion.

@@ -4077,7 +4112,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 idempotency_key (Optional[str]): Unique key to prevent duplicate requests

 Returns:
-
+V1ExtractResponse[Any] with:
 * success (bool): Whether request succeeded
 * data (Optional[Any]): Extracted data matching schema
 * error (Optional[str]): Error message if any
@@ -4096,7 +4131,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 if schema:
 schema = self._ensure_schema_dict(schema)

-request_data =
+request_data = V1ExtractResponse(
 urls=urls or [],
 allowExternalLinks=allow_external_links,
 enableWebSearch=enable_web_search,
@@ -4127,7 +4162,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 *,
 max_urls: Optional[int] = None,
 show_full_text: Optional[bool] = None,
-experimental_stream: Optional[bool] = None) ->
+experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextStatusResponse:
 """
 Generate LLMs.txt for a given URL and monitor until completion.

@@ -4138,7 +4173,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 experimental_stream (Optional[bool]): Enable experimental streaming

 Returns:
-
+V1GenerateLLMsTextStatusResponse containing:
 * success (bool): Whether generation completed successfully
 * status (str): Status of generation (processing/completed/failed)
 * data (Dict[str, str], optional): Generated text with fields:
@@ -4162,7 +4197,6 @@ class AsyncFirecrawlApp(FirecrawlApp):
 url,
 max_urls=max_urls,
 show_full_text=show_full_text,
-cache=cache,
 experimental_stream=experimental_stream
 )
 if not response.get('success') or 'id' not in response:
@@ -4181,7 +4215,7 @@ class AsyncFirecrawlApp(FirecrawlApp):

 await asyncio.sleep(2)

-return
+return V1GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly', status='failed', expiresAt='')

 async def async_generate_llms_text(
 self,
@@ -4190,7 +4224,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 max_urls: Optional[int] = None,
 show_full_text: Optional[bool] = None,
 cache: Optional[bool] = None,
-experimental_stream: Optional[bool] = None) ->
+experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextResponse:
 """
 Initiate an asynchronous LLMs.txt generation job without waiting for completion.

@@ -4202,7 +4236,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 experimental_stream (Optional[bool]): Enable experimental streaming

 Returns:
-
+V1GenerateLLMsTextResponse containing:
 * success (bool): Whether job started successfully
 * id (str): Unique identifier for the job
 * error (str, optional): Error message if start failed
@@ -4218,7 +4252,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 if experimental_stream is not None:
 params['__experimental_stream'] = experimental_stream

-params =
+params = V1GenerateLLMsTextParams(
 maxUrls=max_urls,
 showFullText=show_full_text,
 cache=cache,
@@ -4226,7 +4260,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 )

 headers = self._prepare_headers()
-json_data = {'url': url, **params.dict(exclude_none=True)}
+json_data = {'url': url, **params.dict(by_alias=True, exclude_none=True)}
 json_data['origin'] = f"python-sdk@{version}"

 try:
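Several hunks in this region switch request serialization from `params.dict(exclude_none=True)` to `params.dict(by_alias=True, exclude_none=True)`. The practical difference is that aliased fields are emitted under their wire names. A self-contained illustration using the pydantic v1-style `.dict()` API that these calls imply; the model and field names below are illustrative, not the SDK's actual classes:

```python
from typing import Optional
from pydantic import BaseModel, Field

class ExampleParams(BaseModel):
    # Pythonic attribute names with camelCase wire aliases (illustrative only).
    max_urls: Optional[int] = Field(default=None, alias="maxUrls")
    show_full_text: Optional[bool] = Field(default=None, alias="showFullText")

    class Config:
        allow_population_by_field_name = True

p = ExampleParams(max_urls=10)
print(p.dict(exclude_none=True))                 # {'max_urls': 10}  - snake_case keys
print(p.dict(by_alias=True, exclude_none=True))  # {'maxUrls': 10}   - aliases the API expects
```

With `by_alias=True` the serialized payload matches the camelCase keys the API endpoints expect, which is why the `json_data` construction above adopts it.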
@@ -4238,7 +4272,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 except Exception as e:
 raise ValueError(str(e))

-async def check_generate_llms_text_status(self, id: str) ->
+async def check_generate_llms_text_status(self, id: str) -> V1GenerateLLMsTextStatusResponse:
 """
 Check the status of an asynchronous LLMs.txt generation job.

@@ -4246,7 +4280,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 id (str): The ID of the generation job

 Returns:
-
+V1GenerateLLMsTextStatusResponse containing:
 * success (bool): Whether generation completed successfully
 * status (str): Status of generation (processing/completed/failed)
 * data (Dict[str, str], optional): Generated text with fields:
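`check_generate_llms_text_status` is now annotated as returning `V1GenerateLLMsTextStatusResponse`, whose `success`, `status`, `data`, and `error` fields appear in the docstring and in the failure constructor shown earlier. A hedged status-check sketch (key, job ID, and import path are placeholders):

```python
# Hedged sketch; only fields documented in the hunks above are read.
import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")
    status = await app.check_generate_llms_text_status("generation-job-id")
    # status.status is documented as processing/completed/failed.
    if status.status == "completed" and status.data:
        print(status.data)  # generated llms.txt content
    elif status.status == "failed":
        print("generation failed:", status.error)

asyncio.run(main())
```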
@@ -4278,7 +4312,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 system_prompt: Optional[str] = None,
 __experimental_stream_steps: Optional[bool] = None,
 on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
-on_source: Optional[Callable[[Dict[str, Any]], None]] = None) ->
+on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> V1DeepResearchStatusResponse:
 """
 Initiates a deep research operation on a given query and polls until completion.

@@ -4320,7 +4354,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 research_params['systemPrompt'] = system_prompt
 if __experimental_stream_steps is not None:
 research_params['__experimental_streamSteps'] = __experimental_stream_steps
-research_params =
+research_params = V1DeepResearchParams(**research_params)

 response = await self.async_deep_research(
 query,
@@ -4361,7 +4395,7 @@ class AsyncFirecrawlApp(FirecrawlApp):

 await asyncio.sleep(2)

-return
+return V1DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')

 async def async_deep_research(
 self,
@@ -4407,11 +4441,11 @@ class AsyncFirecrawlApp(FirecrawlApp):
 research_params['systemPrompt'] = system_prompt
 if __experimental_stream_steps is not None:
 research_params['__experimental_streamSteps'] = __experimental_stream_steps
-research_params =
+research_params = V1DeepResearchParams(**research_params)

 headers = self._prepare_headers()

-json_data = {'query': query, **research_params.dict(exclude_none=True)}
+json_data = {'query': query, **research_params.dict(by_alias=True, exclude_none=True)}
 json_data['origin'] = f"python-sdk@{version}"

 try:
@@ -4423,7 +4457,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 except Exception as e:
 raise ValueError(str(e))

-async def check_deep_research_status(self, id: str) ->
+async def check_deep_research_status(self, id: str) -> V1DeepResearchStatusResponse:
 """
 Check the status of a deep research operation.

@@ -4467,10 +4501,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
 lang: Optional[str] = None,
 country: Optional[str] = None,
 location: Optional[str] = None,
-timeout: Optional[int] =
-scrape_options: Optional[
-params: Optional[Union[Dict[str, Any],
-**kwargs) ->
+timeout: Optional[int] = 30000,
+scrape_options: Optional[V1ScrapeOptions] = None,
+params: Optional[Union[Dict[str, Any], V1SearchParams]] = None,
+**kwargs) -> V1SearchResponse:
 """
 Asynchronously search for content using Firecrawl.

@@ -4503,7 +4537,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 if isinstance(params, dict):
 search_params.update(params)
 else:
-search_params.update(params.dict(exclude_none=True))
+search_params.update(params.dict(by_alias=True, exclude_none=True))

 # Add individual parameters
 if limit is not None:
@@ -4521,14 +4555,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
 if timeout is not None:
 search_params['timeout'] = timeout
 if scrape_options is not None:
-search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+search_params['scrapeOptions'] = scrape_options.dict(by_alias=True, exclude_none=True)

 # Add any additional kwargs
 search_params.update(kwargs)

 # Create final params object
-final_params =
-params_dict = final_params.dict(exclude_none=True)
+final_params = V1SearchParams(query=query, **search_params)
+params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['origin'] = f"python-sdk@{version}"

 return await self._async_post_request(
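The search path above now defaults `timeout` to 30000 ms, types `scrape_options` and `params` as `V1ScrapeOptions`/`V1SearchParams`, and serializes both with `by_alias=True` so camelCase keys such as `scrapeOptions` reach the API. A hedged call sketch that sticks to the plain-dict `params` branch visible in the hunk; the key, query, keyword names, and import path are assumptions:

```python
# Hedged sketch; parameter names are taken from the hunks above, everything else is a placeholder.
import asyncio
from firecrawl import AsyncFirecrawlApp  # import path assumed

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")
    # Dict params are merged into search_params; model params now go through
    # .dict(by_alias=True, exclude_none=True) as shown above.
    results = await app.search(
        "firecrawl python sdk",
        limit=5,
        params={"scrapeOptions": {"formats": ["markdown"]}},
    )
    print(results)  # annotated as V1SearchResponse in this version

asyncio.run(main())
```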
@@ -4537,11 +4571,11 @@ class AsyncFirecrawlApp(FirecrawlApp):
 {"Authorization": f"Bearer {self.api_key}"}
 )

-class
+class AsyncV1CrawlWatcher(V1CrawlWatcher):
 """
-Async version of
+Async version of V1CrawlWatcher that properly handles async operations.
 """
-def __init__(self, id: str, app:
+def __init__(self, id: str, app: AsyncV1FirecrawlApp):
 super().__init__(id, app)

 async def connect(self) -> None: