firecrawl-py 2.16.5__py3-none-any.whl → 3.0.2__py3-none-any.whl
This diff shows the published contents of the two package versions as they appear in the public registry. It is provided for informational purposes only and reflects the changes between the released versions.
- firecrawl/__init__.py +27 -19
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- firecrawl/__tests__/e2e/v2/test_search.py +265 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +241 -0
- build/lib/firecrawl/firecrawl.py → firecrawl/firecrawl.backup.py +17 -15
- firecrawl/types.py +157 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/{firecrawl.py → v1/client.py} +324 -304
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +802 -0
- firecrawl/v2/client_async.py +250 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/batch.py +85 -0
- firecrawl/v2/methods/aio/crawl.py +174 -0
- firecrawl/v2/methods/aio/extract.py +126 -0
- firecrawl/v2/methods/aio/map.py +59 -0
- firecrawl/v2/methods/aio/scrape.py +36 -0
- firecrawl/v2/methods/aio/search.py +58 -0
- firecrawl/v2/methods/aio/usage.py +42 -0
- firecrawl/v2/methods/batch.py +420 -0
- firecrawl/v2/methods/crawl.py +468 -0
- firecrawl/v2/methods/extract.py +131 -0
- firecrawl/v2/methods/map.py +77 -0
- firecrawl/v2/methods/scrape.py +68 -0
- firecrawl/v2/methods/search.py +173 -0
- firecrawl/v2/methods/usage.py +41 -0
- firecrawl/v2/types.py +546 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +153 -0
- firecrawl/v2/utils/http_client_async.py +64 -0
- firecrawl/v2/utils/validation.py +324 -0
- firecrawl/v2/watcher.py +312 -0
- firecrawl/v2/watcher_async.py +245 -0
- {firecrawl_py-2.16.5.dist-info → firecrawl_py-3.0.2.dist-info}/LICENSE +0 -0
- {firecrawl_py-2.16.5.dist-info → firecrawl_py-3.0.2.dist-info}/METADATA +49 -32
- firecrawl_py-3.0.2.dist-info/RECORD +78 -0
- {firecrawl_py-2.16.5.dist-info → firecrawl_py-3.0.2.dist-info}/top_level.txt +0 -2
- tests/test_timeout_conversion.py +117 -0
- build/lib/firecrawl/__init__.py +0 -79
- build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- build/lib/tests/test_change_tracking.py +0 -98
- firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- firecrawl_py-2.16.5.dist-info/RECORD +0 -19
- {firecrawl_py-2.16.5.dist-info → firecrawl_py-3.0.2.dist-info}/WHEEL +0 -0
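
The listing above shows the 2.x implementation moving under firecrawl/v1/ while a new top-level client (firecrawl/client.py) and a v2 package (firecrawl/v2/) take over the default surface. Based on the v1 module docstring in the diff below, which describes "version-specific access patterns like app.v1.scrape_url()", here is a minimal usage sketch; the Firecrawl class name and the v2 scrape() signature are assumptions, not taken from this diff:

    from firecrawl import Firecrawl  # unified 3.x client; class name assumed

    app = Firecrawl(api_key="fc-YOUR-KEY")

    # v2 surface (assumed): top-level methods backed by firecrawl/v2/methods/*.py
    doc = app.scrape("https://example.com", formats=["markdown"])

    # legacy v1 surface kept for backward compatibility, per the v1 docstring below
    legacy = app.v1.scrape_url("https://example.com", formats=["markdown"])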
@@ -1,13 +1,15 @@
 """
-
+Firecrawl v1 API Client - Legacy Implementation
 
-This module provides
-It
-
-
+This module provides the legacy v1 implementation of the Firecrawl SDK.
+It contains the complete `V1FirecrawlApp` class with all v1 API methods and types
+for backward compatibility. This is used by the unified client to provide
+version-specific access patterns like app.v1.scrape_url().
 
 Classes:
--
+- V1FirecrawlApp: Legacy v1 client for interacting with the Firecrawl API.
+- AsyncV1FirecrawlApp: Async version of the v1 client.
+- CrawlWatcher: WebSocket-based crawl monitoring for v1.
 """
 import logging
 import os
@@ -16,14 +18,13 @@ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar,
 import json
 from datetime import datetime
 import re
-import warnings
 import requests
 import pydantic
 import websockets
 import aiohttp
 import asyncio
-from pydantic import Field
 
+logger : logging.Logger = logging.getLogger("firecrawl")
 
 def get_version():
     try:
@@ -39,11 +40,9 @@ def get_version():
 
 version = get_version()
 
-logger : logging.Logger = logging.getLogger("firecrawl")
-
 T = TypeVar('T')
 
-# class
+# class V1FirecrawlDocumentMetadata(pydantic.BaseModel):
 # """Metadata for a Firecrawl document."""
 # title: Optional[str] = None
 # description: Optional[str] = None
@@ -78,21 +77,21 @@ T = TypeVar('T')
 # statusCode: Optional[int] = None
 # error: Optional[str] = None
 
-class
+class V1AgentOptions(pydantic.BaseModel):
     """Configuration for the agent."""
     model: Literal["FIRE-1"] = "FIRE-1"
    prompt: Optional[str] = None
 
-class
+class V1AgentOptionsExtract(pydantic.BaseModel):
     """Configuration for the agent in extract operations."""
     model: Literal["FIRE-1"] = "FIRE-1"
 
-class
+class V1ActionsResult(pydantic.BaseModel):
     """Result of actions performed during scraping."""
     screenshots: List[str]
     pdfs: List[str]
 
-class
+class V1ChangeTrackingData(pydantic.BaseModel):
     """
     Data for the change tracking format.
     """
@@ -102,7 +101,7 @@ class ChangeTrackingData(pydantic.BaseModel):
     diff: Optional[Dict[str, Any]] = None
     json_field: Optional[Any] = pydantic.Field(None, alias='json')
 
-class
+class V1FirecrawlDocument(pydantic.BaseModel, Generic[T]):
     """Document retrieved or processed by Firecrawl."""
     url: Optional[str] = None
     markdown: Optional[str] = None
@@ -113,31 +112,31 @@ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
     json_field: Optional[T] = pydantic.Field(None, alias='json')
     screenshot: Optional[str] = None
     metadata: Optional[Any] = None
-    actions: Optional[
+    actions: Optional[V1ActionsResult] = None
     title: Optional[str] = None # v1 search only
     description: Optional[str] = None # v1 search only
-    changeTracking: Optional[
+    changeTracking: Optional[V1ChangeTrackingData] = None
 
-class
+class V1LocationConfig(pydantic.BaseModel):
     """Location configuration for scraping."""
     country: Optional[str] = None
     languages: Optional[List[str]] = None
 
-class
+class V1WebhookConfig(pydantic.BaseModel):
     """Configuration for webhooks."""
     url: str
     headers: Optional[Dict[str, str]] = None
     metadata: Optional[Dict[str, str]] = None
     events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
 
-class
+class V1ChangeTrackingOptions(pydantic.BaseModel):
     """Configuration for change tracking."""
     modes: Optional[List[Literal["git-diff", "json"]]] = None
     schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
     prompt: Optional[str] = None
     tag: Optional[str] = None
 
-class
+class V1ScrapeOptions(pydantic.BaseModel):
     """Parameters for scraping operations."""
     formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
     headers: Optional[Dict[str, str]] = None
@@ -145,93 +144,93 @@ class ScrapeOptions(pydantic.BaseModel):
     excludeTags: Optional[List[str]] = None
     onlyMainContent: Optional[bool] = None
     waitFor: Optional[int] = None
-    timeout: Optional[int] =
-    location: Optional[
+    timeout: Optional[int] = 30000
+    location: Optional[V1LocationConfig] = None
     mobile: Optional[bool] = None
     skipTlsVerification: Optional[bool] = None
     removeBase64Images: Optional[bool] = None
     blockAds: Optional[bool] = None
     proxy: Optional[Literal["basic", "stealth", "auto"]] = None
-    changeTrackingOptions: Optional[
+    changeTrackingOptions: Optional[V1ChangeTrackingOptions] = None
     maxAge: Optional[int] = None
     storeInCache: Optional[bool] = None
     parsePDF: Optional[bool] = None
 
-class
+class V1WaitAction(pydantic.BaseModel):
     """Wait action to perform during scraping."""
     type: Literal["wait"]
     milliseconds: Optional[int] = None
     selector: Optional[str] = None
 
-class
+class V1ScreenshotAction(pydantic.BaseModel):
     """Screenshot action to perform during scraping."""
     type: Literal["screenshot"]
     fullPage: Optional[bool] = None
     quality: Optional[int] = None
 
-class
+class V1ClickAction(pydantic.BaseModel):
     """Click action to perform during scraping."""
     type: Literal["click"]
     selector: str
 
-class
+class V1WriteAction(pydantic.BaseModel):
     """Write action to perform during scraping."""
     type: Literal["write"]
     text: str
 
-class
+class V1PressAction(pydantic.BaseModel):
     """Press action to perform during scraping."""
     type: Literal["press"]
     key: str
 
-class
+class V1ScrollAction(pydantic.BaseModel):
     """Scroll action to perform during scraping."""
     type: Literal["scroll"]
     direction: Literal["up", "down"]
     selector: Optional[str] = None
 
-class
+class V1ScrapeAction(pydantic.BaseModel):
     """Scrape action to perform during scraping."""
     type: Literal["scrape"]
 
-class
+class V1ExecuteJavascriptAction(pydantic.BaseModel):
     """Execute javascript action to perform during scraping."""
     type: Literal["executeJavascript"]
     script: str
 
-class
+class V1PDFAction(pydantic.BaseModel):
     """PDF action to perform during scraping."""
     type: Literal["pdf"]
     format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
     landscape: Optional[bool] = None
     scale: Optional[float] = None
 
-class
+class V1ExtractAgent(pydantic.BaseModel):
     """Configuration for the agent in extract operations."""
     model: Literal["FIRE-1"] = "FIRE-1"
 
-class
+class V1JsonConfig(pydantic.BaseModel):
     """Configuration for extraction."""
     prompt: Optional[str] = None
     schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
     systemPrompt: Optional[str] = None
-    agent: Optional[
+    agent: Optional[V1ExtractAgent] = None
 
-class
+class V1ScrapeParams(V1ScrapeOptions):
     """Parameters for scraping operations."""
-    extract: Optional[
-    jsonOptions: Optional[
-    actions: Optional[List[Union[
-    agent: Optional[
-    webhook: Optional[
+    extract: Optional[V1JsonConfig] = None
+    jsonOptions: Optional[V1JsonConfig] = None
+    actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None
+    agent: Optional[V1AgentOptions] = None
+    webhook: Optional[V1WebhookConfig] = None
 
-class
+class V1ScrapeResponse(V1FirecrawlDocument[T], Generic[T]):
     """Response from scraping operations."""
     success: bool = True
     warning: Optional[str] = None
     error: Optional[str] = None
 
-class
+class V1BatchScrapeResponse(pydantic.BaseModel):
     """Response from batch scrape operations."""
     id: Optional[str] = None
     url: Optional[str] = None
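Every request/response model above keeps its fields but gains a V1 prefix (ScrapeOptions becomes V1ScrapeOptions, WebhookConfig becomes V1WebhookConfig, and so on). A hedged illustration of what that means for code that builds these models directly, assuming the renamed classes can be imported from the relocated module firecrawl/v1/client.py (the export list in firecrawl/__init__.py is not shown in this diff):

    # 2.16.5: options models were imported without a prefix from firecrawl.firecrawl
    # 3.0.2: the same models live in the v1 module under V1* names (import path assumed)
    from firecrawl.v1.client import V1ScrapeOptions, V1LocationConfig

    opts = V1ScrapeOptions(
        formats=["markdown", "html"],
        onlyMainContent=True,
        location=V1LocationConfig(country="US"),
    )
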
@@ -239,7 +238,7 @@ class BatchScrapeResponse(pydantic.BaseModel):
     error: Optional[str] = None
     invalidURLs: Optional[List[str]] = None
 
-class
+class V1BatchScrapeStatusResponse(pydantic.BaseModel):
     """Response from batch scrape status checks."""
     success: bool = True
     status: Literal["scraping", "completed", "failed", "cancelled"]
@@ -248,9 +247,9 @@ class BatchScrapeStatusResponse(pydantic.BaseModel):
     creditsUsed: int
     expiresAt: datetime
     next: Optional[str] = None
-    data: List[
+    data: List[V1FirecrawlDocument]
 
-class
+class V1CrawlParams(pydantic.BaseModel):
     """Parameters for crawling operations."""
     includePaths: Optional[List[str]] = None
     excludePaths: Optional[List[str]] = None
@@ -258,10 +257,11 @@ class CrawlParams(pydantic.BaseModel):
     maxDiscoveryDepth: Optional[int] = None
     limit: Optional[int] = None
     allowBackwardLinks: Optional[bool] = None
+    crawlEntireDomain: Optional[bool] = None
     allowExternalLinks: Optional[bool] = None
     ignoreSitemap: Optional[bool] = None
-    scrapeOptions: Optional[
-    webhook: Optional[Union[str,
+    scrapeOptions: Optional[V1ScrapeOptions] = None
+    webhook: Optional[Union[str, V1WebhookConfig]] = None
     deduplicateSimilarURLs: Optional[bool] = None
     ignoreQueryParameters: Optional[bool] = None
     regexOnFullURL: Optional[bool] = None
@@ -269,14 +269,14 @@ class CrawlParams(pydantic.BaseModel):
     maxConcurrency: Optional[int] = None
     allowSubdomains: Optional[bool] = None
 
-class
+class V1CrawlResponse(pydantic.BaseModel):
     """Response from crawling operations."""
     id: Optional[str] = None
     url: Optional[str] = None
     success: bool = True
     error: Optional[str] = None
 
-class
+class V1CrawlStatusResponse(pydantic.BaseModel):
     """Response from crawl status checks."""
     success: bool = True
     status: Literal["scraping", "completed", "failed", "cancelled"]
@@ -285,30 +285,38 @@ class CrawlStatusResponse(pydantic.BaseModel):
     creditsUsed: int
     expiresAt: datetime
     next: Optional[str] = None
-    data: List[
+    data: List[V1FirecrawlDocument]
+
+class V1CrawlError(pydantic.BaseModel):
+    """A crawl error."""
+    id: str
+    timestamp: Optional[datetime] = None
+    url: str
+    code: Optional[str] = None
+    error: str
 
-class
+class V1CrawlErrorsResponse(pydantic.BaseModel):
     """Response from crawl/batch scrape error monitoring."""
-    errors: List[
+    errors: List[V1CrawlError]
     robotsBlocked: List[str]
 
-class
+class V1MapParams(pydantic.BaseModel):
     """Parameters for mapping operations."""
     search: Optional[str] = None
     ignoreSitemap: Optional[bool] = None
     includeSubdomains: Optional[bool] = None
     sitemapOnly: Optional[bool] = None
     limit: Optional[int] = None
-    timeout: Optional[int] =
+    timeout: Optional[int] = 30000
     useIndex: Optional[bool] = None
 
-class
+class V1MapResponse(pydantic.BaseModel):
     """Response from mapping operations."""
     success: bool = True
     links: Optional[List[str]] = None
     error: Optional[str] = None
 
-class
+class V1ExtractParams(pydantic.BaseModel):
     """Parameters for extracting information from URLs."""
     prompt: Optional[str] = None
     schema_field: Optional[Any] = pydantic.Field(None, alias='schema')
@@ -318,9 +326,9 @@ class ExtractParams(pydantic.BaseModel):
     includeSubdomains: Optional[bool] = None
     origin: Optional[str] = None
     showSources: Optional[bool] = None
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[V1ScrapeOptions] = None
 
-class
+class V1ExtractResponse(pydantic.BaseModel, Generic[T]):
     """Response from extract operations."""
     id: Optional[str] = None
     status: Optional[Literal["processing", "completed", "failed"]] = None
@@ -331,7 +339,7 @@ class ExtractResponse(pydantic.BaseModel, Generic[T]):
     warning: Optional[str] = None
     sources: Optional[Dict[Any, Any]] = None
 
-class
+class V1SearchParams(pydantic.BaseModel):
     query: str
     limit: Optional[int] = 5
     tbs: Optional[str] = None
@@ -341,16 +349,16 @@ class SearchParams(pydantic.BaseModel):
     location: Optional[str] = None
     origin: Optional[str] = "api"
     timeout: Optional[int] = 60000
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[V1ScrapeOptions] = None
 
-class
+class V1SearchResponse(pydantic.BaseModel):
     """Response from search operations."""
     success: bool = True
-    data: List[
+    data: List[V1FirecrawlDocument]
     warning: Optional[str] = None
     error: Optional[str] = None
 
-class
+class V1GenerateLLMsTextParams(pydantic.BaseModel):
     """
     Parameters for the LLMs.txt generation operation.
     """
@@ -359,7 +367,7 @@ class GenerateLLMsTextParams(pydantic.BaseModel):
     cache: Optional[bool] = True
     __experimental_stream: Optional[bool] = None
 
-class
+class V1DeepResearchParams(pydantic.BaseModel):
     """
     Parameters for the deep research operation.
     """
@@ -370,7 +378,7 @@ class DeepResearchParams(pydantic.BaseModel):
     systemPrompt: Optional[str] = None
     __experimental_streamSteps: Optional[bool] = None
 
-class
+class V1DeepResearchResponse(pydantic.BaseModel):
     """
     Response from the deep research operation.
     """
@@ -378,7 +386,7 @@ class DeepResearchResponse(pydantic.BaseModel):
     id: str
     error: Optional[str] = None
 
-class
+class V1DeepResearchStatusResponse(pydantic.BaseModel):
     """
     Status response from the deep research operation.
     """
@@ -393,25 +401,25 @@ class DeepResearchStatusResponse(pydantic.BaseModel):
     sources: List[Dict[str, Any]]
     summaries: List[str]
 
-class
+class V1GenerateLLMsTextResponse(pydantic.BaseModel):
     """Response from LLMs.txt generation operations."""
     success: bool = True
     id: str
     error: Optional[str] = None
 
-class
+class V1GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
     llmstxt: str
     llmsfulltxt: Optional[str] = None
 
-class
+class V1GenerateLLMsTextStatusResponse(pydantic.BaseModel):
     """Status response from LLMs.txt generation operations."""
     success: bool = True
-    data: Optional[
+    data: Optional[V1GenerateLLMsTextStatusResponseData] = None
     status: Literal["processing", "completed", "failed"]
     error: Optional[str] = None
     expiresAt: str
 
-class
+class V1SearchResponse(pydantic.BaseModel):
     """
     Response from the search operation.
     """
@@ -420,7 +428,7 @@ class SearchResponse(pydantic.BaseModel):
     warning: Optional[str] = None
     error: Optional[str] = None
 
-class
+class V1ExtractParams(pydantic.BaseModel):
     """
     Parameters for the extract operation.
     """
@@ -434,10 +442,25 @@ class ExtractParams(pydantic.BaseModel):
     show_sources: Optional[bool] = False
     agent: Optional[Dict[str, Any]] = None
 
-class
+class V1FirecrawlApp:
+    """
+    Legacy v1 Firecrawl client for backward compatibility.
+
+    This class provides the complete v1 API implementation including:
+    - URL scraping with various formats and options
+    - Website crawling with monitoring capabilities
+    - Batch scraping operations
+    - Search functionality
+    - Data extraction with LLM integration
+    - Deep research capabilities
+    - LLMs.txt generation
+
+    This is used by the unified client to provide version-specific access
+    through app.v1.method_name() patterns.
+    """
     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         """
-        Initialize the
+        Initialize the V1FirecrawlApp instance with API key, API URL.
 
         Args:
             api_key (Optional[str]): API key for authenticating with the Firecrawl API.
@@ -451,7 +474,7 @@ class FirecrawlApp:
             logger.warning("No API key provided for cloud service")
             raise ValueError('No API key provided')
 
-        logger.debug(f"Initialized
+        logger.debug(f"Initialized V1FirecrawlApp with API URL: {self.api_url}")
 
     def scrape_url(
         self,
@@ -463,23 +486,22 @@ class FirecrawlApp:
         exclude_tags: Optional[List[str]] = None,
         only_main_content: Optional[bool] = None,
         wait_for: Optional[int] = None,
-        timeout: Optional[int] =
-        location: Optional[
+        timeout: Optional[int] = 30000,
+        location: Optional[V1LocationConfig] = None,
         mobile: Optional[bool] = None,
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         parse_pdf: Optional[bool] = None,
-        extract: Optional[
-        json_options: Optional[
-        actions: Optional[List[Union[
-        change_tracking_options: Optional[
+        extract: Optional[V1JsonConfig] = None,
+        json_options: Optional[V1JsonConfig] = None,
+        actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
+        change_tracking_options: Optional[V1ChangeTrackingOptions] = None,
         max_age: Optional[int] = None,
         store_in_cache: Optional[bool] = None,
         zero_data_retention: Optional[bool] = None,
-
-        **kwargs) -> ScrapeResponse[Any]:
+        **kwargs) -> V1ScrapeResponse[Any]:
         """
         Scrape and extract content from a URL.
 
@@ -503,7 +525,6 @@ class FirecrawlApp:
             actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
             change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
             zero_data_retention (Optional[bool]): Whether to delete data after scrape is done
-            agent (Optional[AgentOptions]): Agent configuration for FIRE-1 model
 
 
         Returns:
@@ -576,8 +597,6 @@ class FirecrawlApp:
             scrape_params['storeInCache'] = store_in_cache
         if zero_data_retention is not None:
             scrape_params['zeroDataRetention'] = zero_data_retention
-        if agent is not None:
-            scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
 
         scrape_params.update(kwargs)
 
@@ -591,14 +610,14 @@ class FirecrawlApp:
             f'{self.api_url}/v1/scrape',
             headers=_headers,
             json=scrape_params,
-            timeout=(timeout +
+            timeout=(timeout / 1000.0 + 5 if timeout is not None else None)
         )
 
         if response.status_code == 200:
             try:
                 response_json = response.json()
                 if response_json.get('success') and 'data' in response_json:
-                    return
+                    return V1ScrapeResponse(**response_json['data'])
                 elif "error" in response_json:
                     raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}')
                 else:
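The hunk above also changes how the per-request timeout reaches requests: the scrape timeout parameter (milliseconds, now defaulting to 30000) is divided by 1000 and padded with 5 seconds, where the old call added a buffer to the raw millisecond value. A small sketch of that conversion rule, using a hypothetical helper name (the SDK inlines the expression; tests/test_timeout_conversion.py in the file list appears to cover this behaviour):

    def request_timeout_seconds(timeout_ms):
        # Firecrawl API timeouts are given in milliseconds; requests expects seconds.
        # Mirror the expression from the diff: ms / 1000.0 plus a 5-second buffer,
        # or no client-side timeout when none is set.
        return timeout_ms / 1000.0 + 5 if timeout_ms is not None else None

    assert request_timeout_seconds(30000) == 35.0
    assert request_timeout_seconds(None) is None
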
@@ -618,9 +637,9 @@ class FirecrawlApp:
|
|
|
618
637
|
lang: Optional[str] = None,
|
|
619
638
|
country: Optional[str] = None,
|
|
620
639
|
location: Optional[str] = None,
|
|
621
|
-
timeout: Optional[int] =
|
|
622
|
-
scrape_options: Optional[
|
|
623
|
-
**kwargs) ->
|
|
640
|
+
timeout: Optional[int] = 30000,
|
|
641
|
+
scrape_options: Optional[V1ScrapeOptions] = None,
|
|
642
|
+
**kwargs) -> V1SearchResponse:
|
|
624
643
|
"""
|
|
625
644
|
Search for content using Firecrawl.
|
|
626
645
|
|
|
@@ -675,7 +694,7 @@ class FirecrawlApp:
|
|
|
675
694
|
_integration = search_params.get('integration')
|
|
676
695
|
|
|
677
696
|
# Create final params object
|
|
678
|
-
final_params =
|
|
697
|
+
final_params = V1SearchParams(query=query, **search_params)
|
|
679
698
|
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
680
699
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
681
700
|
|
|
@@ -693,7 +712,7 @@ class FirecrawlApp:
|
|
|
693
712
|
try:
|
|
694
713
|
response_json = response.json()
|
|
695
714
|
if response_json.get('success') and 'data' in response_json:
|
|
696
|
-
return
|
|
715
|
+
return V1SearchResponse(**response_json)
|
|
697
716
|
elif "error" in response_json:
|
|
698
717
|
raise Exception(f'Search failed. Error: {response_json["error"]}')
|
|
699
718
|
else:
|
|
@@ -716,8 +735,8 @@ class FirecrawlApp:
|
|
|
716
735
|
crawl_entire_domain: Optional[bool] = None,
|
|
717
736
|
allow_external_links: Optional[bool] = None,
|
|
718
737
|
ignore_sitemap: Optional[bool] = None,
|
|
719
|
-
scrape_options: Optional[
|
|
720
|
-
webhook: Optional[Union[str,
|
|
738
|
+
scrape_options: Optional[V1ScrapeOptions] = None,
|
|
739
|
+
webhook: Optional[Union[str, V1WebhookConfig]] = None,
|
|
721
740
|
deduplicate_similar_urls: Optional[bool] = None,
|
|
722
741
|
ignore_query_parameters: Optional[bool] = None,
|
|
723
742
|
regex_on_full_url: Optional[bool] = None,
|
|
@@ -728,7 +747,7 @@ class FirecrawlApp:
|
|
|
728
747
|
poll_interval: Optional[int] = 2,
|
|
729
748
|
idempotency_key: Optional[str] = None,
|
|
730
749
|
**kwargs
|
|
731
|
-
) ->
|
|
750
|
+
) -> V1CrawlStatusResponse:
|
|
732
751
|
"""
|
|
733
752
|
Crawl a website starting from a URL.
|
|
734
753
|
|
|
@@ -812,7 +831,7 @@ class FirecrawlApp:
|
|
|
812
831
|
_integration = crawl_params.get('integration')
|
|
813
832
|
|
|
814
833
|
# Create final params object
|
|
815
|
-
final_params =
|
|
834
|
+
final_params = V1CrawlParams(**crawl_params)
|
|
816
835
|
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
817
836
|
params_dict['url'] = url
|
|
818
837
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
@@ -846,8 +865,8 @@ class FirecrawlApp:
|
|
|
846
865
|
crawl_entire_domain: Optional[bool] = None,
|
|
847
866
|
allow_external_links: Optional[bool] = None,
|
|
848
867
|
ignore_sitemap: Optional[bool] = None,
|
|
849
|
-
scrape_options: Optional[
|
|
850
|
-
webhook: Optional[Union[str,
|
|
868
|
+
scrape_options: Optional[V1ScrapeOptions] = None,
|
|
869
|
+
webhook: Optional[Union[str, V1WebhookConfig]] = None,
|
|
851
870
|
deduplicate_similar_urls: Optional[bool] = None,
|
|
852
871
|
ignore_query_parameters: Optional[bool] = None,
|
|
853
872
|
regex_on_full_url: Optional[bool] = None,
|
|
@@ -857,7 +876,7 @@ class FirecrawlApp:
|
|
|
857
876
|
zero_data_retention: Optional[bool] = None,
|
|
858
877
|
idempotency_key: Optional[str] = None,
|
|
859
878
|
**kwargs
|
|
860
|
-
) ->
|
|
879
|
+
) -> V1CrawlResponse:
|
|
861
880
|
"""
|
|
862
881
|
Start an asynchronous crawl job.
|
|
863
882
|
|
|
@@ -872,8 +891,8 @@ class FirecrawlApp:
|
|
|
872
891
|
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
873
892
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
874
893
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
875
|
-
scrape_options (Optional[
|
|
876
|
-
webhook (Optional[Union[str,
|
|
894
|
+
scrape_options (Optional[V1ScrapeOptions]): Page scraping configuration
|
|
895
|
+
webhook (Optional[Union[str, V1WebhookConfig]]): Notification webhook settings
|
|
877
896
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
878
897
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
879
898
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
@@ -885,7 +904,7 @@ class FirecrawlApp:
|
|
|
885
904
|
**kwargs: Additional parameters to pass to the API
|
|
886
905
|
|
|
887
906
|
Returns:
|
|
888
|
-
|
|
907
|
+
V1CrawlResponse with:
|
|
889
908
|
* success - Whether crawl started successfully
|
|
890
909
|
* id - Unique identifier for the crawl job
|
|
891
910
|
* url - Status check URL for the crawl
|
|
@@ -940,7 +959,7 @@ class FirecrawlApp:
|
|
|
940
959
|
crawl_params.update(kwargs)
|
|
941
960
|
|
|
942
961
|
# Create final params object
|
|
943
|
-
final_params =
|
|
962
|
+
final_params = V1CrawlParams(**crawl_params)
|
|
944
963
|
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
945
964
|
params_dict['url'] = url
|
|
946
965
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
@@ -951,13 +970,13 @@ class FirecrawlApp:
|
|
|
951
970
|
|
|
952
971
|
if response.status_code == 200:
|
|
953
972
|
try:
|
|
954
|
-
return
|
|
973
|
+
return V1CrawlResponse(**response.json())
|
|
955
974
|
except:
|
|
956
975
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
957
976
|
else:
|
|
958
977
|
self._handle_error(response, 'start crawl job')
|
|
959
978
|
|
|
960
|
-
def check_crawl_status(self, id: str) ->
|
|
979
|
+
def check_crawl_status(self, id: str) -> V1CrawlStatusResponse:
|
|
961
980
|
"""
|
|
962
981
|
Check the status and results of a crawl job.
|
|
963
982
|
|
|
@@ -965,7 +984,7 @@ class FirecrawlApp:
|
|
|
965
984
|
id: Unique identifier for the crawl job
|
|
966
985
|
|
|
967
986
|
Returns:
|
|
968
|
-
|
|
987
|
+
V1CrawlStatusResponse containing:
|
|
969
988
|
|
|
970
989
|
Status Information:
|
|
971
990
|
* status - Current state (scraping/completed/failed/cancelled)
|
|
@@ -1033,14 +1052,14 @@ class FirecrawlApp:
|
|
|
1033
1052
|
if 'next' in status_data:
|
|
1034
1053
|
response['next'] = status_data['next']
|
|
1035
1054
|
|
|
1036
|
-
return
|
|
1055
|
+
return V1CrawlStatusResponse(
|
|
1037
1056
|
success=False if 'error' in status_data else True,
|
|
1038
1057
|
**response
|
|
1039
1058
|
)
|
|
1040
1059
|
else:
|
|
1041
1060
|
self._handle_error(response, 'check crawl status')
|
|
1042
1061
|
|
|
1043
|
-
def check_crawl_errors(self, id: str) ->
|
|
1062
|
+
def check_crawl_errors(self, id: str) -> V1CrawlErrorsResponse:
|
|
1044
1063
|
"""
|
|
1045
1064
|
Returns information about crawl errors.
|
|
1046
1065
|
|
|
@@ -1048,7 +1067,7 @@ class FirecrawlApp:
|
|
|
1048
1067
|
id (str): The ID of the crawl job
|
|
1049
1068
|
|
|
1050
1069
|
Returns:
|
|
1051
|
-
|
|
1070
|
+
V1CrawlErrorsResponse containing:
|
|
1052
1071
|
* errors (List[Dict[str, str]]): List of errors with fields:
|
|
1053
1072
|
- id (str): Error ID
|
|
1054
1073
|
- timestamp (str): When the error occurred
|
|
@@ -1063,7 +1082,7 @@ class FirecrawlApp:
|
|
|
1063
1082
|
response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
|
|
1064
1083
|
if response.status_code == 200:
|
|
1065
1084
|
try:
|
|
1066
|
-
return
|
|
1085
|
+
return V1CrawlErrorsResponse(**response.json())
|
|
1067
1086
|
except:
|
|
1068
1087
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
1069
1088
|
else:
|
|
@@ -1107,8 +1126,8 @@ class FirecrawlApp:
|
|
|
1107
1126
|
crawl_entire_domain: Optional[bool] = None,
|
|
1108
1127
|
allow_external_links: Optional[bool] = None,
|
|
1109
1128
|
ignore_sitemap: Optional[bool] = None,
|
|
1110
|
-
scrape_options: Optional[
|
|
1111
|
-
webhook: Optional[Union[str,
|
|
1129
|
+
scrape_options: Optional[V1ScrapeOptions] = None,
|
|
1130
|
+
webhook: Optional[Union[str, V1WebhookConfig]] = None,
|
|
1112
1131
|
deduplicate_similar_urls: Optional[bool] = None,
|
|
1113
1132
|
ignore_query_parameters: Optional[bool] = None,
|
|
1114
1133
|
regex_on_full_url: Optional[bool] = None,
|
|
@@ -1118,7 +1137,7 @@ class FirecrawlApp:
|
|
|
1118
1137
|
zero_data_retention: Optional[bool] = None,
|
|
1119
1138
|
idempotency_key: Optional[str] = None,
|
|
1120
1139
|
**kwargs
|
|
1121
|
-
) -> '
|
|
1140
|
+
) -> 'V1CrawlWatcher':
|
|
1122
1141
|
"""
|
|
1123
1142
|
Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
|
|
1124
1143
|
|
|
@@ -1133,8 +1152,8 @@ class FirecrawlApp:
|
|
|
1133
1152
|
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
1134
1153
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
1135
1154
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
1136
|
-
scrape_options (Optional[
|
|
1137
|
-
webhook (Optional[Union[str,
|
|
1155
|
+
scrape_options (Optional[V1ScrapeOptions]): Page scraping configuration
|
|
1156
|
+
webhook (Optional[Union[str, V1WebhookConfig]]): Notification webhook settings
|
|
1138
1157
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
1139
1158
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
1140
1159
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
@@ -1146,7 +1165,7 @@ class FirecrawlApp:
|
|
|
1146
1165
|
**kwargs: Additional parameters to pass to the API
|
|
1147
1166
|
|
|
1148
1167
|
Returns:
|
|
1149
|
-
|
|
1168
|
+
V1CrawlWatcher: An instance to monitor the crawl job via WebSocket
|
|
1150
1169
|
|
|
1151
1170
|
Raises:
|
|
1152
1171
|
Exception: If crawl job fails to start
|
|
@@ -1159,6 +1178,7 @@ class FirecrawlApp:
|
|
|
1159
1178
|
max_discovery_depth=max_discovery_depth,
|
|
1160
1179
|
limit=limit,
|
|
1161
1180
|
allow_backward_links=allow_backward_links,
|
|
1181
|
+
crawl_entire_domain=crawl_entire_domain,
|
|
1162
1182
|
allow_external_links=allow_external_links,
|
|
1163
1183
|
ignore_sitemap=ignore_sitemap,
|
|
1164
1184
|
scrape_options=scrape_options,
|
|
@@ -1174,7 +1194,7 @@ class FirecrawlApp:
|
|
|
1174
1194
|
**kwargs
|
|
1175
1195
|
)
|
|
1176
1196
|
if crawl_response.success and crawl_response.id:
|
|
1177
|
-
return
|
|
1197
|
+
return V1CrawlWatcher(crawl_response.id, self)
|
|
1178
1198
|
else:
|
|
1179
1199
|
raise Exception("Crawl job failed to start")
|
|
1180
1200
|
|
|
@@ -1187,9 +1207,9 @@ class FirecrawlApp:
|
|
|
1187
1207
|
include_subdomains: Optional[bool] = None,
|
|
1188
1208
|
sitemap_only: Optional[bool] = None,
|
|
1189
1209
|
limit: Optional[int] = None,
|
|
1190
|
-
timeout: Optional[int] =
|
|
1210
|
+
timeout: Optional[int] = 30000,
|
|
1191
1211
|
use_index: Optional[bool] = None,
|
|
1192
|
-
**kwargs) ->
|
|
1212
|
+
**kwargs) -> V1MapResponse:
|
|
1193
1213
|
"""
|
|
1194
1214
|
Map and discover links from a URL.
|
|
1195
1215
|
|
|
@@ -1204,7 +1224,7 @@ class FirecrawlApp:
|
|
|
1204
1224
|
**kwargs: Additional parameters to pass to the API
|
|
1205
1225
|
|
|
1206
1226
|
Returns:
|
|
1207
|
-
|
|
1227
|
+
V1MapResponse: Response containing:
|
|
1208
1228
|
* success (bool): Whether request succeeded
|
|
1209
1229
|
* links (List[str]): Discovered URLs
|
|
1210
1230
|
* error (Optional[str]): Error message if any
|
|
@@ -1239,7 +1259,7 @@ class FirecrawlApp:
|
|
|
1239
1259
|
_integration = map_params.get('integration')
|
|
1240
1260
|
|
|
1241
1261
|
# Create final params object
|
|
1242
|
-
final_params =
|
|
1262
|
+
final_params = V1MapParams(**map_params)
|
|
1243
1263
|
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
1244
1264
|
params_dict['url'] = url
|
|
1245
1265
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
@@ -1258,7 +1278,7 @@ class FirecrawlApp:
|
|
|
1258
1278
|
try:
|
|
1259
1279
|
response_json = response.json()
|
|
1260
1280
|
if response_json.get('success') and 'links' in response_json:
|
|
1261
|
-
return
|
|
1281
|
+
return V1MapResponse(**response_json)
|
|
1262
1282
|
elif "error" in response_json:
|
|
1263
1283
|
raise Exception(f'Map failed. Error: {response_json["error"]}')
|
|
1264
1284
|
else:
|
|
@@ -1278,23 +1298,23 @@ class FirecrawlApp:
|
|
|
1278
1298
|
exclude_tags: Optional[List[str]] = None,
|
|
1279
1299
|
only_main_content: Optional[bool] = None,
|
|
1280
1300
|
wait_for: Optional[int] = None,
|
|
1281
|
-
timeout: Optional[int] =
|
|
1282
|
-
location: Optional[
|
|
1301
|
+
timeout: Optional[int] = 30000,
|
|
1302
|
+
location: Optional[V1LocationConfig] = None,
|
|
1283
1303
|
mobile: Optional[bool] = None,
|
|
1284
1304
|
skip_tls_verification: Optional[bool] = None,
|
|
1285
1305
|
remove_base64_images: Optional[bool] = None,
|
|
1286
1306
|
block_ads: Optional[bool] = None,
|
|
1287
1307
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
1288
|
-
extract: Optional[
|
|
1289
|
-
json_options: Optional[
|
|
1290
|
-
actions: Optional[List[Union[
|
|
1291
|
-
agent: Optional[
|
|
1308
|
+
extract: Optional[V1JsonConfig] = None,
|
|
1309
|
+
json_options: Optional[V1JsonConfig] = None,
|
|
1310
|
+
actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
|
|
1311
|
+
agent: Optional[V1AgentOptions] = None,
|
|
1292
1312
|
poll_interval: Optional[int] = 2,
|
|
1293
1313
|
max_concurrency: Optional[int] = None,
|
|
1294
1314
|
zero_data_retention: Optional[bool] = None,
|
|
1295
1315
|
idempotency_key: Optional[str] = None,
|
|
1296
1316
|
**kwargs
|
|
1297
|
-
) ->
|
|
1317
|
+
) -> V1BatchScrapeStatusResponse:
|
|
1298
1318
|
"""
|
|
1299
1319
|
Batch scrape multiple URLs and monitor until completion.
|
|
1300
1320
|
|
|
@@ -1323,7 +1343,7 @@ class FirecrawlApp:
|
|
|
1323
1343
|
**kwargs: Additional parameters to pass to the API
|
|
1324
1344
|
|
|
1325
1345
|
Returns:
|
|
1326
|
-
|
|
1346
|
+
V1BatchScrapeStatusResponse with:
|
|
1327
1347
|
* Scraping status and progress
|
|
1328
1348
|
* Scraped content for each URL
|
|
1329
1349
|
* Success/error information
|
|
@@ -1386,7 +1406,7 @@ class FirecrawlApp:
|
|
|
1386
1406
|
scrape_params.update(kwargs)
|
|
1387
1407
|
|
|
1388
1408
|
# Create final params object
|
|
1389
|
-
final_params =
|
|
1409
|
+
final_params = V1ScrapeParams(**scrape_params)
|
|
1390
1410
|
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
1391
1411
|
params_dict['urls'] = urls
|
|
1392
1412
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
@@ -1419,22 +1439,22 @@ class FirecrawlApp:
|
|
|
1419
1439
|
exclude_tags: Optional[List[str]] = None,
|
|
1420
1440
|
only_main_content: Optional[bool] = None,
|
|
1421
1441
|
wait_for: Optional[int] = None,
|
|
1422
|
-
timeout: Optional[int] =
|
|
1423
|
-
location: Optional[
|
|
1442
|
+
timeout: Optional[int] = 30000,
|
|
1443
|
+
location: Optional[V1LocationConfig] = None,
|
|
1424
1444
|
mobile: Optional[bool] = None,
|
|
1425
1445
|
skip_tls_verification: Optional[bool] = None,
|
|
1426
1446
|
remove_base64_images: Optional[bool] = None,
|
|
1427
1447
|
block_ads: Optional[bool] = None,
|
|
1428
1448
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
1429
|
-
extract: Optional[
|
|
1430
|
-
json_options: Optional[
|
|
1431
|
-
actions: Optional[List[Union[
|
|
1432
|
-
agent: Optional[
|
|
1449
|
+
extract: Optional[V1JsonConfig] = None,
|
|
1450
|
+
json_options: Optional[V1JsonConfig] = None,
|
|
1451
|
+
actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
|
|
1452
|
+
agent: Optional[V1AgentOptions] = None,
|
|
1433
1453
|
max_concurrency: Optional[int] = None,
|
|
1434
1454
|
idempotency_key: Optional[str] = None,
|
|
1435
1455
|
zero_data_retention: Optional[bool] = None,
|
|
1436
1456
|
**kwargs
|
|
1437
|
-
) ->
|
|
1457
|
+
) -> V1BatchScrapeResponse:
|
|
1438
1458
|
"""
|
|
1439
1459
|
Initiate a batch scrape job asynchronously.
|
|
1440
1460
|
|
|
@@ -1463,7 +1483,7 @@ class FirecrawlApp:
|
|
|
1463
1483
|
**kwargs: Additional parameters to pass to the API
|
|
1464
1484
|
|
|
1465
1485
|
Returns:
|
|
1466
|
-
|
|
1486
|
+
V1BatchScrapeResponse with:
|
|
1467
1487
|
* success - Whether job started successfully
|
|
1468
1488
|
* id - Unique identifier for the job
|
|
1469
1489
|
* url - Status check URL
|
|
@@ -1527,7 +1547,7 @@ class FirecrawlApp:
|
|
|
1527
1547
|
scrape_params.update(kwargs)
|
|
1528
1548
|
|
|
1529
1549
|
# Create final params object
|
|
1530
|
-
final_params =
|
|
1550
|
+
final_params = V1ScrapeParams(**scrape_params)
|
|
1531
1551
|
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
1532
1552
|
params_dict['urls'] = urls
|
|
1533
1553
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
@@ -1543,7 +1563,7 @@ class FirecrawlApp:
|
|
|
1543
1563
|
|
|
1544
1564
|
if response.status_code == 200:
|
|
1545
1565
|
try:
|
|
1546
|
-
return
|
|
1566
|
+
return V1BatchScrapeResponse(**response.json())
|
|
1547
1567
|
except:
|
|
1548
1568
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
1549
1569
|
else:
|
|
@@ -1559,22 +1579,22 @@ class FirecrawlApp:
|
|
|
1559
1579
|
exclude_tags: Optional[List[str]] = None,
|
|
1560
1580
|
only_main_content: Optional[bool] = None,
|
|
1561
1581
|
wait_for: Optional[int] = None,
|
|
1562
|
-
timeout: Optional[int] =
|
|
1563
|
-
location: Optional[
|
|
1582
|
+
timeout: Optional[int] = 30000,
|
|
1583
|
+
location: Optional[V1LocationConfig] = None,
|
|
1564
1584
|
mobile: Optional[bool] = None,
|
|
1565
1585
|
skip_tls_verification: Optional[bool] = None,
|
|
1566
1586
|
remove_base64_images: Optional[bool] = None,
|
|
1567
1587
|
block_ads: Optional[bool] = None,
|
|
1568
1588
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
1569
|
-
extract: Optional[
|
|
1570
|
-
json_options: Optional[
|
|
1571
|
-
actions: Optional[List[Union[
|
|
1572
|
-
agent: Optional[
|
|
1589
|
+
extract: Optional[V1JsonConfig] = None,
|
|
1590
|
+
json_options: Optional[V1JsonConfig] = None,
|
|
1591
|
+
actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
|
|
1592
|
+
agent: Optional[V1AgentOptions] = None,
|
|
1573
1593
|
max_concurrency: Optional[int] = None,
|
|
1574
1594
|
zero_data_retention: Optional[bool] = None,
|
|
1575
1595
|
idempotency_key: Optional[str] = None,
|
|
1576
1596
|
**kwargs
|
|
1577
|
-
) -> '
|
|
1597
|
+
) -> 'V1CrawlWatcher':
|
|
1578
1598
|
"""
|
|
1579
1599
|
Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.
|
|
1580
1600
|
|
|
@@ -1603,7 +1623,7 @@ class FirecrawlApp:
|
|
|
1603
1623
|
**kwargs: Additional parameters to pass to the API
|
|
1604
1624
|
|
|
1605
1625
|
Returns:
|
|
1606
|
-
|
|
1626
|
+
V1CrawlWatcher: An instance to monitor the batch scrape job via WebSocket
|
|
1607
1627
|
|
|
1608
1628
|
Raises:
|
|
1609
1629
|
Exception: If batch scrape job fails to start
|
|
@@ -1663,7 +1683,7 @@ class FirecrawlApp:
|
|
|
1663
1683
|
scrape_params.update(kwargs)
|
|
1664
1684
|
|
|
1665
1685
|
# Create final params object
|
|
1666
|
-
final_params =
|
|
1686
|
+
final_params = V1ScrapeParams(**scrape_params)
|
|
1667
1687
|
params_dict = final_params.dict(by_alias=True, exclude_none=True)
|
|
1668
1688
|
params_dict['urls'] = urls
|
|
1669
1689
|
params_dict['origin'] = f"python-sdk@{version}"
|
|
@@ -1679,9 +1699,9 @@ class FirecrawlApp:
|
|
|
1679
1699
|
|
|
1680
1700
|
if response.status_code == 200:
|
|
1681
1701
|
try:
|
|
1682
|
-
crawl_response =
|
|
1702
|
+
crawl_response = V1BatchScrapeResponse(**response.json())
|
|
1683
1703
|
if crawl_response.success and crawl_response.id:
|
|
1684
|
-
return
|
|
1704
|
+
return V1CrawlWatcher(crawl_response.id, self)
|
|
1685
1705
|
else:
|
|
1686
1706
|
raise Exception("Batch scrape job failed to start")
|
|
1687
1707
|
except:
|
|
@@ -1689,7 +1709,7 @@ class FirecrawlApp:
|
|
|
1689
1709
|
else:
|
|
1690
1710
|
self._handle_error(response, 'start batch scrape job')
|
|
1691
1711
|
|
|
1692
|
-
def check_batch_scrape_status(self, id: str) ->
|
|
1712
|
+
def check_batch_scrape_status(self, id: str) -> V1BatchScrapeStatusResponse:
|
|
1693
1713
|
"""
|
|
1694
1714
|
Check the status of a batch scrape job using the Firecrawl API.
|
|
1695
1715
|
|
|
@@ -1697,7 +1717,7 @@ class FirecrawlApp:
|
|
|
1697
1717
|
id (str): The ID of the batch scrape job.
|
|
1698
1718
|
|
|
1699
1719
|
Returns:
|
|
1700
|
-
|
|
1720
|
+
V1BatchScrapeStatusResponse: The status of the batch scrape job.
|
|
1701
1721
|
|
|
1702
1722
|
Raises:
|
|
1703
1723
|
Exception: If the status check request fails.
|
|
@@ -1737,7 +1757,7 @@ class FirecrawlApp:
|
|
|
1737
1757
|
break
|
|
1738
1758
|
status_data['data'] = data
|
|
1739
1759
|
|
|
1740
|
-
return
|
|
1760
|
+
return V1BatchScrapeStatusResponse(**{
|
|
1741
1761
|
'success': False if 'error' in status_data else True,
|
|
1742
1762
|
'status': status_data.get('status'),
|
|
1743
1763
|
'total': status_data.get('total'),
|
|
@@ -1751,7 +1771,7 @@ class FirecrawlApp:
|
|
|
1751
1771
|
else:
|
|
1752
1772
|
self._handle_error(response, 'check batch scrape status')
|
|
1753
1773
|
|
|
1754
|
-
def check_batch_scrape_errors(self, id: str) ->
|
|
1774
|
+
def check_batch_scrape_errors(self, id: str) -> V1CrawlErrorsResponse:
|
|
1755
1775
|
"""
|
|
1756
1776
|
Returns information about batch scrape errors.
|
|
1757
1777
|
|
|
@@ -1759,7 +1779,7 @@ class FirecrawlApp:
|
|
|
1759
1779
|
id (str): The ID of the crawl job.
|
|
1760
1780
|
|
|
1761
1781
|
Returns:
|
|
1762
|
-
|
|
1782
|
+
V1CrawlErrorsResponse containing:
|
|
1763
1783
|
* errors (List[Dict[str, str]]): List of errors with fields:
|
|
1764
1784
|
* id (str): Error ID
|
|
1765
1785
|
* timestamp (str): When the error occurred
|
|
@@ -1774,7 +1794,7 @@ class FirecrawlApp:
|
|
|
1774
1794
|
response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
|
|
1775
1795
|
if response.status_code == 200:
|
|
1776
1796
|
try:
|
|
1777
|
-
return
|
|
1797
|
+
return V1CrawlErrorsResponse(**response.json())
|
|
1778
1798
|
except:
|
|
1779
1799
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
1780
1800
|
else:
|
|
@@ -1791,7 +1811,7 @@ class FirecrawlApp:
|
|
|
1791
1811
|
enable_web_search: Optional[bool] = False,
|
|
1792
1812
|
show_sources: Optional[bool] = False,
|
|
1793
1813
|
agent: Optional[Dict[str, Any]] = None,
|
|
1794
|
-
**kwargs) ->
|
|
1814
|
+
**kwargs) -> V1ExtractResponse[Any]:
|
|
1795
1815
|
"""
|
|
1796
1816
|
Extract structured information from URLs.
|
|
1797
1817
|
|
|
@@ -1807,7 +1827,7 @@ class FirecrawlApp:
|
|
|
1807
1827
|
**kwargs: Additional parameters to pass to the API
|
|
1808
1828
|
|
|
1809
1829
|
Returns:
|
|
1810
|
-
|
|
1830
|
+
V1ExtractResponse[Any] with:
|
|
1811
1831
|
* success (bool): Whether request succeeded
|
|
1812
1832
|
* data (Optional[Any]): Extracted data matching schema
|
|
1813
1833
|
* error (Optional[str]): Error message if any
|
|
@@ -1879,7 +1899,7 @@ class FirecrawlApp:
|
|
|
1879
1899
|
except:
|
|
1880
1900
|
raise Exception(f'Failed to parse Firecrawl response as JSON.')
|
|
1881
1901
|
if status_data['status'] == 'completed':
|
|
1882
|
-
return
|
|
1902
|
+
return V1ExtractResponse(**status_data)
|
|
1883
1903
|
elif status_data['status'] in ['failed', 'cancelled']:
|
|
1884
1904
|
raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
|
|
1885
1905
|
else:
|
|
@@ -1893,9 +1913,9 @@ class FirecrawlApp:
|
|
|
1893
1913
|
except Exception as e:
|
|
1894
1914
|
raise ValueError(str(e), 500)
|
|
1895
1915
|
|
|
1896
|
-
return
|
|
1916
|
+
return V1ExtractResponse(success=False, error="Internal server error.")
|
|
1897
1917
|
|
|
1898
|
-
def get_extract_status(self, job_id: str) ->
|
|
1918
|
+
def get_extract_status(self, job_id: str) -> V1ExtractResponse[Any]:
|
|
1899
1919
|
"""
|
|
1900
1920
|
Retrieve the status of an extract job.
|
|
1901
1921
|
|
|
@@ -1913,7 +1933,7 @@ class FirecrawlApp:
|
|
|
1913
1933
|
 response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
 if response.status_code == 200:
 try:
-return
+return V1ExtractResponse(**response.json())
 except:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 else:
@@ -1931,7 +1951,7 @@ class FirecrawlApp:
 allow_external_links: Optional[bool] = False,
 enable_web_search: Optional[bool] = False,
 show_sources: Optional[bool] = False,
-agent: Optional[Dict[str, Any]] = None) ->
+agent: Optional[Dict[str, Any]] = None) -> V1ExtractResponse[Any]:
 """
 Initiate an asynchronous extract job.

@@ -1981,7 +2001,7 @@ class FirecrawlApp:
 response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
 if response.status_code == 200:
 try:
-return
+return V1ExtractResponse(**response.json())
 except:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 else:
@@ -1996,7 +2016,7 @@ class FirecrawlApp:
 max_urls: Optional[int] = None,
 show_full_text: Optional[bool] = None,
 cache: Optional[bool] = None,
-experimental_stream: Optional[bool] = None) ->
+experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextStatusResponse:
 """
 Generate LLMs.txt for a given URL and poll until completion.

@@ -2017,7 +2037,7 @@ class FirecrawlApp:
 Raises:
 Exception: If generation fails
 """
-params =
+params = V1GenerateLLMsTextParams(
 maxUrls=max_urls,
 showFullText=show_full_text,
 cache=cache,
@@ -2033,7 +2053,7 @@ class FirecrawlApp:
 )

 if not response.success or not response.id:
-return
+return V1GenerateLLMsTextStatusResponse(
 success=False,
 error='Failed to start LLMs.txt generation',
 status='failed',
@@ -2049,7 +2069,7 @@ class FirecrawlApp:
 elif status.status == 'failed':
 return status
 elif status.status != 'processing':
-return
+return V1GenerateLLMsTextStatusResponse(
 success=False,
 error='LLMs.txt generation job terminated unexpectedly',
 status='failed',
@@ -2065,7 +2085,7 @@ class FirecrawlApp:
 max_urls: Optional[int] = None,
 show_full_text: Optional[bool] = None,
 cache: Optional[bool] = None,
-experimental_stream: Optional[bool] = None) ->
+experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextResponse:
 """
 Initiate an asynchronous LLMs.txt generation operation.

@@ -2085,7 +2105,7 @@ class FirecrawlApp:
 Raises:
 Exception: If the generation job initiation fails.
 """
-params =
+params = V1GenerateLLMsTextParams(
 maxUrls=max_urls,
 showFullText=show_full_text,
 cache=cache,
@@ -2103,7 +2123,7 @@ class FirecrawlApp:
 print("response", response)
 if response.get('success'):
 try:
-return
+return V1GenerateLLMsTextResponse(**response)
 except:
 raise Exception('Failed to parse Firecrawl response as JSON.')
 else:
@@ -2111,12 +2131,12 @@ class FirecrawlApp:
 except Exception as e:
 raise ValueError(str(e))

-return
+return V1GenerateLLMsTextResponse(
 success=False,
 error='Internal server error'
 )

-def check_generate_llms_text_status(self, id: str) ->
+def check_generate_llms_text_status(self, id: str) -> V1GenerateLLMsTextStatusResponse:
 """
 Check the status of a LLMs.txt generation operation.

@@ -2142,7 +2162,7 @@ class FirecrawlApp:
 if response.status_code == 200:
 try:
 json_data = response.json()
-return
+return V1GenerateLLMsTextStatusResponse(**json_data)
 except Exception as e:
 raise Exception(f'Failed to parse Firecrawl response as GenerateLLMsTextStatusResponse: {str(e)}')
 elif response.status_code == 404:
@@ -2152,7 +2172,7 @@ class FirecrawlApp:
 except Exception as e:
 raise ValueError(str(e))

-return
+return V1GenerateLLMsTextStatusResponse(success=False, error='Internal server error', status='failed', expiresAt='')

 def _prepare_headers(
 self,
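These hunks retype the synchronous LLMs.txt helpers to the V1-prefixed models. A minimal sketch of how the retyped status check reads from client code; the import path is an assumption, and the V1FirecrawlApp constructor keywords are inferred from the super().__init__ call that appears later in this diff:

    from firecrawl.v1.client import V1FirecrawlApp, V1GenerateLLMsTextStatusResponse  # import path assumed

    app = V1FirecrawlApp(api_key="fc-YOUR-KEY")
    status: V1GenerateLLMsTextStatusResponse = app.check_generate_llms_text_status("job-id")
    if status.status == "completed":
        print(status.data)                        # generated llms.txt content, per the status model
    elif status.status == "failed":
        print("generation failed:", status.error)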
@@ -2202,7 +2222,7 @@ class FirecrawlApp:
 requests.RequestException: If the request fails after the specified retries.
 """
 for attempt in range(retries):
-response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] +
+response = requests.post(url, headers=headers, json=data, timeout=((data["timeout"] / 1000.0 + 5) if "timeout" in data and data["timeout"] is not None else None))
 if response.status_code == 502:
 time.sleep(backoff_factor * (2 ** attempt))
 else:
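The retry helper now derives the HTTP-level timeout from the request payload: the API-level timeout is given in milliseconds, so it is divided by 1000 and padded with five seconds before being handed to requests.post, and omitted entirely when no timeout was supplied. A minimal sketch of that conversion:

    from typing import Any, Dict, Optional

    def http_timeout(data: Dict[str, Any]) -> Optional[float]:
        # Mirrors the expression above: milliseconds -> seconds, plus a 5 s buffer.
        if "timeout" in data and data["timeout"] is not None:
            return data["timeout"] / 1000.0 + 5   # e.g. 30000 ms -> 35.0 s
        return None                               # no payload timeout: let requests wait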
@@ -2271,7 +2291,7 @@ class FirecrawlApp:
 self,
 id: str,
 headers: Dict[str, str],
-poll_interval: int) ->
+poll_interval: int) -> V1CrawlStatusResponse:
 """
 Monitor the status of a crawl job until completion.

@@ -2308,7 +2328,7 @@ class FirecrawlApp:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 data.extend(status_data.get('data', []))
 status_data['data'] = data
-return
+return V1CrawlStatusResponse(**status_data)
 else:
 raise Exception('Crawl job completed but no data was returned')
 elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
@@ -2393,7 +2413,7 @@ class FirecrawlApp:
 system_prompt: Optional[str] = None,
 __experimental_stream_steps: Optional[bool] = None,
 on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
-on_source: Optional[Callable[[Dict[str, Any]], None]] = None) ->
+on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> V1DeepResearchStatusResponse:
 """
 Initiates a deep research operation on a given query and polls until completion.

@@ -2435,7 +2455,7 @@ class FirecrawlApp:
 research_params['systemPrompt'] = system_prompt
 if __experimental_stream_steps is not None:
 research_params['__experimental_streamSteps'] = __experimental_stream_steps
-research_params =
+research_params = V1DeepResearchParams(**research_params)

 response = self.async_deep_research(
 query,
@@ -2522,7 +2542,7 @@ class FirecrawlApp:
 research_params['systemPrompt'] = system_prompt
 if __experimental_stream_steps is not None:
 research_params['__experimental_streamSteps'] = __experimental_stream_steps
-research_params =
+research_params = V1DeepResearchParams(**research_params)

 headers = self._prepare_headers()

@@ -2549,7 +2569,7 @@ class FirecrawlApp:

 return {'success': False, 'error': 'Internal server error'}

-def check_deep_research_status(self, id: str) ->
+def check_deep_research_status(self, id: str) -> V1DeepResearchStatusResponse:
 """
 Check the status of a deep research operation.

@@ -2610,7 +2630,7 @@ class FirecrawlApp:
 method_params = {
 "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
 "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
-"block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options", "max_age", "
+"block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options", "max_age", "integration"},
 "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options", "integration"},
 "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
 "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
@@ -2660,19 +2680,19 @@ class FirecrawlApp:
 return [self._ensure_schema_dict(v) for v in schema]
 return schema

-class
+class V1CrawlWatcher:
 """
 A class to watch and handle crawl job events via WebSocket connection.

 Attributes:
 id (str): The ID of the crawl job to watch
-app (
+app (V1FirecrawlApp): The V1FirecrawlApp instance
 data (List[Dict[str, Any]]): List of crawled documents/data
 status (str): Current status of the crawl job
 ws_url (str): WebSocket URL for the crawl job
 event_handlers (dict): Dictionary of event type to list of handler functions
 """
-def __init__(self, id: str, app:
+def __init__(self, id: str, app: V1FirecrawlApp):
 self.id = id
 self.app = app
 self.data: List[Dict[str, Any]] = []
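The renamed V1CrawlWatcher keeps the WebSocket event model described above: per-event handler lists dispatched as documents arrive. A sketch of subscribing to events, assuming the watcher still exposes an add_event_listener helper as in earlier SDK releases (only the constructor signature and the attributes listed above are confirmed by this diff):

    from firecrawl.v1.client import V1FirecrawlApp, V1CrawlWatcher  # import path assumed

    app = V1FirecrawlApp(api_key="fc-YOUR-KEY")
    watcher = V1CrawlWatcher("crawl-job-id", app)
    # add_event_listener is assumed from earlier releases; it is not shown in these hunks.
    watcher.add_event_listener("document", lambda ev: print("page:", ev["id"]))
    watcher.add_event_listener("done", lambda ev: print("collected", len(watcher.data), "documents"))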
@@ -2751,12 +2771,16 @@ class CrawlWatcher:
 self.data.append(msg['data'])
 self.dispatch_event('document', {'data': msg['data'], 'id': self.id})

-class
+class AsyncV1FirecrawlApp(V1FirecrawlApp):
 """
-Asynchronous version of
-Provides non-blocking alternatives to all
+Asynchronous version of V1FirecrawlApp that implements async methods using aiohttp.
+Provides non-blocking alternatives to all V1FirecrawlApp operations.
 """

+def __init__(self, api_key: str, api_url: str = "https://api.firecrawl.dev"):
+# Reuse V1 helpers (_prepare_headers, _validate_kwargs, _ensure_schema_dict, _get_error_message)
+super().__init__(api_key=api_key, api_url=api_url)
+
 async def _async_request(
 self,
 method: str,
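The new explicit constructor on AsyncV1FirecrawlApp simply delegates to V1FirecrawlApp so the shared helpers stay available. A sketch of instantiating it and calling one of the async methods retyped below (the constructor keywords are taken from the hunk above; the import path is an assumption):

    import asyncio
    from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed

    async def main() -> None:
        app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")        # api_url defaults to https://api.firecrawl.dev
        status = await app.check_crawl_status("crawl-job-id")   # returns a V1CrawlStatusResponse
        print(status.status, status.completed, "/", status.total)

    asyncio.run(main())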
@@ -2892,14 +2916,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
 async def crawl_url_and_watch(
 self,
 url: str,
-params: Optional[
-idempotency_key: Optional[str] = None) -> '
+params: Optional[V1CrawlParams] = None,
+idempotency_key: Optional[str] = None) -> 'AsyncV1CrawlWatcher':
 """
-Initiate an async crawl job and return an
+Initiate an async crawl job and return an AsyncV1CrawlWatcher to monitor progress via WebSocket.

 Args:
 url (str): Target URL to start crawling from
-params (Optional[
+params (Optional[V1CrawlParams]): See V1CrawlParams model for configuration:
 URL Discovery:
 * includePaths - Patterns of URLs to include
 * excludePaths - Patterns of URLs to exclude
@@ -2922,28 +2946,28 @@ class AsyncFirecrawlApp(FirecrawlApp):
 idempotency_key (Optional[str]): Unique key to prevent duplicate requests

 Returns:
-
+AsyncV1CrawlWatcher: An instance to monitor the crawl job via WebSocket

 Raises:
 Exception: If crawl job fails to start
 """
 crawl_response = await self.async_crawl_url(url, params, idempotency_key)
 if crawl_response.get('success') and 'id' in crawl_response:
-return
+return AsyncV1CrawlWatcher(crawl_response['id'], self)
 else:
 raise Exception("Crawl job failed to start")

 async def batch_scrape_urls_and_watch(
 self,
 urls: List[str],
-params: Optional[
-idempotency_key: Optional[str] = None) -> '
+params: Optional[V1ScrapeParams] = None,
+idempotency_key: Optional[str] = None) -> 'AsyncV1CrawlWatcher':
 """
-Initiate an async batch scrape job and return an
+Initiate an async batch scrape job and return an AsyncV1CrawlWatcher to monitor progress.

 Args:
 urls (List[str]): List of URLs to scrape
-params (Optional[
+params (Optional[V1ScrapeParams]): See V1ScrapeParams model for configuration:

 Content Options:
 * formats - Content formats to retrieve
@@ -2964,14 +2988,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
 idempotency_key (Optional[str]): Unique key to prevent duplicate requests

 Returns:
-
+AsyncV1CrawlWatcher: An instance to monitor the batch scrape job via WebSocket

 Raises:
 Exception: If batch scrape job fails to start
 """
 batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key)
 if batch_response.get('success') and 'id' in batch_response:
-return
+return AsyncV1CrawlWatcher(batch_response['id'], self)
 else:
 raise Exception("Batch scrape job failed to start")

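Taken together with the watcher class at the end of this file, the intended usage of crawl_url_and_watch is roughly the following (a sketch; the method, its return type, and the watcher constructor are confirmed above, while add_event_listener is assumed from earlier releases and the import path is an assumption):

    import asyncio
    from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed

    async def watch_crawl() -> None:
        app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
        watcher = await app.crawl_url_and_watch("https://example.com")
        # add_event_listener is assumed; connect() is defined on AsyncV1CrawlWatcher below.
        watcher.add_event_listener("document", lambda ev: print("scraped:", ev["id"]))
        await watcher.connect()
        print("pages collected:", len(watcher.data))

    asyncio.run(watch_crawl())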
@@ -2985,19 +3009,18 @@ class AsyncFirecrawlApp(FirecrawlApp):
 exclude_tags: Optional[List[str]] = None,
 only_main_content: Optional[bool] = None,
 wait_for: Optional[int] = None,
-timeout: Optional[int] =
-location: Optional[
+timeout: Optional[int] = 30000,
+location: Optional[V1LocationConfig] = None,
 mobile: Optional[bool] = None,
 skip_tls_verification: Optional[bool] = None,
 remove_base64_images: Optional[bool] = None,
 block_ads: Optional[bool] = None,
 proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
 parse_pdf: Optional[bool] = None,
-extract: Optional[
-json_options: Optional[
-actions: Optional[List[Union[
-
-**kwargs) -> ScrapeResponse[Any]:
+extract: Optional[V1JsonConfig] = None,
+json_options: Optional[V1JsonConfig] = None,
+actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
+**kwargs) -> V1ScrapeResponse[Any]:
 """
 Scrape a single URL asynchronously.

@@ -3010,20 +3033,19 @@ class AsyncFirecrawlApp(FirecrawlApp):
 only_main_content (Optional[bool]): Extract main content only
 wait_for (Optional[int]): Wait for a specific element to appear
 timeout (Optional[int]): Request timeout (ms)
-location (Optional[
+location (Optional[V1LocationConfig]): Location configuration
 mobile (Optional[bool]): Use mobile user agent
 skip_tls_verification (Optional[bool]): Skip TLS verification
 remove_base64_images (Optional[bool]): Remove base64 images
 block_ads (Optional[bool]): Block ads
 proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
-extract (Optional[
-json_options (Optional[
-actions (Optional[List[Union[
-agent (Optional[AgentOptions]): Agent configuration for FIRE-1 model
+extract (Optional[V1JsonConfig]): Content extraction settings
+json_options (Optional[V1JsonConfig]): JSON extraction settings
+actions (Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]]): Actions to perform
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1ScrapeResponse with:
 * success - Whether scrape was successful
 * markdown - Markdown content if requested
 * html - HTML content if requested
@@ -3089,8 +3111,6 @@ class AsyncFirecrawlApp(FirecrawlApp):
 scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(by_alias=True, exclude_none=True)
 if actions:
 scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(by_alias=True, exclude_none=True) for action in actions]
-if agent is not None:
-scrape_params['agent'] = agent.dict(by_alias=True, exclude_none=True)
 if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
 scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
 if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
@@ -3105,7 +3125,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 )

 if response.get('success') and 'data' in response:
-return
+return V1ScrapeResponse(**response['data'])
 elif "error" in response:
 raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
 else:
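A sketch of calling the async scrape with the renamed option models. The 30000 ms default timeout and the V1ScrapeResponse fields follow the hunks above; the method name follows the "scrape_url" key in the parameter validation table earlier in this file, and the import path is an assumption:

    import asyncio
    from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed

    async def scrape_once() -> None:
        app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
        page = await app.scrape_url(
            "https://example.com",
            formats=["markdown"],        # keyword taken from the scrape_url parameter set
            only_main_content=True,
            timeout=30000,               # milliseconds, per the docstring
        )
        if page.success:
            print(page.markdown[:200])

    asyncio.run(scrape_once())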
@@ -3123,21 +3143,21 @@ class AsyncFirecrawlApp(FirecrawlApp):
 exclude_tags: Optional[List[str]] = None,
 only_main_content: Optional[bool] = None,
 wait_for: Optional[int] = None,
-timeout: Optional[int] =
-location: Optional[
+timeout: Optional[int] = 30000,
+location: Optional[V1LocationConfig] = None,
 mobile: Optional[bool] = None,
 skip_tls_verification: Optional[bool] = None,
 remove_base64_images: Optional[bool] = None,
 block_ads: Optional[bool] = None,
 proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
-extract: Optional[
-json_options: Optional[
-actions: Optional[List[Union[
-agent: Optional[
+extract: Optional[V1JsonConfig] = None,
+json_options: Optional[V1JsonConfig] = None,
+actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
+agent: Optional[V1AgentOptions] = None,
 poll_interval: Optional[int] = 2,
 idempotency_key: Optional[str] = None,
 **kwargs
-) ->
+) -> V1BatchScrapeStatusResponse:
 """
 Asynchronously scrape multiple URLs and monitor until completion.

@@ -3165,7 +3185,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1BatchScrapeStatusResponse with:
 * Scraping status and progress
 * Scraped content for each URL
 * Success/error information
@@ -3224,7 +3244,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 scrape_params.update(kwargs)

 # Create final params object
-final_params =
+final_params = V1ScrapeParams(**scrape_params)
 params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['urls'] = urls
 params_dict['origin'] = f"python-sdk@{version}"
@@ -3262,21 +3282,21 @@ class AsyncFirecrawlApp(FirecrawlApp):
 exclude_tags: Optional[List[str]] = None,
 only_main_content: Optional[bool] = None,
 wait_for: Optional[int] = None,
-timeout: Optional[int] =
-location: Optional[
+timeout: Optional[int] = 30000,
+location: Optional[V1LocationConfig] = None,
 mobile: Optional[bool] = None,
 skip_tls_verification: Optional[bool] = None,
 remove_base64_images: Optional[bool] = None,
 block_ads: Optional[bool] = None,
 proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
-extract: Optional[
-json_options: Optional[
-actions: Optional[List[Union[
-agent: Optional[
+extract: Optional[V1JsonConfig] = None,
+json_options: Optional[V1JsonConfig] = None,
+actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
+agent: Optional[V1AgentOptions] = None,
 zero_data_retention: Optional[bool] = None,
 idempotency_key: Optional[str] = None,
 **kwargs
-) ->
+) -> V1BatchScrapeResponse:
 """
 Initiate a batch scrape job asynchronously.

@@ -3304,7 +3324,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1BatchScrapeResponse with:
 * success - Whether job started successfully
 * id - Unique identifier for the job
 * url - Status check URL
@@ -3366,7 +3386,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 scrape_params.update(kwargs)

 # Create final params object
-final_params =
+final_params = V1ScrapeParams(**scrape_params)
 params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['urls'] = urls
 params_dict['origin'] = f"python-sdk@{version}"
@@ -3386,7 +3406,7 @@ class AsyncFirecrawlApp(FirecrawlApp):

 if response.get('status_code') == 200:
 try:
-return
+return V1BatchScrapeResponse(**response.json())
 except:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 else:
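A sketch of starting a batch scrape and checking it, using the method names that appear in the surrounding hunks (async_batch_scrape_urls, check_batch_scrape_status); the import path and the exact response-field access are assumptions beyond what the docstrings state:

    import asyncio
    from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed

    async def batch() -> None:
        app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
        job = await app.async_batch_scrape_urls(["https://example.com", "https://example.org"])
        status = await app.check_batch_scrape_status(job.id)   # V1BatchScrapeStatusResponse
        print(status.status, status.completed, "/", status.total)

    asyncio.run(batch())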
@@ -3405,8 +3425,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 crawl_entire_domain: Optional[bool] = None,
 allow_external_links: Optional[bool] = None,
 ignore_sitemap: Optional[bool] = None,
-scrape_options: Optional[
-webhook: Optional[Union[str,
+scrape_options: Optional[V1ScrapeOptions] = None,
+webhook: Optional[Union[str, V1WebhookConfig]] = None,
 deduplicate_similar_urls: Optional[bool] = None,
 ignore_query_parameters: Optional[bool] = None,
 regex_on_full_url: Optional[bool] = None,
@@ -3415,7 +3435,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 poll_interval: Optional[int] = 2,
 idempotency_key: Optional[str] = None,
 **kwargs
-) ->
+) -> V1CrawlStatusResponse:
 """
 Crawl a website starting from a URL.

@@ -3430,8 +3450,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 crawl_entire_domain (Optional[bool]): Follow parent directory links
 allow_external_links (Optional[bool]): Follow external domain links
 ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-scrape_options (Optional[
-webhook (Optional[Union[str,
+scrape_options (Optional[V1ScrapeOptions]): Page scraping configuration
+webhook (Optional[Union[str, V1WebhookConfig]]): Notification webhook settings
 deduplicate_similar_urls (Optional[bool]): Remove similar URLs
 ignore_query_parameters (Optional[bool]): Ignore URL parameters
 regex_on_full_url (Optional[bool]): Apply regex to full URLs
@@ -3442,7 +3462,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1CrawlStatusResponse with:
 * Crawling status and progress
 * Crawled page contents
 * Success/error information
@@ -3493,7 +3513,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 crawl_params.update(kwargs)

 # Create final params object
-final_params =
+final_params = V1CrawlParams(**crawl_params)
 params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['url'] = url
 params_dict['origin'] = f"python-sdk@{version}"
@@ -3525,8 +3545,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 crawl_entire_domain: Optional[bool] = None,
 allow_external_links: Optional[bool] = None,
 ignore_sitemap: Optional[bool] = None,
-scrape_options: Optional[
-webhook: Optional[Union[str,
+scrape_options: Optional[V1ScrapeOptions] = None,
+webhook: Optional[Union[str, V1WebhookConfig]] = None,
 deduplicate_similar_urls: Optional[bool] = None,
 ignore_query_parameters: Optional[bool] = None,
 regex_on_full_url: Optional[bool] = None,
@@ -3535,7 +3555,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 poll_interval: Optional[int] = 2,
 idempotency_key: Optional[str] = None,
 **kwargs
-) ->
+) -> V1CrawlResponse:
 """
 Start an asynchronous crawl job.

@@ -3559,7 +3579,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 **kwargs: Additional parameters to pass to the API

 Returns:
-
+V1CrawlResponse with:
 * success - Whether crawl started successfully
 * id - Unique identifier for the crawl job
 * url - Status check URL for the crawl
@@ -3608,7 +3628,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 crawl_params.update(kwargs)

 # Create final params object
-final_params =
+final_params = V1CrawlParams(**crawl_params)
 params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['url'] = url
 params_dict['origin'] = f"python-sdk@{version}"
@@ -3623,13 +3643,13 @@ class AsyncFirecrawlApp(FirecrawlApp):

 if response.get('success'):
 try:
-return
+return V1CrawlResponse(**response)
 except:
 raise Exception(f'Failed to parse Firecrawl response as JSON.')
 else:
 await self._handle_error(response, 'start crawl job')

-async def check_crawl_status(self, id: str) ->
+async def check_crawl_status(self, id: str) -> V1CrawlStatusResponse:
 """
 Check the status and results of an asynchronous crawl job.

@@ -3637,7 +3657,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 id (str): Unique identifier for the crawl job

 Returns:
-
+V1CrawlStatusResponse containing:
 Status Information:
 * status - Current state (scraping/completed/failed/cancelled)
 * completed - Number of pages crawled
@@ -3676,8 +3696,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
 data.extend(next_data.get('data', []))
 status_data = next_data
 status_data['data'] = data
-# Create
-response =
+# Create V1CrawlStatusResponse object from status data
+response = V1CrawlStatusResponse(
 status=status_data.get('status'),
 total=status_data.get('total'),
 completed=status_data.get('completed'),
@@ -3695,7 +3715,7 @@ class AsyncFirecrawlApp(FirecrawlApp):

 return response

-async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) ->
+async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> V1CrawlStatusResponse:
 """
 Monitor the status of an asynchronous job until completion.

@@ -3705,7 +3725,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 poll_interval (int): Seconds between status checks (default: 2)

 Returns:
-
+V1CrawlStatusResponse: The job results if completed successfully

 Raises:
 Exception: If the job fails or an error occurs during status checks
@@ -3730,7 +3750,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 data.extend(next_data.get('data', []))
 status_data = next_data
 status_data['data'] = data
-return
+return V1CrawlStatusResponse(**status_data)
 else:
 raise Exception('Job completed but no data was returned')
 elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
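A sketch of the fire-and-poll pattern these hunks describe. async_crawl_url and check_crawl_status are confirmed above; the crawl keyword follows the crawl_url parameter set earlier in this file, and the import path is an assumption:

    import asyncio
    from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed

    async def crawl_and_poll() -> None:
        app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
        started = await app.async_crawl_url("https://example.com", limit=5)  # V1CrawlResponse
        while True:
            status = await app.check_crawl_status(started.id)                # V1CrawlStatusResponse
            if status.status in ("completed", "failed", "cancelled"):
                break
            await asyncio.sleep(2)
        print(status.status, "-", status.completed, "pages")

    asyncio.run(crawl_and_poll())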
@@ -3747,14 +3767,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
 include_subdomains: Optional[bool] = None,
 sitemap_only: Optional[bool] = None,
 limit: Optional[int] = None,
-timeout: Optional[int] =
-params: Optional[
+timeout: Optional[int] = 30000,
+params: Optional[V1MapParams] = None) -> V1MapResponse:
 """
 Asynchronously map and discover links from a URL.

 Args:
 url (str): Target URL to map
-params (Optional[
+params (Optional[V1MapParams]): See V1MapParams model:
 Discovery Options:
 * search - Filter pattern for URLs
 * ignoreSitemap - Skip sitemap.xml
@@ -3766,7 +3786,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 * timeout - Request timeout (ms)

 Returns:
-
+V1MapResponse with:
 * Discovered URLs
 * Success/error status

@@ -3792,7 +3812,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 map_params['timeout'] = timeout

 # Create final params object
-final_params =
+final_params = V1MapParams(**map_params)
 params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['url'] = url
 params_dict['origin'] = f"python-sdk@{version}"
@@ -3806,7 +3826,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 )

 if response.get('success') and 'links' in response:
-return
+return V1MapResponse(**response)
 elif 'error' in response:
 raise Exception(f'Failed to map URL. Error: {response["error"]}')
 else:
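A sketch of link discovery with the renamed map models. The keyword names and the 'links' payload follow the hunks above; the method name map_url is an assumption carried over from the v1 SDK, as is the import path:

    import asyncio
    from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed

    async def discover_links() -> None:
        app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
        result = await app.map_url("https://example.com", limit=100)  # method name assumed
        if result.success:
            for link in result.links[:10]:   # 'links' is checked in the response handling above
                print(link)

    asyncio.run(discover_links())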
@@ -3822,7 +3842,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 allow_external_links: Optional[bool] = False,
 enable_web_search: Optional[bool] = False,
 show_sources: Optional[bool] = False,
-agent: Optional[Dict[str, Any]] = None) ->
+agent: Optional[Dict[str, Any]] = None) -> V1ExtractResponse[Any]:

 """
 Asynchronously extract structured information from URLs.
@@ -3838,7 +3858,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 agent (Optional[Dict[str, Any]]): Agent configuration

 Returns:
-
+V1ExtractResponse with:
 * Structured data matching schema
 * Source information if requested
 * Success/error status
@@ -3893,7 +3913,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 )

 if status_data['status'] == 'completed':
-return
+return V1ExtractResponse(**status_data)
 elif status_data['status'] in ['failed', 'cancelled']:
 raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')

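A sketch of structured extraction against the renamed response model. Only the V1ExtractResponse return type and the option names in the docstring are confirmed by these hunks; the method name extract, the prompt/schema keywords, and the import path are assumptions carried over from the v1 SDK:

    import asyncio
    from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed

    async def pull_structured_data() -> None:
        app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
        result = await app.extract(                               # method name assumed
            ["https://example.com/pricing"],
            prompt="List the plan names and monthly prices.",     # keyword assumed
            schema={"type": "object", "properties": {"plans": {"type": "array"}}},
        )
        if result.success:
            print(result.data)

    asyncio.run(pull_structured_data())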
@@ -3901,7 +3921,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 else:
 raise Exception(f'Failed to extract. Error: {response.get("error")}')

-async def check_batch_scrape_status(self, id: str) ->
+async def check_batch_scrape_status(self, id: str) -> V1BatchScrapeStatusResponse:
 """
 Check the status of an asynchronous batch scrape job.

@@ -3909,7 +3929,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 id (str): The ID of the batch scrape job

 Returns:
-
+V1BatchScrapeStatusResponse containing:
 Status Information:
 * status - Current state (scraping/completed/failed/cancelled)
 * completed - Number of URLs scraped
@@ -3949,7 +3969,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 status_data = next_data
 status_data['data'] = data

-response =
+response = V1BatchScrapeStatusResponse(
 status=status_data.get('status'),
 total=status_data.get('total'),
 completed=status_data.get('completed'),
@@ -3969,7 +3989,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 **response
 }

-async def check_batch_scrape_errors(self, id: str) ->
+async def check_batch_scrape_errors(self, id: str) -> V1CrawlErrorsResponse:
 """
 Get information about errors from an asynchronous batch scrape job.

@@ -3977,7 +3997,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 id (str): The ID of the batch scrape job

 Returns:
-
+V1CrawlErrorsResponse containing:
 errors (List[Dict[str, str]]): List of errors with fields:
 * id (str): Error ID
 * timestamp (str): When the error occurred
@@ -3994,7 +4014,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 headers
 )

-async def check_crawl_errors(self, id: str) ->
+async def check_crawl_errors(self, id: str) -> V1CrawlErrorsResponse:
 """
 Get information about errors from an asynchronous crawl job.

@@ -4002,7 +4022,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 id (str): The ID of the crawl job

 Returns:
-
+V1CrawlErrorsResponse containing:
 * errors (List[Dict[str, str]]): List of errors with fields:
 - id (str): Error ID
 - timestamp (str): When the error occurred
@@ -4039,7 +4059,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response:
 return await response.json()

-async def get_extract_status(self, job_id: str) ->
+async def get_extract_status(self, job_id: str) -> V1ExtractResponse[Any]:
 """
 Check the status of an asynchronous extraction job.

@@ -4047,7 +4067,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 job_id (str): The ID of the extraction job

 Returns:
-
+V1ExtractResponse[Any] with:
 * success (bool): Whether request succeeded
 * data (Optional[Any]): Extracted data matching schema
 * error (Optional[str]): Error message if any
@@ -4076,7 +4096,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 allow_external_links: Optional[bool] = False,
 enable_web_search: Optional[bool] = False,
 show_sources: Optional[bool] = False,
-agent: Optional[Dict[str, Any]] = None) ->
+agent: Optional[Dict[str, Any]] = None) -> V1ExtractResponse[Any]:
 """
 Initiate an asynchronous extraction job without waiting for completion.

@@ -4092,7 +4112,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 idempotency_key (Optional[str]): Unique key to prevent duplicate requests

 Returns:
-
+V1ExtractResponse[Any] with:
 * success (bool): Whether request succeeded
 * data (Optional[Any]): Extracted data matching schema
 * error (Optional[str]): Error message if any
@@ -4111,7 +4131,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 if schema:
 schema = self._ensure_schema_dict(schema)

-request_data =
+request_data = V1ExtractResponse(
 urls=urls or [],
 allowExternalLinks=allow_external_links,
 enableWebSearch=enable_web_search,
@@ -4142,7 +4162,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 *,
 max_urls: Optional[int] = None,
 show_full_text: Optional[bool] = None,
-experimental_stream: Optional[bool] = None) ->
+experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextStatusResponse:
 """
 Generate LLMs.txt for a given URL and monitor until completion.

@@ -4153,7 +4173,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 experimental_stream (Optional[bool]): Enable experimental streaming

 Returns:
-
+V1GenerateLLMsTextStatusResponse containing:
 * success (bool): Whether generation completed successfully
 * status (str): Status of generation (processing/completed/failed)
 * data (Dict[str, str], optional): Generated text with fields:
@@ -4195,7 +4215,7 @@ class AsyncFirecrawlApp(FirecrawlApp):

 await asyncio.sleep(2)

-return
+return V1GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly', status='failed', expiresAt='')

 async def async_generate_llms_text(
 self,
@@ -4204,7 +4224,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 max_urls: Optional[int] = None,
 show_full_text: Optional[bool] = None,
 cache: Optional[bool] = None,
-experimental_stream: Optional[bool] = None) ->
+experimental_stream: Optional[bool] = None) -> V1GenerateLLMsTextResponse:
 """
 Initiate an asynchronous LLMs.txt generation job without waiting for completion.

@@ -4216,7 +4236,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 experimental_stream (Optional[bool]): Enable experimental streaming

 Returns:
-
+V1GenerateLLMsTextResponse containing:
 * success (bool): Whether job started successfully
 * id (str): Unique identifier for the job
 * error (str, optional): Error message if start failed
@@ -4232,7 +4252,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 if experimental_stream is not None:
 params['__experimental_stream'] = experimental_stream

-params =
+params = V1GenerateLLMsTextParams(
 maxUrls=max_urls,
 showFullText=show_full_text,
 cache=cache,
@@ -4252,7 +4272,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 except Exception as e:
 raise ValueError(str(e))

-async def check_generate_llms_text_status(self, id: str) ->
+async def check_generate_llms_text_status(self, id: str) -> V1GenerateLLMsTextStatusResponse:
 """
 Check the status of an asynchronous LLMs.txt generation job.

@@ -4260,7 +4280,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 id (str): The ID of the generation job

 Returns:
-
+V1GenerateLLMsTextStatusResponse containing:
 * success (bool): Whether generation completed successfully
 * status (str): Status of generation (processing/completed/failed)
 * data (Dict[str, str], optional): Generated text with fields:
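A sketch of the async LLMs.txt flow using the method names confirmed above (async_generate_llms_text, check_generate_llms_text_status); the url keyword, the import path, and the exact field access are assumptions beyond what the docstrings state:

    import asyncio
    from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed

    async def llms_txt() -> None:
        app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
        job = await app.async_generate_llms_text("https://example.com", max_urls=10)
        if not job.success:
            raise RuntimeError(job.error)
        while True:
            status = await app.check_generate_llms_text_status(job.id)
            if status.status in ("completed", "failed"):
                break
            await asyncio.sleep(2)
        print(status.status)

    asyncio.run(llms_txt())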
@@ -4292,7 +4312,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 system_prompt: Optional[str] = None,
 __experimental_stream_steps: Optional[bool] = None,
 on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
-on_source: Optional[Callable[[Dict[str, Any]], None]] = None) ->
+on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> V1DeepResearchStatusResponse:
 """
 Initiates a deep research operation on a given query and polls until completion.

@@ -4334,7 +4354,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 research_params['systemPrompt'] = system_prompt
 if __experimental_stream_steps is not None:
 research_params['__experimental_streamSteps'] = __experimental_stream_steps
-research_params =
+research_params = V1DeepResearchParams(**research_params)

 response = await self.async_deep_research(
 query,
@@ -4375,7 +4395,7 @@ class AsyncFirecrawlApp(FirecrawlApp):

 await asyncio.sleep(2)

-return
+return V1DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')

 async def async_deep_research(
 self,
@@ -4421,7 +4441,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 research_params['systemPrompt'] = system_prompt
 if __experimental_stream_steps is not None:
 research_params['__experimental_streamSteps'] = __experimental_stream_steps
-research_params =
+research_params = V1DeepResearchParams(**research_params)

 headers = self._prepare_headers()

@@ -4437,7 +4457,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 except Exception as e:
 raise ValueError(str(e))

-async def check_deep_research_status(self, id: str) ->
+async def check_deep_research_status(self, id: str) -> V1DeepResearchStatusResponse:
 """
 Check the status of a deep research operation.

@@ -4481,10 +4501,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
 lang: Optional[str] = None,
 country: Optional[str] = None,
 location: Optional[str] = None,
-timeout: Optional[int] =
-scrape_options: Optional[
-params: Optional[Union[Dict[str, Any],
-**kwargs) ->
+timeout: Optional[int] = 30000,
+scrape_options: Optional[V1ScrapeOptions] = None,
+params: Optional[Union[Dict[str, Any], V1SearchParams]] = None,
+**kwargs) -> V1SearchResponse:
 """
 Asynchronously search for content using Firecrawl.

@@ -4541,7 +4561,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 search_params.update(kwargs)

 # Create final params object
-final_params =
+final_params = V1SearchParams(query=query, **search_params)
 params_dict = final_params.dict(by_alias=True, exclude_none=True)
 params_dict['origin'] = f"python-sdk@{version}"

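A sketch of the async search call with the renamed option models. The query construction and the limit keyword follow the search parameter set earlier in this file; the method name search, the result field, and the import path are assumptions:

    import asyncio
    from firecrawl.v1.client import AsyncV1FirecrawlApp  # import path assumed

    async def run_search() -> None:
        app = AsyncV1FirecrawlApp(api_key="fc-YOUR-KEY")
        results = await app.search("firecrawl python sdk", limit=5)  # method name assumed
        if results.success:
            print(results.data)                                      # result payload field assumed

    asyncio.run(run_search())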
@@ -4551,11 +4571,11 @@ class AsyncFirecrawlApp(FirecrawlApp):
 {"Authorization": f"Bearer {self.api_key}"}
 )

-class
+class AsyncV1CrawlWatcher(V1CrawlWatcher):
 """
-Async version of
+Async version of V1CrawlWatcher that properly handles async operations.
 """
-def __init__(self, id: str, app:
+def __init__(self, id: str, app: AsyncV1FirecrawlApp):
 super().__init__(id, app)

 async def connect(self) -> None: