firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl may be problematic.
- firecrawl/__init__.py +27 -19
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- firecrawl/__tests__/e2e/v2/test_search.py +265 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +241 -0
- firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
- firecrawl/types.py +157 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +4653 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +802 -0
- firecrawl/v2/client_async.py +250 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/batch.py +85 -0
- firecrawl/v2/methods/aio/crawl.py +174 -0
- firecrawl/v2/methods/aio/extract.py +126 -0
- firecrawl/v2/methods/aio/map.py +59 -0
- firecrawl/v2/methods/aio/scrape.py +36 -0
- firecrawl/v2/methods/aio/search.py +58 -0
- firecrawl/v2/methods/aio/usage.py +42 -0
- firecrawl/v2/methods/batch.py +420 -0
- firecrawl/v2/methods/crawl.py +468 -0
- firecrawl/v2/methods/extract.py +131 -0
- firecrawl/v2/methods/map.py +77 -0
- firecrawl/v2/methods/scrape.py +68 -0
- firecrawl/v2/methods/search.py +173 -0
- firecrawl/v2/methods/usage.py +41 -0
- firecrawl/v2/types.py +546 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +153 -0
- firecrawl/v2/utils/http_client_async.py +64 -0
- firecrawl/v2/utils/validation.py +324 -0
- firecrawl/v2/watcher.py +312 -0
- firecrawl/v2/watcher_async.py +245 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
- firecrawl-3.0.3.dist-info/RECORD +78 -0
- tests/test_timeout_conversion.py +117 -0
- firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- firecrawl-2.16.5.dist-info/RECORD +0 -12
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
firecrawl/v2/methods/crawl.py (new file)
@@ -0,0 +1,468 @@
"""
Crawling functionality for Firecrawl v2 API.
"""

import time
from typing import Optional, Dict, Any
from ..types import (
    CrawlRequest,
    CrawlJob,
    CrawlResponse, Document, CrawlParamsRequest, CrawlParamsResponse, CrawlParamsData,
    WebhookConfig, CrawlErrorsResponse, ActiveCrawlsResponse, ActiveCrawl
)
from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options


def _validate_crawl_request(request: CrawlRequest) -> None:
    """
    Validate crawl request parameters.

    Args:
        request: CrawlRequest to validate

    Raises:
        ValueError: If request is invalid
    """
    if not request.url or not request.url.strip():
        raise ValueError("URL cannot be empty")

    if request.limit is not None and request.limit <= 0:
        raise ValueError("Limit must be positive")

    # Validate scrape_options (if provided)
    if request.scrape_options is not None:
        validate_scrape_options(request.scrape_options)


def _prepare_crawl_request(request: CrawlRequest) -> dict:
    """
    Prepare crawl request for API submission.

    Args:
        request: CrawlRequest to prepare

    Returns:
        Dictionary ready for API submission
    """
    # Validate request
    _validate_crawl_request(request)

    # Start with basic data
    data = {"url": request.url}

    # Add prompt if present
    if request.prompt:
        data["prompt"] = request.prompt

    # Handle scrape_options conversion first (before model_dump)
    if request.scrape_options is not None:
        scrape_data = prepare_scrape_options(request.scrape_options)
        if scrape_data:
            data["scrapeOptions"] = scrape_data

    # Convert request to dict
    request_data = request.model_dump(exclude_none=True, exclude_unset=True)

    # Remove url, prompt, and scrape_options (already handled)
    request_data.pop("url", None)
    request_data.pop("prompt", None)
    request_data.pop("scrape_options", None)

    # Handle webhook conversion first (before model_dump)
    if request.webhook is not None:
        if isinstance(request.webhook, str):
            data["webhook"] = request.webhook
        else:
            # Convert WebhookConfig to dict
            data["webhook"] = request.webhook.model_dump(exclude_none=True)

    # Convert other snake_case fields to camelCase
    field_mappings = {
        "include_paths": "includePaths",
        "exclude_paths": "excludePaths",
        "max_discovery_depth": "maxDiscoveryDepth",
        "sitemap": "sitemap",
        "ignore_query_parameters": "ignoreQueryParameters",
        "crawl_entire_domain": "crawlEntireDomain",
        "allow_external_links": "allowExternalLinks",
        "allow_subdomains": "allowSubdomains",
        "delay": "delay",
        "max_concurrency": "maxConcurrency",
        "zero_data_retention": "zeroDataRetention"
    }

    # Apply field mappings
    for snake_case, camel_case in field_mappings.items():
        if snake_case in request_data:
            data[camel_case] = request_data.pop(snake_case)

    # Add any remaining fields that don't need conversion (like limit)
    data.update(request_data)

    return data


def start_crawl(client: HttpClient, request: CrawlRequest) -> CrawlResponse:
    """
    Start a crawl job for a website.

    Args:
        client: HTTP client instance
        request: CrawlRequest containing URL and options

    Returns:
        CrawlResponse with job information

    Raises:
        ValueError: If request is invalid
        Exception: If the crawl operation fails to start
    """
    request_data = _prepare_crawl_request(request)

    response = client.post("/v2/crawl", request_data)

    if not response.ok:
        handle_response_error(response, "start crawl")

    response_data = response.json()

    if response_data.get("success"):
        job_data = {
            "id": response_data.get("id"),
            "url": response_data.get("url")
        }

        return CrawlResponse(**job_data)
    else:
        raise Exception(response_data.get("error", "Unknown error occurred"))


def get_crawl_status(client: HttpClient, job_id: str) -> CrawlJob:
    """
    Get the status of a crawl job.

    Args:
        client: HTTP client instance
        job_id: ID of the crawl job

    Returns:
        CrawlJob with current status and data

    Raises:
        Exception: If the status check fails
    """
    # Make the API request
    response = client.get(f"/v2/crawl/{job_id}")

    # Handle errors
    if not response.ok:
        handle_response_error(response, "get crawl status")

    # Parse response
    response_data = response.json()

    if response_data.get("success"):
        # The API returns status fields at the top level, not in a data field

        # Convert documents
        documents = []
        data_list = response_data.get("data", [])
        for doc_data in data_list:
            if isinstance(doc_data, str):
                # Handle case where API returns just URLs - this shouldn't happen for crawl
                # but we'll handle it gracefully
                continue
            else:
                documents.append(Document(**doc_data))

        # Create CrawlJob with current status and data
        return CrawlJob(
            status=response_data.get("status"),
            completed=response_data.get("completed", 0),
            total=response_data.get("total", 0),
            credits_used=response_data.get("creditsUsed", 0),
            expires_at=response_data.get("expiresAt"),
            next=response_data.get("next", None),
            data=documents
        )
    else:
        raise Exception(response_data.get("error", "Unknown error occurred"))


def cancel_crawl(client: HttpClient, job_id: str) -> bool:
    """
    Cancel a running crawl job.

    Args:
        client: HTTP client instance
        job_id: ID of the crawl job to cancel

    Returns:
        bool: True if the crawl was cancelled, False otherwise

    Raises:
        Exception: If the cancellation fails
    """
    response = client.delete(f"/v2/crawl/{job_id}")

    if not response.ok:
        handle_response_error(response, "cancel crawl")

    response_data = response.json()

    return response_data.get("status") == "cancelled"

def wait_for_crawl_completion(
    client: HttpClient,
    job_id: str,
    poll_interval: int = 2,
    timeout: Optional[int] = None
) -> CrawlJob:
    """
    Wait for a crawl job to complete, polling for status updates.

    Args:
        client: HTTP client instance
        job_id: ID of the crawl job
        poll_interval: Seconds between status checks
        timeout: Maximum seconds to wait (None for no timeout)

    Returns:
        CrawlJob when job completes

    Raises:
        Exception: If the job fails
        TimeoutError: If timeout is reached
    """
    start_time = time.time()

    while True:
        crawl_job = get_crawl_status(client, job_id)

        # Check if job is complete
        if crawl_job.status in ["completed", "failed"]:
            return crawl_job

        # Check timeout
        if timeout and (time.time() - start_time) > timeout:
            raise TimeoutError(f"Crawl job {job_id} did not complete within {timeout} seconds")

        # Wait before next poll
        time.sleep(poll_interval)


def crawl(
    client: HttpClient,
    request: CrawlRequest,
    poll_interval: int = 2,
    timeout: Optional[int] = None
) -> CrawlJob:
    """
    Start a crawl job and wait for it to complete.

    Args:
        client: HTTP client instance
        request: CrawlRequest containing URL and options
        poll_interval: Seconds between status checks
        timeout: Maximum seconds to wait (None for no timeout)

    Returns:
        CrawlJob when job completes

    Raises:
        ValueError: If request is invalid
        Exception: If the crawl fails to start or complete
        TimeoutError: If timeout is reached
    """
    # Start the crawl
    crawl_job = start_crawl(client, request)
    job_id = crawl_job.id

    # Wait for completion
    return wait_for_crawl_completion(
        client, job_id, poll_interval, timeout
    )


def crawl_params_preview(client: HttpClient, request: CrawlParamsRequest) -> CrawlParamsData:
    """
    Get crawl parameters from LLM based on URL and prompt.

    Args:
        client: HTTP client instance
        request: CrawlParamsRequest containing URL and prompt

    Returns:
        CrawlParamsData containing suggested crawl options

    Raises:
        ValueError: If request is invalid
        Exception: If the operation fails
    """
    # Validate request
    if not request.url or not request.url.strip():
        raise ValueError("URL cannot be empty")

    if not request.prompt or not request.prompt.strip():
        raise ValueError("Prompt cannot be empty")

    # Prepare request data
    request_data = {
        "url": request.url,
        "prompt": request.prompt
    }

    # Make the API request
    response = client.post("/v2/crawl/params-preview", request_data)

    # Handle errors
    if not response.ok:
        handle_response_error(response, "crawl params preview")

    # Parse response
    response_data = response.json()

    if response_data.get("success"):
        params_data = response_data.get("data", {})

        # Convert camelCase to snake_case for CrawlParamsData
        converted_params = {}
        field_mappings = {
            "includePaths": "include_paths",
            "excludePaths": "exclude_paths",
            "maxDiscoveryDepth": "max_discovery_depth",
            "sitemap": "sitemap",
            "ignoreQueryParameters": "ignore_query_parameters",
            "crawlEntireDomain": "crawl_entire_domain",
            "allowExternalLinks": "allow_external_links",
            "allowSubdomains": "allow_subdomains",
            "maxConcurrency": "max_concurrency",
            "scrapeOptions": "scrape_options",
            "zeroDataRetention": "zero_data_retention"
        }

        # Handle webhook conversion
        if "webhook" in params_data:
            webhook_data = params_data["webhook"]
            if isinstance(webhook_data, dict):
                converted_params["webhook"] = WebhookConfig(**webhook_data)
            else:
                converted_params["webhook"] = webhook_data

        for camel_case, snake_case in field_mappings.items():
            if camel_case in params_data:
                if camel_case == "scrapeOptions" and params_data[camel_case] is not None:
                    # Handle nested scrapeOptions conversion
                    scrape_opts_data = params_data[camel_case]
                    converted_scrape_opts = {}
                    scrape_field_mappings = {
                        "includeTags": "include_tags",
                        "excludeTags": "exclude_tags",
                        "onlyMainContent": "only_main_content",
                        "waitFor": "wait_for",
                        "skipTlsVerification": "skip_tls_verification",
                        "removeBase64Images": "remove_base64_images"
                    }

                    for scrape_camel, scrape_snake in scrape_field_mappings.items():
                        if scrape_camel in scrape_opts_data:
                            converted_scrape_opts[scrape_snake] = scrape_opts_data[scrape_camel]

                    # Handle formats field - if it's a list, convert to ScrapeFormats
                    if "formats" in scrape_opts_data:
                        formats_data = scrape_opts_data["formats"]
                        if isinstance(formats_data, list):
                            # Convert list to ScrapeFormats object
                            from ..types import ScrapeFormats
                            converted_scrape_opts["formats"] = ScrapeFormats(formats=formats_data)
                        else:
                            converted_scrape_opts["formats"] = formats_data

                    # Add fields that don't need conversion
                    for key, value in scrape_opts_data.items():
                        if key not in scrape_field_mappings and key != "formats":
                            converted_scrape_opts[key] = value

                    converted_params[snake_case] = converted_scrape_opts
                else:
                    converted_params[snake_case] = params_data[camel_case]

        # Add fields that don't need conversion
        for key, value in params_data.items():
            if key not in field_mappings:
                converted_params[key] = value

        # Add warning if present
        if "warning" in response_data:
            converted_params["warning"] = response_data["warning"]

        return CrawlParamsData(**converted_params)
    else:
        raise Exception(response_data.get("error", "Unknown error occurred"))


def get_crawl_errors(http_client: HttpClient, crawl_id: str) -> CrawlErrorsResponse:
    """
    Get errors from a crawl job.

    Args:
        http_client: HTTP client for making requests
        crawl_id: The ID of the crawl job

    Returns:
        CrawlErrorsResponse containing errors and robots blocked URLs

    Raises:
        Exception: If the request fails
    """
    response = http_client.get(f"/v2/crawl/{crawl_id}/errors")

    if not response.ok:
        handle_response_error(response, "check crawl errors")

    try:
        body = response.json()
        payload = body.get("data", body)
        # Manual key normalization since we avoid Pydantic aliases
        normalized = {
            "errors": payload.get("errors", []),
            "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
        }
        return CrawlErrorsResponse(**normalized)
    except Exception as e:
        raise Exception(f"Failed to parse crawl errors response: {e}")


def get_active_crawls(client: HttpClient) -> ActiveCrawlsResponse:
    """
    Get a list of currently active crawl jobs.

    Args:
        client: HTTP client instance

    Returns:
        ActiveCrawlsResponse containing a list of active crawl jobs

    Raises:
        Exception: If the request fails
    """
    response = client.get("/v2/crawl/active")

    if not response.ok:
        handle_response_error(response, "get active crawls")

    body = response.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error occurred"))

    crawls_in = body.get("crawls", [])
    normalized_crawls = []
    for c in crawls_in:
        if isinstance(c, dict):
            normalized_crawls.append({
                "id": c.get("id"),
                "team_id": c.get("teamId", c.get("team_id")),
                "url": c.get("url"),
                "options": c.get("options"),
            })
    return ActiveCrawlsResponse(success=True, crawls=[ActiveCrawl(**nc) for nc in normalized_crawls])
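For orientation, here is a minimal usage sketch of the module-level crawl helpers above. This is an editor-added illustration, not part of the diff: the HttpClient constructor arguments are assumed, and the CrawlRequest fields are inferred from the validation and field-mapping code; the published SDK presumably exposes the same operations through the higher-level client in firecrawl/client.py.

# Illustrative sketch only (not part of the released package).
from firecrawl.v2.types import CrawlRequest
from firecrawl.v2.utils.http_client import HttpClient
from firecrawl.v2.methods.crawl import crawl, start_crawl, get_crawl_status

client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # constructor arguments assumed

# One-shot helper: starts the job and polls every 2 seconds until it is completed or failed.
job = crawl(client, CrawlRequest(url="https://example.com", limit=5), poll_interval=2, timeout=300)
print(job.status, job.completed, "/", job.total)

# Or manage the job manually:
started = start_crawl(client, CrawlRequest(url="https://example.com"))
status = get_crawl_status(client, started.id)
print(status.status, len(status.data))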
firecrawl/v2/methods/extract.py (new file)
@@ -0,0 +1,131 @@
from typing import Any, Dict, List, Optional
import time

from ..types import ExtractResponse, ScrapeOptions
from ..utils.http_client import HttpClient
from ..utils.validation import prepare_scrape_options
from ..utils.error_handler import handle_response_error


def _prepare_extract_request(
    urls: Optional[List[str]],
    *,
    prompt: Optional[str] = None,
    schema: Optional[Dict[str, Any]] = None,
    system_prompt: Optional[str] = None,
    allow_external_links: Optional[bool] = None,
    enable_web_search: Optional[bool] = None,
    show_sources: Optional[bool] = None,
    scrape_options: Optional[ScrapeOptions] = None,
    ignore_invalid_urls: Optional[bool] = None,
) -> Dict[str, Any]:
    body: Dict[str, Any] = {}
    if urls is not None:
        body["urls"] = urls
    if prompt is not None:
        body["prompt"] = prompt
    if schema is not None:
        body["schema"] = schema
    if system_prompt is not None:
        body["systemPrompt"] = system_prompt
    if allow_external_links is not None:
        body["allowExternalLinks"] = allow_external_links
    if enable_web_search is not None:
        body["enableWebSearch"] = enable_web_search
    if show_sources is not None:
        body["showSources"] = show_sources
    if ignore_invalid_urls is not None:
        body["ignoreInvalidURLs"] = ignore_invalid_urls
    if scrape_options is not None:
        prepared = prepare_scrape_options(scrape_options)
        if prepared:
            body["scrapeOptions"] = prepared
    return body


def start_extract(
    client: HttpClient,
    urls: Optional[List[str]],
    *,
    prompt: Optional[str] = None,
    schema: Optional[Dict[str, Any]] = None,
    system_prompt: Optional[str] = None,
    allow_external_links: Optional[bool] = None,
    enable_web_search: Optional[bool] = None,
    show_sources: Optional[bool] = None,
    scrape_options: Optional[ScrapeOptions] = None,
    ignore_invalid_urls: Optional[bool] = None,
) -> ExtractResponse:
    body = _prepare_extract_request(
        urls,
        prompt=prompt,
        schema=schema,
        system_prompt=system_prompt,
        allow_external_links=allow_external_links,
        enable_web_search=enable_web_search,
        show_sources=show_sources,
        scrape_options=scrape_options,
        ignore_invalid_urls=ignore_invalid_urls,
    )
    resp = client.post("/v2/extract", body)
    if not resp.ok:
        handle_response_error(resp, "extract")
    return ExtractResponse(**resp.json())


def get_extract_status(client: HttpClient, job_id: str) -> ExtractResponse:
    resp = client.get(f"/v2/extract/{job_id}")
    if not resp.ok:
        handle_response_error(resp, "extract-status")
    return ExtractResponse(**resp.json())


def wait_extract(
    client: HttpClient,
    job_id: str,
    *,
    poll_interval: int = 2,
    timeout: Optional[int] = None,
) -> ExtractResponse:
    start_ts = time.time()
    while True:
        status = get_extract_status(client, job_id)
        if status.status in ("completed", "failed", "cancelled"):
            return status
        if timeout is not None and (time.time() - start_ts) > timeout:
            return status
        time.sleep(max(1, poll_interval))


def extract(
    client: HttpClient,
    urls: Optional[List[str]],
    *,
    prompt: Optional[str] = None,
    schema: Optional[Dict[str, Any]] = None,
    system_prompt: Optional[str] = None,
    allow_external_links: Optional[bool] = None,
    enable_web_search: Optional[bool] = None,
    show_sources: Optional[bool] = None,
    scrape_options: Optional[ScrapeOptions] = None,
    ignore_invalid_urls: Optional[bool] = None,
    poll_interval: int = 2,
    timeout: Optional[int] = None,
) -> ExtractResponse:
    started = start_extract(
        client,
        urls,
        prompt=prompt,
        schema=schema,
        system_prompt=system_prompt,
        allow_external_links=allow_external_links,
        enable_web_search=enable_web_search,
        show_sources=show_sources,
        scrape_options=scrape_options,
        ignore_invalid_urls=ignore_invalid_urls,
    )
    job_id = getattr(started, "id", None)
    if not job_id:
        return started
    return wait_extract(client, job_id, poll_interval=poll_interval, timeout=timeout)
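A minimal usage sketch of the extract helpers above follows, assuming the same HttpClient construction as before; beyond the status and id attributes used by the module itself, the shape of ExtractResponse (for example a data field) is an assumption. Editor-added illustration, not part of the diff.

# Illustrative sketch only (not part of the released package).
from firecrawl.v2.utils.http_client import HttpClient
from firecrawl.v2.methods.extract import extract

client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # constructor arguments assumed

# Starts an extract job and polls /v2/extract/{id} until it reaches a terminal state.
result = extract(
    client,
    ["https://example.com"],
    prompt="Extract the page title and a one-sentence summary",
    schema={
        "type": "object",
        "properties": {"title": {"type": "string"}, "summary": {"type": "string"}},
    },
    poll_interval=2,
    timeout=120,
)
if result.status == "completed":
    print(getattr(result, "data", None))  # extracted payload; field name assumed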
firecrawl/v2/methods/map.py (new file)
@@ -0,0 +1,77 @@
"""
Mapping functionality for Firecrawl v2 API.
"""

from typing import Optional, Dict, Any
from ..types import MapOptions, MapData, LinkResult
from ..utils import HttpClient, handle_response_error


def _prepare_map_request(url: str, options: Optional[MapOptions] = None) -> Dict[str, Any]:
    if not url or not url.strip():
        raise ValueError("URL cannot be empty")

    payload: Dict[str, Any] = {"url": url.strip()}

    if options is not None:
        # Unified sitemap parameter already provided in options
        data: Dict[str, Any] = {}
        if getattr(options, "sitemap", None) is not None:
            data["sitemap"] = options.sitemap

        if options.search is not None:
            data["search"] = options.search
        if options.include_subdomains is not None:
            data["includeSubdomains"] = options.include_subdomains
        if options.limit is not None:
            data["limit"] = options.limit
        if options.timeout is not None:
            data["timeout"] = options.timeout
        payload.update(data)

    return payload


def map(client: HttpClient, url: str, options: Optional[MapOptions] = None) -> MapData:
    """
    Map a URL and return MapData (links list with optional titles/descriptions).
    """
    request_data = _prepare_map_request(url, options)
    response = client.post("/v2/map", request_data)
    if not response.ok:
        handle_response_error(response, "map")

    body = response.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error occurred"))

    # shouldnt return inside data?
    # data = body.get("data", {})
    # result_links: list[LinkResult] = []
    # for item in data.get("links", []):
    #     if isinstance(item, dict):
    #         result_links.append(
    #             LinkResult(
    #                 url=item.get("url", ""),
    #                 title=item.get("title"),
    #                 description=item.get("description"),
    #             )
    #         )
    #     elif isinstance(item, str):
    #         result_links.append(LinkResult(url=item))

    result_links: list[LinkResult] = []
    for item in body.get("links", []):
        if isinstance(item, dict):
            result_links.append(
                LinkResult(
                    url=item.get("url", ""),
                    title=item.get("title"),
                    description=item.get("description"),
                )
            )
        elif isinstance(item, str):
            result_links.append(LinkResult(url=item))

    return MapData(links=result_links)
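Finally, a minimal usage sketch of the map helper above, again with an assumed HttpClient constructor and assuming MapOptions accepts these fields as keyword arguments. Editor-added illustration, not part of the diff.

# Illustrative sketch only (not part of the released package).
from firecrawl.v2.types import MapOptions
from firecrawl.v2.utils.http_client import HttpClient
from firecrawl.v2.methods.map import map as map_site  # "map" shadows the builtin, so alias it

client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # constructor arguments assumed

result = map_site(client, "https://example.com", MapOptions(search="docs", limit=10))  # MapOptions kwargs assumed
for link in result.links:
    print(link.url, link.title, link.description)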