firecrawl-4.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0

firecrawl/v2/methods/crawl.py
@@ -0,0 +1,592 @@
+"""
+Crawling functionality for Firecrawl v2 API.
+"""
+
+import time
+from typing import Optional, Dict, Any, List
+from ..types import (
+    CrawlRequest,
+    CrawlJob,
+    CrawlResponse, Document, CrawlParamsRequest, CrawlParamsResponse, CrawlParamsData,
+    WebhookConfig, CrawlErrorsResponse, ActiveCrawlsResponse, ActiveCrawl, PaginationConfig
+)
+from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+from ..utils.normalize import normalize_document_input
+
+
+def _validate_crawl_request(request: CrawlRequest) -> None:
+    """
+    Validate crawl request parameters.
+
+    Args:
+        request: CrawlRequest to validate
+
+    Raises:
+        ValueError: If request is invalid
+    """
+    if not request.url or not request.url.strip():
+        raise ValueError("URL cannot be empty")
+
+    if request.limit is not None and request.limit <= 0:
+        raise ValueError("Limit must be positive")
+
+    # Validate scrape_options (if provided)
+    if request.scrape_options is not None:
+        validate_scrape_options(request.scrape_options)
+
+
+def _prepare_crawl_request(request: CrawlRequest) -> dict:
+    """
+    Prepare crawl request for API submission.
+
+    Args:
+        request: CrawlRequest to prepare
+
+    Returns:
+        Dictionary ready for API submission
+    """
+    # Validate request
+    _validate_crawl_request(request)
+
+    # Start with basic data
+    data = {"url": request.url}
+
+    # Add prompt if present
+    if request.prompt:
+        data["prompt"] = request.prompt
+
+    # Handle scrape_options conversion first (before model_dump)
+    if request.scrape_options is not None:
+        scrape_data = prepare_scrape_options(request.scrape_options)
+        if scrape_data:
+            data["scrapeOptions"] = scrape_data
+
+    # Convert request to dict
+    request_data = request.model_dump(exclude_none=True, exclude_unset=True)
+
+    # Remove url, prompt, and scrape_options (already handled)
+    request_data.pop("url", None)
+    request_data.pop("prompt", None)
+    request_data.pop("scrape_options", None)
+
+    # Handle webhook conversion first (before model_dump)
+    if request.webhook is not None:
+        if isinstance(request.webhook, str):
+            data["webhook"] = request.webhook
+        else:
+            # Convert WebhookConfig to dict
+            data["webhook"] = request.webhook.model_dump(exclude_none=True)
+
+    # Convert other snake_case fields to camelCase
+    field_mappings = {
+        "include_paths": "includePaths",
+        "exclude_paths": "excludePaths",
+        "max_discovery_depth": "maxDiscoveryDepth",
+        "sitemap": "sitemap",
+        "ignore_query_parameters": "ignoreQueryParameters",
+        "crawl_entire_domain": "crawlEntireDomain",
+        "allow_external_links": "allowExternalLinks",
+        "allow_subdomains": "allowSubdomains",
+        "delay": "delay",
+        "max_concurrency": "maxConcurrency",
+        "zero_data_retention": "zeroDataRetention"
+    }
+
+    # Apply field mappings
+    for snake_case, camel_case in field_mappings.items():
+        if snake_case in request_data:
+            data[camel_case] = request_data.pop(snake_case)
+
+    # Add any remaining fields that don't need conversion (like limit)
+    data.update(request_data)
+    # Trim integration if present
+    if "integration" in data and isinstance(data["integration"], str):
+        data["integration"] = data["integration"].strip()
+
+    return data
+
+
+def start_crawl(client: HttpClient, request: CrawlRequest) -> CrawlResponse:
+    """
+    Start a crawl job for a website.
+
+    Args:
+        client: HTTP client instance
+        request: CrawlRequest containing URL and options
+
+    Returns:
+        CrawlResponse with job information
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the crawl operation fails to start
+    """
+    request_data = _prepare_crawl_request(request)
+
+    response = client.post("/v2/crawl", request_data)
+
+    if not response.ok:
+        handle_response_error(response, "start crawl")
+
+    response_data = response.json()
+
+    if response_data.get("success"):
+        job_data = {
+            "id": response_data.get("id"),
+            "url": response_data.get("url")
+        }
+
+        return CrawlResponse(**job_data)
+    else:
+        raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
+def get_crawl_status(
+    client: HttpClient,
+    job_id: str,
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
+) -> CrawlJob:
+    """
+    Get the status of a crawl job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the crawl job
+        pagination_config: Optional configuration for pagination behavior
+        request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+            is enabled (default) and there are multiple pages of results, this timeout applies to
+            each page request separately, not to the entire operation
+
+    Returns:
+        CrawlJob with current status and data
+
+    Raises:
+        Exception: If the status check fails
+    """
+    # Make the API request
+    response = client.get(f"/v2/crawl/{job_id}", timeout=request_timeout)
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "get crawl status")
+
+    # Parse response
+    response_data = response.json()
+
+    if response_data.get("success"):
+        # The API returns status fields at the top level, not in a data field
+
+        # Convert documents
+        documents = []
+        data_list = response_data.get("data", [])
+        for doc_data in data_list:
+            if isinstance(doc_data, str):
+                # Handle case where API returns just URLs - this shouldn't happen for crawl
+                # but we'll handle it gracefully
+                continue
+            else:
+                documents.append(Document(**normalize_document_input(doc_data)))
+
+        # Handle pagination if requested
+        auto_paginate = pagination_config.auto_paginate if pagination_config else True
+        if auto_paginate and response_data.get("next") and not (
+            pagination_config
+            and pagination_config.max_results is not None
+            and len(documents) >= pagination_config.max_results
+        ):
+            documents = _fetch_all_pages(
+                client,
+                response_data.get("next"),
+                documents,
+                pagination_config,
+                request_timeout=request_timeout,
+            )
+
+        # Create CrawlJob with current status and data
+        return CrawlJob(
+            status=response_data.get("status"),
+            completed=response_data.get("completed", 0),
+            total=response_data.get("total", 0),
+            credits_used=response_data.get("creditsUsed", 0),
+            expires_at=response_data.get("expiresAt"),
+            next=response_data.get("next", None) if not auto_paginate else None,
+            data=documents
+        )
+    else:
+        raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
+def _fetch_all_pages(
+    client: HttpClient,
+    next_url: str,
+    initial_documents: List[Document],
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
+) -> List[Document]:
+    """
+    Fetch all pages of crawl results.
+
+    Args:
+        client: HTTP client instance
+        next_url: URL for the next page
+        initial_documents: Documents from the first page
+        pagination_config: Optional configuration for pagination limits
+        request_timeout: Optional timeout (in seconds) for the underlying HTTP request
+
+    Returns:
+        List of all documents from all pages
+    """
+    documents = initial_documents.copy()
+    current_url = next_url
+    page_count = 0
+
+    # Apply pagination limits
+    max_pages = pagination_config.max_pages if pagination_config else None
+    max_results = pagination_config.max_results if pagination_config else None
+    max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+    start_time = time.monotonic()
+
+    while current_url:
+        # Check pagination limits (treat 0 as a valid limit)
+        if (max_pages is not None) and page_count >= max_pages:
+            break
+
+        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+            break
+
+        # Fetch next page
+        response = client.get(current_url, timeout=request_timeout)
+
+        if not response.ok:
+            # Log error but continue with what we have
+            import logging
+            logger = logging.getLogger("firecrawl")
+            logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
+            break
+
+        page_data = response.json()
+
+        if not page_data.get("success"):
+            break
+
+        # Add documents from this page
+        data_list = page_data.get("data", [])
+        for doc_data in data_list:
+            if isinstance(doc_data, str):
+                continue
+            else:
+                # Check max_results limit BEFORE adding each document
+                if max_results is not None and len(documents) >= max_results:
+                    break
+                documents.append(Document(**normalize_document_input(doc_data)))
+
+        # Check if we hit max_results limit
+        if max_results is not None and len(documents) >= max_results:
+            break
+
+        # Get next URL
+        current_url = page_data.get("next")
+        page_count += 1
+
+    return documents
+
+
+def cancel_crawl(client: HttpClient, job_id: str) -> bool:
+    """
+    Cancel a running crawl job.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the crawl job to cancel
+
+    Returns:
+        bool: True if the crawl was cancelled, False otherwise
+
+    Raises:
+        Exception: If the cancellation fails
+    """
+    response = client.delete(f"/v2/crawl/{job_id}")
+
+    if not response.ok:
+        handle_response_error(response, "cancel crawl")
+
+    response_data = response.json()
+
+    return response_data.get("status") == "cancelled"
+
+def wait_for_crawl_completion(
+    client: HttpClient,
+    job_id: str,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None,
+    *,
+    request_timeout: Optional[float] = None,
+) -> CrawlJob:
+    """
+    Wait for a crawl job to complete, polling for status updates.
+
+    Args:
+        client: HTTP client instance
+        job_id: ID of the crawl job
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait (None for no timeout)
+        request_timeout: Optional timeout (in seconds) for each status request
+
+    Returns:
+        CrawlJob when job completes
+
+    Raises:
+        Exception: If the job fails
+        TimeoutError: If timeout is reached
+    """
+    start_time = time.monotonic()
+
+    while True:
+        crawl_job = get_crawl_status(
+            client,
+            job_id,
+            request_timeout=request_timeout,
+        )
+
+        # Check if job is complete
+        if crawl_job.status in ["completed", "failed", "cancelled"]:
+            return crawl_job
+
+        # Check timeout
+        if timeout is not None and (time.monotonic() - start_time) > timeout:
+            raise TimeoutError(f"Crawl job {job_id} did not complete within {timeout} seconds")
+
+        # Wait before next poll
+        time.sleep(poll_interval)
+
+
+def crawl(
+    client: HttpClient,
+    request: CrawlRequest,
+    poll_interval: int = 2,
+    timeout: Optional[int] = None,
+    *,
+    request_timeout: Optional[float] = None,
+) -> CrawlJob:
+    """
+    Start a crawl job and wait for it to complete.
+
+    Args:
+        client: HTTP client instance
+        request: CrawlRequest containing URL and options
+        poll_interval: Seconds between status checks
+        timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
+        request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination
+            requests when fetching results. If there are multiple pages, each page request gets this timeout
+
+    Returns:
+        CrawlJob when job completes
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the crawl fails to start or complete
+        TimeoutError: If timeout is reached
+    """
+    # Start the crawl
+    crawl_job = start_crawl(client, request)
+    job_id = crawl_job.id
+
+    # Determine the per-request timeout. If not provided, reuse the overall timeout value.
+    effective_request_timeout = request_timeout if request_timeout is not None else timeout
+
+    # Wait for completion
+    return wait_for_crawl_completion(
+        client,
+        job_id,
+        poll_interval,
+        timeout,
+        request_timeout=effective_request_timeout,
+    )
+
+
+def crawl_params_preview(client: HttpClient, request: CrawlParamsRequest) -> CrawlParamsData:
+    """
+    Get crawl parameters from LLM based on URL and prompt.
+
+    Args:
+        client: HTTP client instance
+        request: CrawlParamsRequest containing URL and prompt
+
+    Returns:
+        CrawlParamsData containing suggested crawl options
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the operation fails
+    """
+    # Validate request
+    if not request.url or not request.url.strip():
+        raise ValueError("URL cannot be empty")
+
+    if not request.prompt or not request.prompt.strip():
+        raise ValueError("Prompt cannot be empty")
+
+    # Prepare request data
+    request_data = {
+        "url": request.url,
+        "prompt": request.prompt
+    }
+
+    # Make the API request
+    response = client.post("/v2/crawl/params-preview", request_data)
+
+    # Handle errors
+    if not response.ok:
+        handle_response_error(response, "crawl params preview")
+
+    # Parse response
+    response_data = response.json()
+
+    if response_data.get("success"):
+        params_data = response_data.get("data", {})
+
+        # Convert camelCase to snake_case for CrawlParamsData
+        converted_params = {}
+        field_mappings = {
+            "includePaths": "include_paths",
+            "excludePaths": "exclude_paths",
+            "maxDiscoveryDepth": "max_discovery_depth",
+            "sitemap": "sitemap",
+            "ignoreQueryParameters": "ignore_query_parameters",
+            "crawlEntireDomain": "crawl_entire_domain",
+            "allowExternalLinks": "allow_external_links",
+            "allowSubdomains": "allow_subdomains",
+            "maxConcurrency": "max_concurrency",
+            "scrapeOptions": "scrape_options",
+            "zeroDataRetention": "zero_data_retention"
+        }
+
+        # Handle webhook conversion
+        if "webhook" in params_data:
+            webhook_data = params_data["webhook"]
+            if isinstance(webhook_data, dict):
+                converted_params["webhook"] = WebhookConfig(**webhook_data)
+            else:
+                converted_params["webhook"] = webhook_data
+
+        for camel_case, snake_case in field_mappings.items():
+            if camel_case in params_data:
+                if camel_case == "scrapeOptions" and params_data[camel_case] is not None:
+                    # Handle nested scrapeOptions conversion
+                    scrape_opts_data = params_data[camel_case]
+                    converted_scrape_opts = {}
+                    scrape_field_mappings = {
+                        "includeTags": "include_tags",
+                        "excludeTags": "exclude_tags",
+                        "onlyMainContent": "only_main_content",
+                        "waitFor": "wait_for",
+                        "skipTlsVerification": "skip_tls_verification",
+                        "removeBase64Images": "remove_base64_images"
+                    }
+
+                    for scrape_camel, scrape_snake in scrape_field_mappings.items():
+                        if scrape_camel in scrape_opts_data:
+                            converted_scrape_opts[scrape_snake] = scrape_opts_data[scrape_camel]
+
+                    # Handle formats field - if it's a list, convert to ScrapeFormats
+                    if "formats" in scrape_opts_data:
+                        formats_data = scrape_opts_data["formats"]
+                        if isinstance(formats_data, list):
+                            # Convert list to ScrapeFormats object
+                            from ..types import ScrapeFormats
+                            converted_scrape_opts["formats"] = ScrapeFormats(formats=formats_data)
+                        else:
+                            converted_scrape_opts["formats"] = formats_data
+
+                    # Add fields that don't need conversion
+                    for key, value in scrape_opts_data.items():
+                        if key not in scrape_field_mappings and key != "formats":
+                            converted_scrape_opts[key] = value
+
+                    converted_params[snake_case] = converted_scrape_opts
+                else:
+                    converted_params[snake_case] = params_data[camel_case]
+
+        # Add fields that don't need conversion
+        for key, value in params_data.items():
+            if key not in field_mappings:
+                converted_params[key] = value
+
+        # Add warning if present
+        if "warning" in response_data:
+            converted_params["warning"] = response_data["warning"]
+
+        return CrawlParamsData(**converted_params)
+    else:
+        raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
+def get_crawl_errors(http_client: HttpClient, crawl_id: str) -> CrawlErrorsResponse:
+    """
+    Get errors from a crawl job.
+
+    Args:
+        http_client: HTTP client for making requests
+        crawl_id: The ID of the crawl job
+
+    Returns:
+        CrawlErrorsResponse containing errors and robots blocked URLs
+
+    Raises:
+        Exception: If the request fails
+    """
+    response = http_client.get(f"/v2/crawl/{crawl_id}/errors")
+
+    if not response.ok:
+        handle_response_error(response, "check crawl errors")
+
+    try:
+        body = response.json()
+        payload = body.get("data", body)
+        # Manual key normalization since we avoid Pydantic aliases
+        normalized = {
+            "errors": payload.get("errors", []),
+            "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
+        }
+        return CrawlErrorsResponse(**normalized)
+    except Exception as e:
+        raise Exception(f"Failed to parse crawl errors response: {e}")
+
+
+def get_active_crawls(client: HttpClient) -> ActiveCrawlsResponse:
+    """
+    Get a list of currently active crawl jobs.
+
+    Args:
+        client: HTTP client instance
+
+    Returns:
+        ActiveCrawlsResponse containing a list of active crawl jobs
+
+    Raises:
+        Exception: If the request fails
+    """
+    response = client.get("/v2/crawl/active")
+
+    if not response.ok:
+        handle_response_error(response, "get active crawls")
+
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+
+    crawls_in = body.get("crawls", [])
+    normalized_crawls = []
+    for c in crawls_in:
+        if isinstance(c, dict):
+            normalized_crawls.append({
+                "id": c.get("id"),
+                "team_id": c.get("teamId", c.get("team_id")),
+                "url": c.get("url"),
+                "options": c.get("options"),
+            })
+    return ActiveCrawlsResponse(success=True, crawls=[ActiveCrawl(**nc) for nc in normalized_crawls])