firecrawl-4.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
firecrawl/v2/methods/aio/batch.py
@@ -0,0 +1,188 @@
from typing import Optional, List, Dict, Any
from ...types import ScrapeOptions, WebhookConfig, Document, BatchScrapeResponse, BatchScrapeJob, PaginationConfig
from ...utils.http_client_async import AsyncHttpClient
from ...utils.validation import prepare_scrape_options
from ...utils.error_handler import handle_response_error
from ...utils.normalize import normalize_document_input
from ...methods.batch import validate_batch_urls
import time


def _prepare(urls: List[str], *, options: Optional[ScrapeOptions] = None, **kwargs) -> Dict[str, Any]:
    if not urls:
        raise ValueError("URLs list cannot be empty")

    validated_urls = validate_batch_urls([u.strip() if isinstance(u, str) else u for u in urls])
    payload: Dict[str, Any] = {"urls": validated_urls}
    if options:
        opts = prepare_scrape_options(options)
        if opts:
            payload.update(opts)
    if (w := kwargs.get("webhook")) is not None:
        payload["webhook"] = w if isinstance(w, str) else w.model_dump(exclude_none=True)
    if (v := kwargs.get("append_to_id")) is not None:
        payload["appendToId"] = v
    if (v := kwargs.get("ignore_invalid_urls")) is not None:
        payload["ignoreInvalidURLs"] = v
    if (v := kwargs.get("max_concurrency")) is not None:
        payload["maxConcurrency"] = v
    if (v := kwargs.get("zero_data_retention")) is not None:
        payload["zeroDataRetention"] = v
    if (v := kwargs.get("integration")) is not None:
        trimmed_integration = str(v).strip()
        if trimmed_integration:
            payload["integration"] = trimmed_integration
    return payload


async def start_batch_scrape(client: AsyncHttpClient, urls: List[str], **kwargs) -> BatchScrapeResponse:
    payload = _prepare(urls, **kwargs)
    response = await client.post("/v2/batch/scrape", payload)
    if response.status_code >= 400:
        handle_response_error(response, "start batch scrape")
    body = response.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error occurred"))
    return BatchScrapeResponse(id=body.get("id"), url=body.get("url"), invalid_urls=body.get("invalidURLs"))


async def get_batch_scrape_status(
    client: AsyncHttpClient,
    job_id: str,
    pagination_config: Optional[PaginationConfig] = None
) -> BatchScrapeJob:
    """
    Get the status of a batch scrape job.

    Args:
        client: Async HTTP client instance
        job_id: ID of the batch scrape job
        pagination_config: Optional configuration for pagination behavior

    Returns:
        BatchScrapeJob containing job status and data

    Raises:
        Exception: If the status check fails
    """
    response = await client.get(f"/v2/batch/scrape/{job_id}")
    if response.status_code >= 400:
        handle_response_error(response, "get batch scrape status")
    body = response.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error occurred"))
    docs: List[Document] = []
    for doc in body.get("data", []) or []:
        if isinstance(doc, dict):
            normalized = normalize_document_input(doc)
            docs.append(Document(**normalized))

    # Handle pagination if requested
    auto_paginate = pagination_config.auto_paginate if pagination_config else True
    if auto_paginate and body.get("next"):
        docs = await _fetch_all_batch_pages_async(
            client,
            body.get("next"),
            docs,
            pagination_config
        )

    return BatchScrapeJob(
        status=body.get("status"),
        completed=body.get("completed", 0),
        total=body.get("total", 0),
        credits_used=body.get("creditsUsed"),
        expires_at=body.get("expiresAt"),
        next=body.get("next") if not auto_paginate else None,
        data=docs,
    )


async def _fetch_all_batch_pages_async(
    client: AsyncHttpClient,
    next_url: str,
    initial_documents: List[Document],
    pagination_config: Optional[PaginationConfig] = None
) -> List[Document]:
    """
    Fetch all pages of batch scrape results asynchronously.

    Args:
        client: Async HTTP client instance
        next_url: URL for the next page
        initial_documents: Documents from the first page
        pagination_config: Optional configuration for pagination limits

    Returns:
        List of all documents from all pages
    """
    documents = initial_documents.copy()
    current_url = next_url
    page_count = 0

    # Apply pagination limits
    max_pages = pagination_config.max_pages if pagination_config else None
    max_results = pagination_config.max_results if pagination_config else None
    max_wait_time = pagination_config.max_wait_time if pagination_config else None

    start_time = time.monotonic()

    while current_url:
        # Check pagination limits
        if (max_pages is not None) and (page_count >= max_pages):
            break

        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
            break

        # Fetch next page
        response = await client.get(current_url)

        if response.status_code >= 400:
            # Log error but continue with what we have
            import logging
            logger = logging.getLogger("firecrawl")
            logger.warning(f"Failed to fetch next page: {response.status_code}")
            break

        page_data = response.json()

        if not page_data.get("success"):
            break

        # Add documents from this page
        for doc in page_data.get("data", []) or []:
            if isinstance(doc, dict):
                # Check max_results limit
                if (max_results is not None) and (len(documents) >= max_results):
                    break
                normalized = normalize_document_input(doc)
                documents.append(Document(**normalized))

        # Check if we hit max_results limit
        if (max_results is not None) and (len(documents) >= max_results):
            break

        # Get next URL
        current_url = page_data.get("next")
        page_count += 1

    return documents


async def cancel_batch_scrape(client: AsyncHttpClient, job_id: str) -> bool:
    response = await client.delete(f"/v2/batch/scrape/{job_id}")
    if response.status_code >= 400:
        handle_response_error(response, "cancel batch scrape")
    body = response.json()
    return body.get("status") == "cancelled"


async def get_batch_scrape_errors(client: AsyncHttpClient, job_id: str) -> Dict[str, Any]:
    response = await client.get(f"/v2/batch/scrape/{job_id}/errors")
    if response.status_code >= 400:
        handle_response_error(response, "get batch scrape errors")
    body = response.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error occurred"))
    return body
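The module above covers the async batch-scrape lifecycle: _prepare builds the camelCase request body, start_batch_scrape POSTs it to /v2/batch/scrape, get_batch_scrape_status collects documents (auto-paginating by default, with optional PaginationConfig limits), and cancel_batch_scrape / get_batch_scrape_errors handle the rest. A minimal usage sketch follows; it is not part of the package. The AsyncHttpClient(api_key=..., api_url=...) arguments, the ScrapeOptions(formats=[...]) field, the PaginationConfig constructor keywords, and the terminal status strings are assumptions inferred from this diff rather than confirmed signatures.

import asyncio

from firecrawl.v2.types import PaginationConfig, ScrapeOptions
from firecrawl.v2.utils.http_client_async import AsyncHttpClient
from firecrawl.v2.methods.aio import batch as aio_batch


async def main() -> None:
    # Assumed constructor arguments; check AsyncHttpClient in http_client_async.py for the real signature.
    client = AsyncHttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")

    # Start the job; snake_case kwargs are mapped to camelCase fields by _prepare.
    job = await aio_batch.start_batch_scrape(
        client,
        ["https://example.com", "https://example.org"],
        options=ScrapeOptions(formats=["markdown"]),  # assumed ScrapeOptions field
        ignore_invalid_urls=True,
        max_concurrency=2,
    )

    # Cap auto-pagination so a very large job cannot fetch pages indefinitely.
    pagination = PaginationConfig(max_pages=5, max_results=100, max_wait_time=60)  # assumed keyword names

    # Poll until the job reaches a terminal state (status strings assumed).
    while True:
        status = await aio_batch.get_batch_scrape_status(client, job.id, pagination)
        if status.status in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(2)

    print(status.status, len(status.data or []))


asyncio.run(main())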
firecrawl/v2/methods/aio/crawl.py
@@ -0,0 +1,351 @@
from typing import Optional, Dict, Any, List
from ...types import (
    CrawlRequest,
    CrawlJob,
    CrawlResponse,
    Document,
    CrawlParamsRequest,
    CrawlParamsData,
    WebhookConfig,
    CrawlErrorsResponse,
    ActiveCrawlsResponse,
    ActiveCrawl,
    PaginationConfig,
)
from ...utils.error_handler import handle_response_error
from ...utils.validation import prepare_scrape_options
from ...utils.http_client_async import AsyncHttpClient
from ...utils.normalize import normalize_document_input
import time


def _prepare_crawl_request(request: CrawlRequest) -> dict:
    if not request.url or not request.url.strip():
        raise ValueError("URL cannot be empty")
    data = {"url": request.url}
    if request.prompt:
        data["prompt"] = request.prompt
    if request.scrape_options is not None:
        opts = prepare_scrape_options(request.scrape_options)
        if opts:
            data["scrapeOptions"] = opts
    # Webhook conversion
    if request.webhook is not None:
        if isinstance(request.webhook, str):
            data["webhook"] = request.webhook
        else:
            data["webhook"] = request.webhook.model_dump(exclude_none=True)
    request_data = request.model_dump(exclude_none=True, exclude_unset=True)
    request_data.pop("url", None)
    request_data.pop("prompt", None)
    request_data.pop("scrape_options", None)
    field_mappings = {
        "include_paths": "includePaths",
        "exclude_paths": "excludePaths",
        "max_discovery_depth": "maxDiscoveryDepth",
        "ignore_sitemap": "ignoreSitemap",
        "ignore_query_parameters": "ignoreQueryParameters",
        "crawl_entire_domain": "crawlEntireDomain",
        "allow_external_links": "allowExternalLinks",
        "allow_subdomains": "allowSubdomains",
        "delay": "delay",
        "max_concurrency": "maxConcurrency",
        "zero_data_retention": "zeroDataRetention",
    }
    for snake, camel in field_mappings.items():
        if snake in request_data:
            data[camel] = request_data.pop(snake)
    data.update(request_data)
    if getattr(request, "integration", None) is not None:
        data["integration"] = str(getattr(request, "integration")).strip()
    return data


async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlResponse:
    """
    Start a crawl job for a website.

    Args:
        client: Async HTTP client instance
        request: CrawlRequest containing URL and options

    Returns:
        CrawlResponse with job information

    Raises:
        ValueError: If request is invalid
        Exception: If the crawl operation fails to start
    """
    payload = _prepare_crawl_request(request)
    response = await client.post("/v2/crawl", payload)
    if response.status_code >= 400:
        handle_response_error(response, "start crawl")
    body = response.json()
    if body.get("success"):
        return CrawlResponse(id=body.get("id"), url=body.get("url"))
    raise Exception(body.get("error", "Unknown error occurred"))


async def get_crawl_status(
    client: AsyncHttpClient,
    job_id: str,
    pagination_config: Optional[PaginationConfig] = None,
    *,
    request_timeout: Optional[float] = None,
) -> CrawlJob:
    """
    Get the status of a crawl job.

    Args:
        client: Async HTTP client instance
        job_id: ID of the crawl job
        pagination_config: Optional configuration for pagination limits
        request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
            is enabled (default) and there are multiple pages of results, this timeout applies to
            each page request separately, not to the entire operation

    Returns:
        CrawlJob with job information

    Raises:
        Exception: If the status check fails
    """
    response = await client.get(f"/v2/crawl/{job_id}", timeout=request_timeout)
    if response.status_code >= 400:
        handle_response_error(response, "get crawl status")
    body = response.json()
    if body.get("success"):
        documents = []
        for doc_data in body.get("data", []):
            if isinstance(doc_data, dict):
                normalized = normalize_document_input(doc_data)
                documents.append(Document(**normalized))

        # Handle pagination if requested
        auto_paginate = pagination_config.auto_paginate if pagination_config else True
        if auto_paginate and body.get("next"):
            documents = await _fetch_all_pages_async(
                client,
                body.get("next"),
                documents,
                pagination_config,
                request_timeout=request_timeout,
            )

        return CrawlJob(
            status=body.get("status"),
            completed=body.get("completed", 0),
            total=body.get("total", 0),
            credits_used=body.get("creditsUsed", 0),
            expires_at=body.get("expiresAt"),
            next=body.get("next") if not auto_paginate else None,
            data=documents,
        )
    raise Exception(body.get("error", "Unknown error occurred"))


async def _fetch_all_pages_async(
    client: AsyncHttpClient,
    next_url: str,
    initial_documents: List[Document],
    pagination_config: Optional[PaginationConfig] = None,
    *,
    request_timeout: Optional[float] = None,
) -> List[Document]:
    """
    Fetch all pages of crawl results asynchronously.

    Args:
        client: Async HTTP client instance
        next_url: URL for the next page
        initial_documents: Documents from the first page
        pagination_config: Optional configuration for pagination limits
        request_timeout: Optional timeout (in seconds) for the underlying HTTP request

    Returns:
        List of all documents from all pages
    """
    documents = initial_documents.copy()
    current_url = next_url
    page_count = 0

    # Apply pagination limits
    max_pages = pagination_config.max_pages if pagination_config else None
    max_results = pagination_config.max_results if pagination_config else None
    max_wait_time = pagination_config.max_wait_time if pagination_config else None

    start_time = time.monotonic()

    while current_url:
        # Check pagination limits (treat 0 as a valid limit)
        if (max_pages is not None) and page_count >= max_pages:
            break

        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
            break

        # Fetch next page
        response = await client.get(current_url, timeout=request_timeout)

        if response.status_code >= 400:
            # Log error but continue with what we have
            import logging
            logger = logging.getLogger("firecrawl")
            logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
            break

        page_data = response.json()

        if not page_data.get("success"):
            break

        # Add documents from this page
        for doc_data in page_data.get("data", []):
            if isinstance(doc_data, dict):
                # Check max_results limit
                if (max_results is not None) and (len(documents) >= max_results):
                    break
                normalized = normalize_document_input(doc_data)
                documents.append(Document(**normalized))

        # Check if we hit max_results limit
        if (max_results is not None) and (len(documents) >= max_results):
            break

        # Get next URL
        current_url = page_data.get("next")
        page_count += 1

    return documents


async def cancel_crawl(client: AsyncHttpClient, job_id: str) -> bool:
    """
    Cancel a crawl job.

    Args:
        client: Async HTTP client instance
        job_id: ID of the crawl job

    Returns:
        True if cancellation was successful

    Raises:
        Exception: If the cancellation operation fails
    """
    response = await client.delete(f"/v2/crawl/{job_id}")
    if response.status_code >= 400:
        handle_response_error(response, "cancel crawl")
    body = response.json()
    return body.get("status") == "cancelled"


async def crawl_params_preview(client: AsyncHttpClient, request: CrawlParamsRequest) -> CrawlParamsData:
    """
    Preview crawl parameters before starting a crawl job.

    Args:
        client: Async HTTP client instance
        request: CrawlParamsRequest containing URL and prompt

    Returns:
        CrawlParamsData containing crawl configuration

    Raises:
        ValueError: If request is invalid
        Exception: If the parameter preview fails
    """
    if not request.url or not request.url.strip():
        raise ValueError("URL cannot be empty")
    if not request.prompt or not request.prompt.strip():
        raise ValueError("Prompt cannot be empty")
    payload = {"url": request.url, "prompt": request.prompt}
    response = await client.post("/v2/crawl/params-preview", payload)
    if response.status_code >= 400:
        handle_response_error(response, "crawl params preview")
    body = response.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error occurred"))
    params_data = body.get("data", {})
    converted: Dict[str, Any] = {}
    mapping = {
        "includePaths": "include_paths",
        "excludePaths": "exclude_paths",
        "maxDiscoveryDepth": "max_discovery_depth",
        "ignoreSitemap": "ignore_sitemap",
        "ignoreQueryParameters": "ignore_query_parameters",
        "crawlEntireDomain": "crawl_entire_domain",
        "allowExternalLinks": "allow_external_links",
        "allowSubdomains": "allow_subdomains",
        "maxConcurrency": "max_concurrency",
        "scrapeOptions": "scrape_options",
        "zeroDataRetention": "zero_data_retention",
    }
    for camel, snake in mapping.items():
        if camel in params_data:
            converted[snake] = params_data[camel]
    if "webhook" in params_data:
        wk = params_data["webhook"]
        converted["webhook"] = wk
    if "warning" in body:
        converted["warning"] = body["warning"]
    return CrawlParamsData(**converted)


async def get_crawl_errors(client: AsyncHttpClient, crawl_id: str) -> CrawlErrorsResponse:
    """
    Get errors from a crawl job.

    Args:
        client: Async HTTP client instance
        crawl_id: ID of the crawl job

    Returns:
        CrawlErrorsResponse with errors and robots blocked

    Raises:
        Exception: If the error check operation fails
    """
    response = await client.get(f"/v2/crawl/{crawl_id}/errors")
    if response.status_code >= 400:
        handle_response_error(response, "check crawl errors")
    body = response.json()
    payload = body.get("data", body)
    normalized = {
        "errors": payload.get("errors", []),
        "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
    }
    return CrawlErrorsResponse(**normalized)


async def get_active_crawls(client: AsyncHttpClient) -> ActiveCrawlsResponse:
    """
    Get active crawl jobs.

    Args:
        client: Async HTTP client instance

    Returns:
        ActiveCrawlsResponse with active crawl jobs

    Raises:
        Exception: If the active crawl jobs operation fails
    """
    response = await client.get("/v2/crawl/active")
    if response.status_code >= 400:
        handle_response_error(response, "get active crawls")
    body = response.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error occurred"))
    crawls_in = body.get("crawls", [])
    normalized = []
    for c in crawls_in:
        if isinstance(c, dict):
            normalized.append({
                "id": c.get("id"),
                "team_id": c.get("teamId", c.get("team_id")),
                "url": c.get("url"),
                "options": c.get("options"),
            })
    return ActiveCrawlsResponse(success=True, crawls=[ActiveCrawl(**nc) for nc in normalized])
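The crawl module mirrors the batch flow: _prepare_crawl_request converts a CrawlRequest into the camelCase payload for /v2/crawl, get_crawl_status auto-paginates results (with a per-page request_timeout), and crawl_params_preview, get_crawl_errors, and get_active_crawls cover the remaining endpoints. A usage sketch, not part of the package, is below; the AsyncHttpClient constructor, the PaginationConfig keyword, and the terminal crawl status strings are assumptions inferred from this diff.

import asyncio

from firecrawl.v2.types import CrawlRequest, PaginationConfig
from firecrawl.v2.utils.http_client_async import AsyncHttpClient
from firecrawl.v2.methods.aio import crawl as aio_crawl


async def main() -> None:
    # Assumed constructor arguments; check AsyncHttpClient for the real signature.
    client = AsyncHttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")

    # start_crawl validates the URL, maps snake_case options to camelCase, and POSTs /v2/crawl.
    started = await aio_crawl.start_crawl(
        client,
        CrawlRequest(url="https://example.com", prompt="Only crawl the blog posts"),
    )

    # request_timeout applies to each page request during auto-pagination, not the whole call.
    pagination = PaginationConfig(max_pages=10)  # assumed keyword name

    # Poll until the crawl settles (terminal status strings assumed).
    while True:
        job = await aio_crawl.get_crawl_status(client, started.id, pagination, request_timeout=30.0)
        if job.status in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(2)

    # Inspect per-URL failures and robots.txt blocks after the fact.
    errors = await aio_crawl.get_crawl_errors(client, started.id)
    print(job.status, len(job.data or []), len(errors.errors))


asyncio.run(main())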
firecrawl/v2/methods/aio/extract.py
@@ -0,0 +1,133 @@
from typing import Any, Dict, List, Optional
import asyncio

from ...types import ExtractResponse, ScrapeOptions
from ...utils.http_client_async import AsyncHttpClient
from ...utils.validation import prepare_scrape_options


def _prepare_extract_request(
    urls: Optional[List[str]],
    *,
    prompt: Optional[str] = None,
    schema: Optional[Dict[str, Any]] = None,
    system_prompt: Optional[str] = None,
    allow_external_links: Optional[bool] = None,
    enable_web_search: Optional[bool] = None,
    show_sources: Optional[bool] = None,
    scrape_options: Optional[ScrapeOptions] = None,
    ignore_invalid_urls: Optional[bool] = None,
    integration: Optional[str] = None,
) -> Dict[str, Any]:
    body: Dict[str, Any] = {}
    if urls is not None:
        body["urls"] = urls
    if prompt is not None:
        body["prompt"] = prompt
    if schema is not None:
        body["schema"] = schema
    if system_prompt is not None:
        body["systemPrompt"] = system_prompt
    if allow_external_links is not None:
        body["allowExternalLinks"] = allow_external_links
    if enable_web_search is not None:
        body["enableWebSearch"] = enable_web_search
    if show_sources is not None:
        body["showSources"] = show_sources
    if ignore_invalid_urls is not None:
        body["ignoreInvalidURLs"] = ignore_invalid_urls
    if scrape_options is not None:
        prepared = prepare_scrape_options(scrape_options)
        if prepared:
            body["scrapeOptions"] = prepared
    if integration is not None and str(integration).strip():
        body["integration"] = str(integration).strip()
    return body


async def start_extract(
    client: AsyncHttpClient,
    urls: Optional[List[str]],
    *,
    prompt: Optional[str] = None,
    schema: Optional[Dict[str, Any]] = None,
    system_prompt: Optional[str] = None,
    allow_external_links: Optional[bool] = None,
    enable_web_search: Optional[bool] = None,
    show_sources: Optional[bool] = None,
    scrape_options: Optional[ScrapeOptions] = None,
    ignore_invalid_urls: Optional[bool] = None,
    integration: Optional[str] = None,
) -> ExtractResponse:
    body = _prepare_extract_request(
        urls,
        prompt=prompt,
        schema=schema,
        system_prompt=system_prompt,
        allow_external_links=allow_external_links,
        enable_web_search=enable_web_search,
        show_sources=show_sources,
        scrape_options=scrape_options,
        ignore_invalid_urls=ignore_invalid_urls,
        integration=integration,
    )
    resp = await client.post("/v2/extract", body)
    return ExtractResponse(**resp.json())


async def get_extract_status(client: AsyncHttpClient, job_id: str) -> ExtractResponse:
    resp = await client.get(f"/v2/extract/{job_id}")
    return ExtractResponse(**resp.json())


async def wait_extract(
    client: AsyncHttpClient,
    job_id: str,
    *,
    poll_interval: int = 2,
    timeout: Optional[int] = None,
) -> ExtractResponse:
    start_ts = asyncio.get_event_loop().time()
    while True:
        status = await get_extract_status(client, job_id)
        if status.status in ("completed", "failed", "cancelled"):
            return status
        if timeout is not None and (asyncio.get_event_loop().time() - start_ts) > timeout:
            return status
        await asyncio.sleep(max(1, poll_interval))


async def extract(
    client: AsyncHttpClient,
    urls: Optional[List[str]],
    *,
    prompt: Optional[str] = None,
    schema: Optional[Dict[str, Any]] = None,
    system_prompt: Optional[str] = None,
    allow_external_links: Optional[bool] = None,
    enable_web_search: Optional[bool] = None,
    show_sources: Optional[bool] = None,
    scrape_options: Optional[ScrapeOptions] = None,
    ignore_invalid_urls: Optional[bool] = None,
    poll_interval: int = 2,
    timeout: Optional[int] = None,
    integration: Optional[str] = None,
) -> ExtractResponse:
    started = await start_extract(
        client,
        urls,
        prompt=prompt,
        schema=schema,
        system_prompt=system_prompt,
        allow_external_links=allow_external_links,
        enable_web_search=enable_web_search,
        show_sources=show_sources,
        scrape_options=scrape_options,
        ignore_invalid_urls=ignore_invalid_urls,
        integration=integration,
    )
    job_id = getattr(started, "id", None)
    if not job_id:
        return started
    return await wait_extract(client, job_id, poll_interval=poll_interval, timeout=timeout)
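Finally, the extract module wraps /v2/extract: start_extract submits the job, wait_extract polls get_extract_status until a terminal status or timeout, and extract chains the two. A sketch of the convenience wrapper, not part of the package, follows; the AsyncHttpClient constructor and the data attribute on ExtractResponse are assumptions inferred from this diff.

import asyncio

from firecrawl.v2.utils.http_client_async import AsyncHttpClient
from firecrawl.v2.methods.aio import extract as aio_extract


async def main() -> None:
    # Assumed constructor arguments; check AsyncHttpClient for the real signature.
    client = AsyncHttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")

    # A plain JSON Schema; _prepare_extract_request forwards it as the "schema" field.
    schema = {
        "type": "object",
        "properties": {"title": {"type": "string"}, "price": {"type": "string"}},
        "required": ["title"],
    }

    # extract() starts the job, then polls every poll_interval seconds until done or timeout.
    result = await aio_extract.extract(
        client,
        ["https://example.com/product"],
        prompt="Extract the product title and price",
        schema=schema,
        poll_interval=2,
        timeout=120,
    )
    print(result.status, result.data)  # .data assumed to hold the extracted object


asyncio.run(main())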