firecrawl 3.4.0__py3-none-any.whl → 4.1.0__py3-none-any.whl
This diff represents the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of firecrawl might be problematic.
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/e2e/v2/test_crawl.py +1 -1
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +602 -0
- firecrawl/v2/client.py +27 -7
- firecrawl/v2/client_async.py +21 -4
- firecrawl/v2/methods/aio/batch.py +107 -8
- firecrawl/v2/methods/aio/crawl.py +172 -3
- firecrawl/v2/methods/batch.py +90 -5
- firecrawl/v2/methods/crawl.py +95 -6
- firecrawl/v2/types.py +34 -2
- firecrawl/v2/utils/http_client.py +26 -3
- firecrawl/v2/utils/validation.py +15 -1
- {firecrawl-3.4.0.dist-info → firecrawl-4.1.0.dist-info}/METADATA +1 -1
- {firecrawl-3.4.0.dist-info → firecrawl-4.1.0.dist-info}/RECORD +17 -16
- {firecrawl-3.4.0.dist-info → firecrawl-4.1.0.dist-info}/WHEEL +0 -0
- {firecrawl-3.4.0.dist-info → firecrawl-4.1.0.dist-info}/licenses/LICENSE +0 -0
- {firecrawl-3.4.0.dist-info → firecrawl-4.1.0.dist-info}/top_level.txt +0 -0
firecrawl/v2/client.py
CHANGED
@@ -18,6 +18,7 @@ from .types import (
     CrawlResponse,
     CrawlJob,
     CrawlParamsRequest,
+    PDFParser,
     CrawlParamsData,
     WebhookConfig,
     CrawlErrorsResponse,
@@ -35,6 +36,7 @@ from .types import (
     ExecuteJavascriptAction,
     PDFAction,
     Location,
+    PaginationConfig,
 )
 from .utils.http_client import HttpClient
 from .utils.error_handler import FirecrawlError
@@ -104,7 +106,7 @@ class FirecrawlClient:
         timeout: Optional[int] = None,
         wait_for: Optional[int] = None,
         mobile: Optional[bool] = None,
-        parsers: Optional[List[str]] = None,
+        parsers: Optional[Union[List[str], List[Union[str, PDFParser]]]] = None,
         actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
         location: Optional['Location'] = None,
         skip_tls_verification: Optional[bool] = None,
@@ -356,12 +358,17 @@ class FirecrawlClient:
 
         return crawl_module.start_crawl(self.http_client, request)
 
-    def get_crawl_status(
+    def get_crawl_status(
+        self,
+        job_id: str,
+        pagination_config: Optional[PaginationConfig] = None
+    ) -> CrawlJob:
         """
         Get the status of a crawl job.
 
         Args:
             job_id: ID of the crawl job
+            pagination_config: Optional configuration for pagination behavior
 
         Returns:
             CrawlJob with current status and data
@@ -369,7 +376,11 @@ class FirecrawlClient:
         Raises:
             Exception: If the status check fails
         """
-        return crawl_module.get_crawl_status(
+        return crawl_module.get_crawl_status(
+            self.http_client,
+            job_id,
+            pagination_config=pagination_config
+        )
 
     def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
         """
@@ -561,7 +572,7 @@ class FirecrawlClient:
         timeout: Optional[int] = None,
         wait_for: Optional[int] = None,
         mobile: Optional[bool] = None,
-        parsers: Optional[List[str]] = None,
+        parsers: Optional[Union[List[str], List[Union[str, PDFParser]]]] = None,
         actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
         location: Optional['Location'] = None,
         skip_tls_verification: Optional[bool] = None,
@@ -651,16 +662,25 @@ class FirecrawlClient:
             idempotency_key=idempotency_key,
         )
 
-    def get_batch_scrape_status(
+    def get_batch_scrape_status(
+        self,
+        job_id: str,
+        pagination_config: Optional[PaginationConfig] = None
+    ):
        """Get current status and any scraped data for a batch job.
 
        Args:
            job_id: Batch job ID
+           pagination_config: Optional configuration for pagination behavior
 
        Returns:
            Status payload including counts and partial data
        """
-        return batch_module.get_batch_scrape_status(
+        return batch_module.get_batch_scrape_status(
+            self.http_client,
+            job_id,
+            pagination_config=pagination_config
+        )
 
     def cancel_batch_scrape(self, job_id: str) -> bool:
        """Cancel a running batch scrape job.
@@ -740,7 +760,7 @@ class FirecrawlClient:
         timeout: Optional[int] = None,
         wait_for: Optional[int] = None,
         mobile: Optional[bool] = None,
-        parsers: Optional[List[str]] = None,
+        parsers: Optional[Union[List[str], List[Union[str, PDFParser]]]] = None,
         actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
         location: Optional['Location'] = None,
         skip_tls_verification: Optional[bool] = None,
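The hunks above add an optional pagination_config argument to get_crawl_status and get_batch_scrape_status on the synchronous client, and widen the parsers type to accept PDFParser entries. A minimal usage sketch follows. PaginationConfig and its auto_paginate, max_pages, max_results, and max_wait_time fields come from this diff; the import paths and the api_key constructor argument are assumptions.

# Sketch only: import paths and the api_key argument are assumptions.
from firecrawl.v2.client import FirecrawlClient
from firecrawl.v2.types import PaginationConfig

client = FirecrawlClient(api_key="fc-...")  # hypothetical constructor call

# Bound auto-pagination: at most 2 extra pages, 100 documents, or 30 seconds.
limits = PaginationConfig(max_pages=2, max_results=100, max_wait_time=30)
job = client.get_crawl_status("crawl-job-id", pagination_config=limits)
print(job.status, len(job.data or []))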
firecrawl/v2/client_async.py
CHANGED
@@ -31,6 +31,7 @@ from .types import (
     ExecuteJavascriptAction,
     PDFAction,
     Location,
+    PaginationConfig,
 )
 from .utils.http_client import HttpClient
 from .utils.http_client_async import AsyncHttpClient
@@ -94,8 +95,16 @@ class AsyncFirecrawlClient:
         timeout = kwargs.get("timeout")
         return await self.wait_crawl(resp.id, poll_interval=poll_interval, timeout=timeout)
 
-    async def get_crawl_status(
-
+    async def get_crawl_status(
+        self,
+        job_id: str,
+        pagination_config: Optional[PaginationConfig] = None
+    ) -> CrawlJob:
+        return await async_crawl.get_crawl_status(
+            self.async_http_client,
+            job_id,
+            pagination_config=pagination_config
+        )
 
     async def cancel_crawl(self, job_id: str) -> bool:
         return await async_crawl.cancel_crawl(self.async_http_client, job_id)
@@ -154,8 +163,16 @@ class AsyncFirecrawlClient:
         timeout = kwargs.get("timeout")
         return await self.wait_batch_scrape(job_id, poll_interval=poll_interval, timeout=timeout)
 
-    async def get_batch_scrape_status(
-
+    async def get_batch_scrape_status(
+        self,
+        job_id: str,
+        pagination_config: Optional[PaginationConfig] = None
+    ):
+        return await async_batch.get_batch_scrape_status(
+            self.async_http_client,
+            job_id,
+            pagination_config=pagination_config
+        )
 
     async def cancel_batch_scrape(self, job_id: str) -> bool:
         return await async_batch.cancel_batch_scrape(self.async_http_client, job_id)
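The async client gains the same pagination_config parameter. One behavioural detail visible in the methods modules later in this diff: when auto-pagination is left on (the default), the returned job's next field is set to None; passing auto_paginate=False returns only the first page and preserves next for manual paging. A sketch, with the same caveats about import paths and constructor arguments:

# Sketch only: import paths and the api_key argument are assumptions.
import asyncio
from firecrawl.v2.client_async import AsyncFirecrawlClient
from firecrawl.v2.types import PaginationConfig

async def main() -> None:
    client = AsyncFirecrawlClient(api_key="fc-...")  # hypothetical constructor call
    # Fetch only the first page; job.next keeps the cursor URL for manual paging.
    job = await client.get_crawl_status(
        "crawl-job-id",
        pagination_config=PaginationConfig(auto_paginate=False),
    )
    print(job.status, job.next)

asyncio.run(main())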
firecrawl/v2/methods/aio/batch.py
CHANGED
@@ -1,8 +1,10 @@
 from typing import Optional, List, Dict, Any
-from ...types import ScrapeOptions, WebhookConfig, Document, BatchScrapeResponse, BatchScrapeJob
+from ...types import ScrapeOptions, WebhookConfig, Document, BatchScrapeResponse, BatchScrapeJob, PaginationConfig
 from ...utils.http_client_async import AsyncHttpClient
 from ...utils.validation import prepare_scrape_options
 from ...utils.error_handler import handle_response_error
+from ...utils.normalize import normalize_document_input
+import time
 
 
 def _prepare(urls: List[str], *, options: Optional[ScrapeOptions] = None, **kwargs) -> Dict[str, Any]:
@@ -39,7 +41,25 @@ async def start_batch_scrape(client: AsyncHttpClient, urls: List[str], **kwargs)
     return BatchScrapeResponse(id=body.get("id"), url=body.get("url"), invalid_urls=body.get("invalidURLs"))
 
 
-async def get_batch_scrape_status(
+async def get_batch_scrape_status(
+    client: AsyncHttpClient,
+    job_id: str,
+    pagination_config: Optional[PaginationConfig] = None
+) -> BatchScrapeJob:
+    """
+    Get the status of a batch scrape job.
+
+    Args:
+        client: Async HTTP client instance
+        job_id: ID of the batch scrape job
+        pagination_config: Optional configuration for pagination behavior
+
+    Returns:
+        BatchScrapeJob containing job status and data
+
+    Raises:
+        Exception: If the status check fails
+    """
     response = await client.get(f"/v2/batch/scrape/{job_id}")
     if response.status_code >= 400:
         handle_response_error(response, "get batch scrape status")
@@ -49,23 +69,102 @@ async def get_batch_scrape_status(client: AsyncHttpClient, job_id: str) -> Batch
     docs: List[Document] = []
     for doc in body.get("data", []) or []:
         if isinstance(doc, dict):
-            normalized =
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc)
             docs.append(Document(**normalized))
+
+    # Handle pagination if requested
+    auto_paginate = pagination_config.auto_paginate if pagination_config else True
+    if auto_paginate and body.get("next"):
+        docs = await _fetch_all_batch_pages_async(
+            client,
+            body.get("next"),
+            docs,
+            pagination_config
+        )
+
     return BatchScrapeJob(
         status=body.get("status"),
         completed=body.get("completed", 0),
         total=body.get("total", 0),
         credits_used=body.get("creditsUsed"),
         expires_at=body.get("expiresAt"),
-        next=body.get("next"),
+        next=body.get("next") if not auto_paginate else None,
         data=docs,
     )
 
 
+async def _fetch_all_batch_pages_async(
+    client: AsyncHttpClient,
+    next_url: str,
+    initial_documents: List[Document],
+    pagination_config: Optional[PaginationConfig] = None
+) -> List[Document]:
+    """
+    Fetch all pages of batch scrape results asynchronously.
+
+    Args:
+        client: Async HTTP client instance
+        next_url: URL for the next page
+        initial_documents: Documents from the first page
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        List of all documents from all pages
+    """
+    documents = initial_documents.copy()
+    current_url = next_url
+    page_count = 0
+
+    # Apply pagination limits
+    max_pages = pagination_config.max_pages if pagination_config else None
+    max_results = pagination_config.max_results if pagination_config else None
+    max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+    start_time = time.monotonic()
+
+    while current_url:
+        # Check pagination limits
+        if (max_pages is not None) and (page_count >= max_pages):
+            break
+
+        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+            break
+
+        # Fetch next page
+        response = await client.get(current_url)
+
+        if response.status_code >= 400:
+            # Log error but continue with what we have
+            import logging
+            logger = logging.getLogger("firecrawl")
+            logger.warning(f"Failed to fetch next page: {response.status_code}")
+            break
+
+        page_data = response.json()
+
+        if not page_data.get("success"):
+            break
+
+        # Add documents from this page
+        for doc in page_data.get("data", []) or []:
+            if isinstance(doc, dict):
+                # Check max_results limit
+                if (max_results is not None) and (len(documents) >= max_results):
+                    break
+                normalized = normalize_document_input(doc)
+                documents.append(Document(**normalized))
+
+        # Check if we hit max_results limit
+        if (max_results is not None) and (len(documents) >= max_results):
+            break
+
+        # Get next URL
+        current_url = page_data.get("next")
+        page_count += 1
+
+    return documents
+
+
 async def cancel_batch_scrape(client: AsyncHttpClient, job_id: str) -> bool:
     response = await client.delete(f"/v2/batch/scrape/{job_id}")
     if response.status_code >= 400:
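The helper added above follows next cursors until a limit is hit. Distilled into a standalone sketch (follow_pages, fetch_page, and the dict shapes are hypothetical stand-ins, not SDK API): max_pages bounds the number of follow-up pages after the first response, max_results caps the combined item count, max_wait_time bounds elapsed seconds, and an unsuccessful page simply stops the loop.

# Illustrative sketch of the limit semantics used by the new page-following helpers.
import time
from typing import Any, Callable, Dict, List, Optional

def follow_pages(
    fetch_page: Callable[[str], Dict[str, Any]],  # hypothetical stand-in for the HTTP call
    next_url: Optional[str],
    seed: List[dict],
    max_pages: Optional[int] = None,
    max_results: Optional[int] = None,
    max_wait_time: Optional[float] = None,
) -> List[dict]:
    items = list(seed)
    pages = 0
    start = time.monotonic()
    while next_url:
        # max_pages counts extra pages fetched after the first response.
        if max_pages is not None and pages >= max_pages:
            break
        if max_wait_time is not None and time.monotonic() - start > max_wait_time:
            break
        body = fetch_page(next_url)
        if not body.get("success"):
            break
        for item in body.get("data", []) or []:
            if max_results is not None and len(items) >= max_results:
                break
            items.append(item)
        if max_results is not None and len(items) >= max_results:
            break
        next_url = body.get("next")
        pages += 1
    return items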
firecrawl/v2/methods/aio/crawl.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, List
 from ...types import (
     CrawlRequest,
     CrawlJob,
@@ -10,11 +10,13 @@ from ...types import (
     CrawlErrorsResponse,
     ActiveCrawlsResponse,
     ActiveCrawl,
+    PaginationConfig,
 )
 from ...utils.error_handler import handle_response_error
 from ...utils.validation import prepare_scrape_options
 from ...utils.http_client_async import AsyncHttpClient
 from ...utils.normalize import normalize_document_input
+import time
 
 
 def _prepare_crawl_request(request: CrawlRequest) -> dict:
@@ -58,6 +60,20 @@ def _prepare_crawl_request(request: CrawlRequest) -> dict:
 
 
 async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlResponse:
+    """
+    Start a crawl job for a website.
+
+    Args:
+        client: Async HTTP client instance
+        request: CrawlRequest containing URL and options
+
+    Returns:
+        CrawlResponse with job information
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the crawl operation fails to start
+    """
     payload = _prepare_crawl_request(request)
     response = await client.post("/v2/crawl", payload)
     if response.status_code >= 400:
@@ -68,7 +84,25 @@ async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlRe
     raise Exception(body.get("error", "Unknown error occurred"))
 
 
-async def get_crawl_status(
+async def get_crawl_status(
+    client: AsyncHttpClient,
+    job_id: str,
+    pagination_config: Optional[PaginationConfig] = None
+) -> CrawlJob:
+    """
+    Get the status of a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        job_id: ID of the crawl job
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        CrawlJob with job information
+
+    Raises:
+        Exception: If the status check fails
+    """
     response = await client.get(f"/v2/crawl/{job_id}")
     if response.status_code >= 400:
         handle_response_error(response, "get crawl status")
@@ -79,19 +113,115 @@ async def get_crawl_status(client: AsyncHttpClient, job_id: str) -> CrawlJob:
             if isinstance(doc_data, dict):
                 normalized = normalize_document_input(doc_data)
                 documents.append(Document(**normalized))
+
+        # Handle pagination if requested
+        auto_paginate = pagination_config.auto_paginate if pagination_config else True
+        if auto_paginate and body.get("next"):
+            documents = await _fetch_all_pages_async(
+                client,
+                body.get("next"),
+                documents,
+                pagination_config
+            )
+
         return CrawlJob(
             status=body.get("status"),
             completed=body.get("completed", 0),
             total=body.get("total", 0),
             credits_used=body.get("creditsUsed", 0),
             expires_at=body.get("expiresAt"),
-            next=body.get("next"),
+            next=body.get("next") if not auto_paginate else None,
             data=documents,
         )
     raise Exception(body.get("error", "Unknown error occurred"))
 
 
+async def _fetch_all_pages_async(
+    client: AsyncHttpClient,
+    next_url: str,
+    initial_documents: List[Document],
+    pagination_config: Optional[PaginationConfig] = None
+) -> List[Document]:
+    """
+    Fetch all pages of crawl results asynchronously.
+
+    Args:
+        client: Async HTTP client instance
+        next_url: URL for the next page
+        initial_documents: Documents from the first page
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        List of all documents from all pages
+    """
+    documents = initial_documents.copy()
+    current_url = next_url
+    page_count = 0
+
+    # Apply pagination limits
+    max_pages = pagination_config.max_pages if pagination_config else None
+    max_results = pagination_config.max_results if pagination_config else None
+    max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+    start_time = time.monotonic()
+
+    while current_url:
+        # Check pagination limits (treat 0 as a valid limit)
+        if (max_pages is not None) and page_count >= max_pages:
+            break
+
+        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+            break
+
+        # Fetch next page
+        response = await client.get(current_url)
+
+        if response.status_code >= 400:
+            # Log error but continue with what we have
+            import logging
+            logger = logging.getLogger("firecrawl")
+            logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
+            break
+
+        page_data = response.json()
+
+        if not page_data.get("success"):
+            break
+
+        # Add documents from this page
+        for doc_data in page_data.get("data", []):
+            if isinstance(doc_data, dict):
+                # Check max_results limit
+                if (max_results is not None) and (len(documents) >= max_results):
+                    break
+                normalized = normalize_document_input(doc_data)
+                documents.append(Document(**normalized))
+
+        # Check if we hit max_results limit
+        if (max_results is not None) and (len(documents) >= max_results):
+            break
+
+        # Get next URL
+        current_url = page_data.get("next")
+        page_count += 1
+
+    return documents
+
+
 async def cancel_crawl(client: AsyncHttpClient, job_id: str) -> bool:
+    """
+    Cancel a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        job_id: ID of the crawl job
+
+    Returns:
+        True if cancellation was successful
+
+    Raises:
+        Exception: If the cancellation operation fails
+    """
     response = await client.delete(f"/v2/crawl/{job_id}")
     if response.status_code >= 400:
         handle_response_error(response, "cancel crawl")
@@ -100,6 +230,20 @@ async def cancel_crawl(client: AsyncHttpClient, job_id: str) -> bool:
 
 
 async def crawl_params_preview(client: AsyncHttpClient, request: CrawlParamsRequest) -> CrawlParamsData:
+    """
+    Preview crawl parameters before starting a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        request: CrawlParamsRequest containing URL and prompt
+
+    Returns:
+        CrawlParamsData containing crawl configuration
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the parameter preview fails
+    """
     if not request.url or not request.url.strip():
         raise ValueError("URL cannot be empty")
     if not request.prompt or not request.prompt.strip():
@@ -138,6 +282,19 @@ async def crawl_params_preview(client: AsyncHttpClient, request: CrawlParamsRequ
 
 
 async def get_crawl_errors(client: AsyncHttpClient, crawl_id: str) -> CrawlErrorsResponse:
+    """
+    Get errors from a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        crawl_id: ID of the crawl job
+
+    Returns:
+        CrawlErrorsResponse with errors and robots blocked
+
+    Raises:
+        Exception: If the error check operation fails
+    """
     response = await client.get(f"/v2/crawl/{crawl_id}/errors")
     if response.status_code >= 400:
         handle_response_error(response, "check crawl errors")
@@ -151,6 +308,18 @@ async def get_crawl_errors(client: AsyncHttpClient, crawl_id: str) -> CrawlError
 
 
 async def get_active_crawls(client: AsyncHttpClient) -> ActiveCrawlsResponse:
+    """
+    Get active crawl jobs.
+
+    Args:
+        client: Async HTTP client instance
+
+    Returns:
+        ActiveCrawlsResponse with active crawl jobs
+
+    Raises:
+        Exception: If the active crawl jobs operation fails
+    """
     response = await client.get("/v2/crawl/active")
     if response.status_code >= 400:
         handle_response_error(response, "get active crawls")
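Note that in these pagination helpers a failed page fetch does not raise: the loop logs a warning on the "firecrawl" logger and returns whatever documents were collected so far. If silent truncation matters, make sure that logger reaches a handler; a minimal sketch:

# Surface the "Failed to fetch next page" warnings emitted by the pagination helpers.
import logging

logging.basicConfig(level=logging.WARNING)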
firecrawl/v2/methods/batch.py
CHANGED
@@ -11,6 +11,7 @@ from ..types import (
     ScrapeOptions,
     Document,
     WebhookConfig,
+    PaginationConfig,
 )
 from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
 from ..utils.normalize import normalize_document_input
@@ -77,7 +78,8 @@ def start_batch_scrape(
 
 def get_batch_scrape_status(
     client: HttpClient,
-    job_id: str
+    job_id: str,
+    pagination_config: Optional[PaginationConfig] = None
 ) -> BatchScrapeJob:
     """
     Get the status of a batch scrape job.
@@ -85,9 +87,10 @@ def get_batch_scrape_status(
     Args:
         client: HTTP client instance
         job_id: ID of the batch scrape job
+        pagination_config: Optional configuration for pagination behavior
 
     Returns:
-
+        BatchScrapeJob containing job status and data
 
     Raises:
         FirecrawlError: If the status check fails
@@ -111,17 +114,99 @@ def get_batch_scrape_status(
             normalized = normalize_document_input(doc)
             documents.append(Document(**normalized))
 
+    # Handle pagination if requested
+    auto_paginate = pagination_config.auto_paginate if pagination_config else True
+    if auto_paginate and body.get("next"):
+        documents = _fetch_all_batch_pages(
+            client,
+            body.get("next"),
+            documents,
+            pagination_config
+        )
+
     return BatchScrapeJob(
         status=body.get("status"),
         completed=body.get("completed", 0),
         total=body.get("total", 0),
         credits_used=body.get("creditsUsed"),
         expires_at=body.get("expiresAt"),
-        next=body.get("next"),
+        next=body.get("next") if not auto_paginate else None,
         data=documents,
     )
 
 
+def _fetch_all_batch_pages(
+    client: HttpClient,
+    next_url: str,
+    initial_documents: List[Document],
+    pagination_config: Optional[PaginationConfig] = None
+) -> List[Document]:
+    """
+    Fetch all pages of batch scrape results.
+
+    Args:
+        client: HTTP client instance
+        next_url: URL for the next page
+        initial_documents: Documents from the first page
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        List of all documents from all pages
+    """
+    documents = initial_documents.copy()
+    current_url = next_url
+    page_count = 0
+
+    # Apply pagination limits
+    max_pages = pagination_config.max_pages if pagination_config else None
+    max_results = pagination_config.max_results if pagination_config else None
+    max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+    start_time = time.monotonic()
+
+    while current_url:
+        # Check pagination limits (treat 0 as a valid limit)
+        if (max_pages is not None) and page_count >= max_pages:
+            break
+
+        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+            break
+
+        # Fetch next page
+        response = client.get(current_url)
+
+        if not response.ok:
+            # Log error but continue with what we have
+            import logging
+            logger = logging.getLogger("firecrawl")
+            logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
+            break
+
+        page_data = response.json()
+
+        if not page_data.get("success"):
+            break
+
+        # Add documents from this page
+        for doc in page_data.get("data", []) or []:
+            if isinstance(doc, dict):
+                # Check max_results limit
+                if max_results is not None and len(documents) >= max_results:
+                    break
+                normalized = normalize_document_input(doc)
+                documents.append(Document(**normalized))
+
+        # Check if we hit max_results limit after adding all docs from this page
+        if max_results is not None and len(documents) >= max_results:
+            break
+
+        # Get next URL
+        current_url = page_data.get("next")
+        page_count += 1
+
+    return documents
+
+
 def cancel_batch_scrape(
     client: HttpClient,
     job_id: str
@@ -173,7 +258,7 @@ def wait_for_batch_completion(
         FirecrawlError: If the job fails or timeout is reached
         TimeoutError: If timeout is reached
     """
-    start_time = time.
+    start_time = time.monotonic()
 
     while True:
         status_job = get_batch_scrape_status(client, job_id)
@@ -183,7 +268,7 @@ def wait_for_batch_completion(
             return status_job
 
         # Check timeout
-        if timeout and (time.
+        if timeout and (time.monotonic() - start_time) > timeout:
            raise TimeoutError(f"Batch scrape job {job_id} did not complete within {timeout} seconds")
 
         # Wait before next poll
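The last two hunks move wait_for_batch_completion's elapsed-time checks to time.monotonic(). A brief illustration of why that clock suits timeout measurement (general Python behaviour, not SDK-specific): it never jumps when the system wall clock is adjusted, so elapsed differences stay non-negative.

import time

start = time.monotonic()
time.sleep(0.1)
elapsed = time.monotonic() - start  # unaffected by wall-clock adjustments
print(f"elapsed: {elapsed:.3f}s")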