firecrawl 3.3.3__py3-none-any.whl → 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic. Click here for more details.
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/e2e/v2/test_crawl.py +1 -1
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +602 -0
- firecrawl/v2/client.py +23 -4
- firecrawl/v2/client_async.py +21 -4
- firecrawl/v2/methods/aio/batch.py +107 -8
- firecrawl/v2/methods/aio/crawl.py +172 -3
- firecrawl/v2/methods/batch.py +90 -5
- firecrawl/v2/methods/crawl.py +95 -6
- firecrawl/v2/types.py +7 -0
- firecrawl/v2/utils/http_client.py +26 -3
- {firecrawl-3.3.3.dist-info → firecrawl-4.0.0.dist-info}/METADATA +1 -1
- {firecrawl-3.3.3.dist-info → firecrawl-4.0.0.dist-info}/RECORD +16 -15
- {firecrawl-3.3.3.dist-info → firecrawl-4.0.0.dist-info}/WHEEL +0 -0
- {firecrawl-3.3.3.dist-info → firecrawl-4.0.0.dist-info}/licenses/LICENSE +0 -0
- {firecrawl-3.3.3.dist-info → firecrawl-4.0.0.dist-info}/top_level.txt +0 -0
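firecrawl/__init__.py
CHANGED
[The diff body for this file is missing from this extract; the file list above records +1 -1.]
firecrawl/__tests__/e2e/v2/test_crawl.py
CHANGED
|
@@ -63,7 +63,7 @@ class TestCrawlE2E: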
|
|
|
63
63
|
assert status_job.status in ["scraping", "completed", "failed"]
|
|
64
64
|
assert status_job.completed >= 0
|
|
65
65
|
assert status_job.expires_at is not None
|
|
66
|
-
assert status_job.next is not None
|
|
66
|
+
assert status_job.next is None
|
|
67
67
|
assert isinstance(status_job.data, list)
|
|
68
68
|
|
|
69
69
|
def test_cancel_crawl(self):
|
|
firecrawl/__tests__/unit/v2/methods/test_pagination.py
ADDED
|
@@ -0,0 +1,602 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unit tests for Firecrawl v2 pagination functionality.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
import time
|
|
7
|
+
from unittest.mock import Mock, patch, AsyncMock
|
|
8
|
+
from typing import Dict, Any, List
|
|
9
|
+
|
|
10
|
+
from firecrawl.v2.types import (
|
|
11
|
+
PaginationConfig,
|
|
12
|
+
CrawlJob,
|
|
13
|
+
BatchScrapeJob,
|
|
14
|
+
Document,
|
|
15
|
+
DocumentMetadata
|
|
16
|
+
)
|
|
17
|
+
from firecrawl.v2.methods.crawl import get_crawl_status, _fetch_all_pages
|
|
18
|
+
from firecrawl.v2.methods.batch import get_batch_scrape_status, _fetch_all_batch_pages
|
|
19
|
+
from firecrawl.v2.methods.aio.crawl import get_crawl_status as get_crawl_status_async, _fetch_all_pages_async
|
|
20
|
+
from firecrawl.v2.methods.aio.batch import get_batch_scrape_status as get_batch_scrape_status_async, _fetch_all_batch_pages_async
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TestPaginationConfig:
|
|
24
|
+
"""Test PaginationConfig model."""
|
|
25
|
+
|
|
26
|
+
def test_default_values(self):
|
|
27
|
+
"""Test default values for PaginationConfig."""
|
|
28
|
+
config = PaginationConfig()
|
|
29
|
+
assert config.auto_paginate is True
|
|
30
|
+
assert config.max_pages is None
|
|
31
|
+
assert config.max_results is None
|
|
32
|
+
assert config.max_wait_time is None
|
|
33
|
+
|
|
34
|
+
def test_custom_values(self):
|
|
35
|
+
"""Test custom values for PaginationConfig."""
|
|
36
|
+
config = PaginationConfig(
|
|
37
|
+
auto_paginate=False,
|
|
38
|
+
max_pages=5,
|
|
39
|
+
max_results=100,
|
|
40
|
+
max_wait_time=30
|
|
41
|
+
)
|
|
42
|
+
assert config.auto_paginate is False
|
|
43
|
+
assert config.max_pages == 5
|
|
44
|
+
assert config.max_results == 100
|
|
45
|
+
assert config.max_wait_time == 30
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class TestCrawlPagination:
|
|
49
|
+
"""Test crawl pagination functionality."""
|
|
50
|
+
|
|
51
|
+
def setup_method(self):
|
|
52
|
+
"""Set up test fixtures."""
|
|
53
|
+
self.mock_client = Mock()
|
|
54
|
+
self.job_id = "test-crawl-123"
|
|
55
|
+
|
|
56
|
+
# Sample document data
|
|
57
|
+
self.sample_doc = {
|
|
58
|
+
"url": "https://example.com",
|
|
59
|
+
"markdown": "# Test Content",
|
|
60
|
+
"metadata": {
|
|
61
|
+
"title": "Test Page",
|
|
62
|
+
"statusCode": 200
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
def test_get_crawl_status_no_pagination(self):
|
|
67
|
+
"""Test get_crawl_status with auto_paginate=False."""
|
|
68
|
+
# Mock response with next URL
|
|
69
|
+
mock_response = Mock()
|
|
70
|
+
mock_response.ok = True
|
|
71
|
+
mock_response.json.return_value = {
|
|
72
|
+
"success": True,
|
|
73
|
+
"status": "completed",
|
|
74
|
+
"completed": 10,
|
|
75
|
+
"total": 10,
|
|
76
|
+
"creditsUsed": 5,
|
|
77
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
78
|
+
"next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
|
|
79
|
+
"data": [self.sample_doc]
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
self.mock_client.get.return_value = mock_response
|
|
83
|
+
|
|
84
|
+
# Test with auto_paginate=False
|
|
85
|
+
pagination_config = PaginationConfig(auto_paginate=False)
|
|
86
|
+
result = get_crawl_status(self.mock_client, self.job_id, pagination_config)
|
|
87
|
+
|
|
88
|
+
assert result.status == "completed"
|
|
89
|
+
assert result.next == "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
|
|
90
|
+
assert len(result.data) == 1
|
|
91
|
+
assert isinstance(result.data[0], Document)
|
|
92
|
+
|
|
93
|
+
def test_get_crawl_status_with_pagination(self):
|
|
94
|
+
"""Test get_crawl_status with auto_paginate=True."""
|
|
95
|
+
# Mock first page response
|
|
96
|
+
mock_response1 = Mock()
|
|
97
|
+
mock_response1.ok = True
|
|
98
|
+
mock_response1.json.return_value = {
|
|
99
|
+
"success": True,
|
|
100
|
+
"status": "completed",
|
|
101
|
+
"completed": 10,
|
|
102
|
+
"total": 20,
|
|
103
|
+
"creditsUsed": 5,
|
|
104
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
105
|
+
"next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
|
|
106
|
+
"data": [self.sample_doc]
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
# Mock second page response
|
|
110
|
+
mock_response2 = Mock()
|
|
111
|
+
mock_response2.ok = True
|
|
112
|
+
mock_response2.json.return_value = {
|
|
113
|
+
"success": True,
|
|
114
|
+
"status": "completed",
|
|
115
|
+
"completed": 20,
|
|
116
|
+
"total": 20,
|
|
117
|
+
"creditsUsed": 10,
|
|
118
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
119
|
+
"next": None,
|
|
120
|
+
"data": [self.sample_doc]
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
self.mock_client.get.side_effect = [mock_response1, mock_response2]
|
|
124
|
+
|
|
125
|
+
# Test with auto_paginate=True
|
|
126
|
+
pagination_config = PaginationConfig(auto_paginate=True)
|
|
127
|
+
result = get_crawl_status(self.mock_client, self.job_id, pagination_config)
|
|
128
|
+
|
|
129
|
+
assert result.status == "completed"
|
|
130
|
+
assert result.next is None # Should be None when auto_paginate=True
|
|
131
|
+
assert len(result.data) == 2
|
|
132
|
+
assert self.mock_client.get.call_count == 2
|
|
133
|
+
|
|
134
|
+
def test_get_crawl_status_max_pages_limit(self):
|
|
135
|
+
"""Test get_crawl_status with max_pages limit."""
|
|
136
|
+
# Mock responses for multiple pages
|
|
137
|
+
mock_responses = []
|
|
138
|
+
for i in range(5): # 5 pages available
|
|
139
|
+
mock_response = Mock()
|
|
140
|
+
mock_response.ok = True
|
|
141
|
+
mock_response.json.return_value = {
|
|
142
|
+
"success": True,
|
|
143
|
+
"status": "completed",
|
|
144
|
+
"completed": (i + 1) * 10,
|
|
145
|
+
"total": 50,
|
|
146
|
+
"creditsUsed": (i + 1) * 5,
|
|
147
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
148
|
+
"next": f"https://api.firecrawl.dev/v2/crawl/test-crawl-123?page={i+2}" if i < 4 else None,
|
|
149
|
+
"data": [self.sample_doc]
|
|
150
|
+
}
|
|
151
|
+
mock_responses.append(mock_response)
|
|
152
|
+
|
|
153
|
+
self.mock_client.get.side_effect = mock_responses
|
|
154
|
+
|
|
155
|
+
# Test with max_pages=3
|
|
156
|
+
pagination_config = PaginationConfig(auto_paginate=True, max_pages=3)
|
|
157
|
+
result = get_crawl_status(self.mock_client, self.job_id, pagination_config)
|
|
158
|
+
|
|
159
|
+
assert len(result.data) == 4 # 1 initial + 3 from pages
|
|
160
|
+
assert self.mock_client.get.call_count == 4 # 1 initial + 3 pagination calls
|
|
161
|
+
|
|
162
|
+
def test_get_crawl_status_max_results_limit(self):
|
|
163
|
+
"""Test get_crawl_status with max_results limit."""
|
|
164
|
+
# Mock responses with multiple documents per page
|
|
165
|
+
mock_response1 = Mock()
|
|
166
|
+
mock_response1.ok = True
|
|
167
|
+
mock_response1.json.return_value = {
|
|
168
|
+
"success": True,
|
|
169
|
+
"status": "completed",
|
|
170
|
+
"completed": 10,
|
|
171
|
+
"total": 20,
|
|
172
|
+
"creditsUsed": 5,
|
|
173
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
174
|
+
"next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
|
|
175
|
+
"data": [self.sample_doc, self.sample_doc, self.sample_doc] # 3 docs
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
mock_response2 = Mock()
|
|
179
|
+
mock_response2.ok = True
|
|
180
|
+
mock_response2.json.return_value = {
|
|
181
|
+
"success": True,
|
|
182
|
+
"status": "completed",
|
|
183
|
+
"completed": 20,
|
|
184
|
+
"total": 20,
|
|
185
|
+
"creditsUsed": 10,
|
|
186
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
187
|
+
"next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=3",
|
|
188
|
+
"data": [self.sample_doc, self.sample_doc] # 2 more docs
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
self.mock_client.get.side_effect = [mock_response1, mock_response2]
|
|
192
|
+
|
|
193
|
+
# Test with max_results=4
|
|
194
|
+
pagination_config = PaginationConfig(auto_paginate=True, max_results=4)
|
|
195
|
+
result = get_crawl_status(self.mock_client, self.job_id, pagination_config)
|
|
196
|
+
|
|
197
|
+
assert len(result.data) == 4 # Should stop at 4 results
|
|
198
|
+
assert self.mock_client.get.call_count == 2 # Should fetch 2 pages
|
|
199
|
+
|
|
200
|
+
def test_get_crawl_status_max_wait_time_limit(self):
|
|
201
|
+
"""Test get_crawl_status with max_wait_time limit."""
|
|
202
|
+
# Mock slow response
|
|
203
|
+
mock_response = Mock()
|
|
204
|
+
mock_response.ok = True
|
|
205
|
+
mock_response.json.return_value = {
|
|
206
|
+
"success": True,
|
|
207
|
+
"status": "completed",
|
|
208
|
+
"completed": 10,
|
|
209
|
+
"total": 20,
|
|
210
|
+
"creditsUsed": 5,
|
|
211
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
212
|
+
"next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
|
|
213
|
+
"data": [self.sample_doc]
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
self.mock_client.get.return_value = mock_response
|
|
217
|
+
|
|
218
|
+
# Test with max_wait_time=1 second
|
|
219
|
+
pagination_config = PaginationConfig(auto_paginate=True, max_wait_time=1)
|
|
220
|
+
|
|
221
|
+
with patch('firecrawl.v2.methods.crawl.time.monotonic', side_effect=[0, 2]): # Simulate 2 seconds elapsed
|
|
222
|
+
result = get_crawl_status(self.mock_client, self.job_id, pagination_config)
|
|
223
|
+
|
|
224
|
+
assert len(result.data) == 1 # Should stop due to timeout
|
|
225
|
+
assert self.mock_client.get.call_count == 1
|
|
226
|
+
|
|
227
|
+
def test_fetch_all_pages_error_handling(self):
|
|
228
|
+
"""Test _fetch_all_pages with API errors."""
|
|
229
|
+
# Mock first page success, second page error
|
|
230
|
+
mock_response1 = Mock()
|
|
231
|
+
mock_response1.ok = True
|
|
232
|
+
mock_response1.json.return_value = {
|
|
233
|
+
"success": True,
|
|
234
|
+
"data": [self.sample_doc],
|
|
235
|
+
"next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
mock_response2 = Mock()
|
|
239
|
+
mock_response2.ok = False
|
|
240
|
+
mock_response2.status_code = 500
|
|
241
|
+
|
|
242
|
+
self.mock_client.get.side_effect = [mock_response1, mock_response2]
|
|
243
|
+
|
|
244
|
+
# Should continue with what we have
|
|
245
|
+
result = _fetch_all_pages(self.mock_client, "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2", [], None)
|
|
246
|
+
|
|
247
|
+
assert len(result) == 1 # Should have the first page data
|
|
248
|
+
assert self.mock_client.get.call_count == 2
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
class TestBatchScrapePagination:
|
|
252
|
+
"""Test batch scrape pagination functionality."""
|
|
253
|
+
|
|
254
|
+
def setup_method(self):
|
|
255
|
+
"""Set up test fixtures."""
|
|
256
|
+
self.mock_client = Mock()
|
|
257
|
+
self.job_id = "test-batch-123"
|
|
258
|
+
|
|
259
|
+
# Sample document data
|
|
260
|
+
self.sample_doc = {
|
|
261
|
+
"url": "https://example.com",
|
|
262
|
+
"markdown": "# Test Content",
|
|
263
|
+
"metadata": {
|
|
264
|
+
"title": "Test Page",
|
|
265
|
+
"statusCode": 200
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
def test_get_batch_scrape_status_no_pagination(self):
|
|
270
|
+
"""Test get_batch_scrape_status with auto_paginate=False."""
|
|
271
|
+
# Mock response with next URL
|
|
272
|
+
mock_response = Mock()
|
|
273
|
+
mock_response.ok = True
|
|
274
|
+
mock_response.json.return_value = {
|
|
275
|
+
"success": True,
|
|
276
|
+
"status": "completed",
|
|
277
|
+
"completed": 10,
|
|
278
|
+
"total": 10,
|
|
279
|
+
"creditsUsed": 5,
|
|
280
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
281
|
+
"next": "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2",
|
|
282
|
+
"data": [self.sample_doc]
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
self.mock_client.get.return_value = mock_response
|
|
286
|
+
|
|
287
|
+
# Test with auto_paginate=False
|
|
288
|
+
pagination_config = PaginationConfig(auto_paginate=False)
|
|
289
|
+
result = get_batch_scrape_status(self.mock_client, self.job_id, pagination_config)
|
|
290
|
+
|
|
291
|
+
assert result.status == "completed"
|
|
292
|
+
assert result.next == "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2"
|
|
293
|
+
assert len(result.data) == 1
|
|
294
|
+
assert isinstance(result.data[0], Document)
|
|
295
|
+
|
|
296
|
+
def test_get_batch_scrape_status_with_pagination(self):
|
|
297
|
+
"""Test get_batch_scrape_status with auto_paginate=True."""
|
|
298
|
+
# Mock first page response
|
|
299
|
+
mock_response1 = Mock()
|
|
300
|
+
mock_response1.ok = True
|
|
301
|
+
mock_response1.json.return_value = {
|
|
302
|
+
"success": True,
|
|
303
|
+
"status": "completed",
|
|
304
|
+
"completed": 10,
|
|
305
|
+
"total": 20,
|
|
306
|
+
"creditsUsed": 5,
|
|
307
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
308
|
+
"next": "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2",
|
|
309
|
+
"data": [self.sample_doc]
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
# Mock second page response
|
|
313
|
+
mock_response2 = Mock()
|
|
314
|
+
mock_response2.ok = True
|
|
315
|
+
mock_response2.json.return_value = {
|
|
316
|
+
"success": True,
|
|
317
|
+
"status": "completed",
|
|
318
|
+
"completed": 20,
|
|
319
|
+
"total": 20,
|
|
320
|
+
"creditsUsed": 10,
|
|
321
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
322
|
+
"next": None,
|
|
323
|
+
"data": [self.sample_doc]
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
self.mock_client.get.side_effect = [mock_response1, mock_response2]
|
|
327
|
+
|
|
328
|
+
# Test with auto_paginate=True
|
|
329
|
+
pagination_config = PaginationConfig(auto_paginate=True)
|
|
330
|
+
result = get_batch_scrape_status(self.mock_client, self.job_id, pagination_config)
|
|
331
|
+
|
|
332
|
+
assert result.status == "completed"
|
|
333
|
+
assert result.next is None # Should be None when auto_paginate=True
|
|
334
|
+
assert len(result.data) == 2
|
|
335
|
+
assert self.mock_client.get.call_count == 2
|
|
336
|
+
|
|
337
|
+
def test_fetch_all_batch_pages_limits(self):
|
|
338
|
+
"""Test _fetch_all_batch_pages with various limits."""
|
|
339
|
+
# Mock responses for multiple pages
|
|
340
|
+
mock_responses = []
|
|
341
|
+
for i in range(5): # 5 pages available
|
|
342
|
+
mock_response = Mock()
|
|
343
|
+
mock_response.ok = True
|
|
344
|
+
mock_response.json.return_value = {
|
|
345
|
+
"success": True,
|
|
346
|
+
"data": [self.sample_doc, self.sample_doc], # 2 docs per page
|
|
347
|
+
"next": f"https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page={i+2}" if i < 4 else None
|
|
348
|
+
}
|
|
349
|
+
mock_responses.append(mock_response)
|
|
350
|
+
|
|
351
|
+
self.mock_client.get.side_effect = mock_responses
|
|
352
|
+
|
|
353
|
+
# Test with max_pages=2, max_results=4 (total docs we want)
|
|
354
|
+
pagination_config = PaginationConfig(max_pages=2, max_results=4)
|
|
355
|
+
result = _fetch_all_batch_pages(
|
|
356
|
+
self.mock_client,
|
|
357
|
+
"https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2",
|
|
358
|
+
[Document(**self.sample_doc)], # 1 initial doc
|
|
359
|
+
pagination_config
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
# Should have 1 initial + 3 from pages (limited by max_results=4)
|
|
363
|
+
assert len(result) == 4
|
|
364
|
+
assert self.mock_client.get.call_count == 2 # Should fetch 2 pages
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
class TestAsyncPagination:
|
|
368
|
+
"""Test async pagination functionality."""
|
|
369
|
+
|
|
370
|
+
def setup_method(self):
|
|
371
|
+
"""Set up test fixtures."""
|
|
372
|
+
self.mock_client = AsyncMock()
|
|
373
|
+
self.job_id = "test-async-123"
|
|
374
|
+
|
|
375
|
+
# Sample document data
|
|
376
|
+
self.sample_doc = {
|
|
377
|
+
"url": "https://example.com",
|
|
378
|
+
"markdown": "# Test Content",
|
|
379
|
+
"metadata": {
|
|
380
|
+
"title": "Test Page",
|
|
381
|
+
"statusCode": 200
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
@pytest.mark.asyncio
|
|
386
|
+
async def test_get_crawl_status_async_with_pagination(self):
|
|
387
|
+
"""Test async get_crawl_status with pagination."""
|
|
388
|
+
# Mock first page response
|
|
389
|
+
mock_response1 = Mock()
|
|
390
|
+
mock_response1.status_code = 200
|
|
391
|
+
mock_response1.json.return_value = {
|
|
392
|
+
"success": True,
|
|
393
|
+
"status": "completed",
|
|
394
|
+
"completed": 10,
|
|
395
|
+
"total": 20,
|
|
396
|
+
"creditsUsed": 5,
|
|
397
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
398
|
+
"next": "https://api.firecrawl.dev/v2/crawl/test-async-123?page=2",
|
|
399
|
+
"data": [self.sample_doc]
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
# Mock second page response
|
|
403
|
+
mock_response2 = Mock()
|
|
404
|
+
mock_response2.status_code = 200
|
|
405
|
+
mock_response2.json.return_value = {
|
|
406
|
+
"success": True,
|
|
407
|
+
"status": "completed",
|
|
408
|
+
"completed": 20,
|
|
409
|
+
"total": 20,
|
|
410
|
+
"creditsUsed": 10,
|
|
411
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
412
|
+
"next": None,
|
|
413
|
+
"data": [self.sample_doc]
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
self.mock_client.get.side_effect = [mock_response1, mock_response2]
|
|
417
|
+
|
|
418
|
+
# Test with auto_paginate=True
|
|
419
|
+
pagination_config = PaginationConfig(auto_paginate=True)
|
|
420
|
+
result = await get_crawl_status_async(self.mock_client, self.job_id, pagination_config)
|
|
421
|
+
|
|
422
|
+
assert result.status == "completed"
|
|
423
|
+
assert result.next is None
|
|
424
|
+
assert len(result.data) == 2
|
|
425
|
+
assert self.mock_client.get.call_count == 2
|
|
426
|
+
|
|
427
|
+
@pytest.mark.asyncio
|
|
428
|
+
async def test_get_batch_scrape_status_async_with_pagination(self):
|
|
429
|
+
"""Test async get_batch_scrape_status with pagination."""
|
|
430
|
+
# Mock first page response
|
|
431
|
+
mock_response1 = Mock()
|
|
432
|
+
mock_response1.status_code = 200
|
|
433
|
+
mock_response1.json.return_value = {
|
|
434
|
+
"success": True,
|
|
435
|
+
"status": "completed",
|
|
436
|
+
"completed": 10,
|
|
437
|
+
"total": 20,
|
|
438
|
+
"creditsUsed": 5,
|
|
439
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
440
|
+
"next": "https://api.firecrawl.dev/v2/batch/scrape/test-async-123?page=2",
|
|
441
|
+
"data": [self.sample_doc]
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
# Mock second page response
|
|
445
|
+
mock_response2 = Mock()
|
|
446
|
+
mock_response2.status_code = 200
|
|
447
|
+
mock_response2.json.return_value = {
|
|
448
|
+
"success": True,
|
|
449
|
+
"status": "completed",
|
|
450
|
+
"completed": 20,
|
|
451
|
+
"total": 20,
|
|
452
|
+
"creditsUsed": 10,
|
|
453
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
454
|
+
"next": None,
|
|
455
|
+
"data": [self.sample_doc]
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
self.mock_client.get.side_effect = [mock_response1, mock_response2]
|
|
459
|
+
|
|
460
|
+
# Test with auto_paginate=True
|
|
461
|
+
pagination_config = PaginationConfig(auto_paginate=True)
|
|
462
|
+
result = await get_batch_scrape_status_async(self.mock_client, self.job_id, pagination_config)
|
|
463
|
+
|
|
464
|
+
assert result.status == "completed"
|
|
465
|
+
assert result.next is None
|
|
466
|
+
assert len(result.data) == 2
|
|
467
|
+
assert self.mock_client.get.call_count == 2
|
|
468
|
+
|
|
469
|
+
@pytest.mark.asyncio
|
|
470
|
+
async def test_fetch_all_pages_async_limits(self):
|
|
471
|
+
"""Test async _fetch_all_pages_async with limits."""
|
|
472
|
+
# Mock responses for multiple pages
|
|
473
|
+
mock_responses = []
|
|
474
|
+
for i in range(3): # 3 pages available
|
|
475
|
+
mock_response = Mock()
|
|
476
|
+
mock_response.status_code = 200
|
|
477
|
+
mock_response.json.return_value = {
|
|
478
|
+
"success": True,
|
|
479
|
+
"data": [self.sample_doc],
|
|
480
|
+
"next": f"https://api.firecrawl.dev/v2/crawl/test-async-123?page={i+2}" if i < 2 else None
|
|
481
|
+
}
|
|
482
|
+
mock_responses.append(mock_response)
|
|
483
|
+
|
|
484
|
+
self.mock_client.get.side_effect = mock_responses
|
|
485
|
+
|
|
486
|
+
# Test with max_pages=2
|
|
487
|
+
pagination_config = PaginationConfig(max_pages=2)
|
|
488
|
+
result = await _fetch_all_pages_async(
|
|
489
|
+
self.mock_client,
|
|
490
|
+
"https://api.firecrawl.dev/v2/crawl/test-async-123?page=2",
|
|
491
|
+
[Document(**self.sample_doc)], # 1 initial doc
|
|
492
|
+
pagination_config
|
|
493
|
+
)
|
|
494
|
+
|
|
495
|
+
assert len(result) == 3 # 1 initial + 2 from pages
|
|
496
|
+
assert self.mock_client.get.call_count == 2
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
class TestPaginationEdgeCases:
|
|
500
|
+
"""Test pagination edge cases and error conditions."""
|
|
501
|
+
|
|
502
|
+
def setup_method(self):
|
|
503
|
+
"""Set up test fixtures."""
|
|
504
|
+
self.mock_client = Mock()
|
|
505
|
+
self.sample_doc = {
|
|
506
|
+
"url": "https://example.com",
|
|
507
|
+
"markdown": "# Test Content",
|
|
508
|
+
"metadata": {"title": "Test Page"}
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
def test_pagination_with_empty_data(self):
|
|
512
|
+
"""Test pagination when API returns empty data."""
|
|
513
|
+
mock_response = Mock()
|
|
514
|
+
mock_response.ok = True
|
|
515
|
+
mock_response.json.return_value = {
|
|
516
|
+
"success": True,
|
|
517
|
+
"status": "completed",
|
|
518
|
+
"completed": 0,
|
|
519
|
+
"total": 0,
|
|
520
|
+
"creditsUsed": 0,
|
|
521
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
522
|
+
"next": None,
|
|
523
|
+
"data": []
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
self.mock_client.get.return_value = mock_response
|
|
527
|
+
|
|
528
|
+
pagination_config = PaginationConfig(auto_paginate=True)
|
|
529
|
+
result = get_crawl_status(self.mock_client, "test-123", pagination_config)
|
|
530
|
+
|
|
531
|
+
assert len(result.data) == 0
|
|
532
|
+
assert result.next is None
|
|
533
|
+
|
|
534
|
+
def test_pagination_with_string_data(self):
|
|
535
|
+
"""Test pagination when API returns string data (should be skipped)."""
|
|
536
|
+
mock_response = Mock()
|
|
537
|
+
mock_response.ok = True
|
|
538
|
+
mock_response.json.return_value = {
|
|
539
|
+
"success": True,
|
|
540
|
+
"status": "completed",
|
|
541
|
+
"completed": 2,
|
|
542
|
+
"total": 2,
|
|
543
|
+
"creditsUsed": 1,
|
|
544
|
+
"expiresAt": "2024-01-01T00:00:00Z",
|
|
545
|
+
"next": None,
|
|
546
|
+
"data": ["https://example.com", self.sample_doc] # String + dict
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
self.mock_client.get.return_value = mock_response
|
|
550
|
+
|
|
551
|
+
pagination_config = PaginationConfig(auto_paginate=True)
|
|
552
|
+
result = get_crawl_status(self.mock_client, "test-123", pagination_config)
|
|
553
|
+
|
|
554
|
+
assert len(result.data) == 1 # Only the dict should be processed
|
|
555
|
+
assert isinstance(result.data[0], Document)
|
|
556
|
+
|
|
557
|
+
def test_pagination_with_failed_response(self):
|
|
558
|
+
"""Test pagination when API response indicates failure."""
|
|
559
|
+
mock_response = Mock()
|
|
560
|
+
mock_response.ok = True
|
|
561
|
+
mock_response.json.return_value = {
|
|
562
|
+
"success": False,
|
|
563
|
+
"error": "Job not found"
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
self.mock_client.get.return_value = mock_response
|
|
567
|
+
|
|
568
|
+
pagination_config = PaginationConfig(auto_paginate=True)
|
|
569
|
+
|
|
570
|
+
with pytest.raises(Exception, match="Job not found"):
|
|
571
|
+
get_crawl_status(self.mock_client, "test-123", pagination_config)
|
|
572
|
+
|
|
573
|
+
def test_pagination_with_unsuccessful_page(self):
|
|
574
|
+
"""Test pagination when a subsequent page is unsuccessful."""
|
|
575
|
+
# Mock first page success
|
|
576
|
+
mock_response1 = Mock()
|
|
577
|
+
mock_response1.ok = True
|
|
578
|
+
mock_response1.json.return_value = {
|
|
579
|
+
"success": True,
|
|
580
|
+
"data": [self.sample_doc],
|
|
581
|
+
"next": "https://api.firecrawl.dev/v2/crawl/test-123?page=2"
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
# Mock second page failure
|
|
585
|
+
mock_response2 = Mock()
|
|
586
|
+
mock_response2.ok = True
|
|
587
|
+
mock_response2.json.return_value = {
|
|
588
|
+
"success": False,
|
|
589
|
+
"error": "Page not found"
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
self.mock_client.get.side_effect = [mock_response1, mock_response2]
|
|
593
|
+
|
|
594
|
+
# Should continue with what we have
|
|
595
|
+
result = _fetch_all_pages(self.mock_client, "https://api.firecrawl.dev/v2/crawl/test-123?page=2", [], None)
|
|
596
|
+
|
|
597
|
+
assert len(result) == 1 # Should have the first page data
|
|
598
|
+
assert self.mock_client.get.call_count == 2
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
if __name__ == "__main__":
|
|
602
|
+
pytest.main([__file__, "-v"])
|
firecrawl/v2/client.py
CHANGED
|
@@ -35,6 +35,7 @@ from .types import (
|
|
|
35
35
|
ExecuteJavascriptAction,
|
|
36
36
|
PDFAction,
|
|
37
37
|
Location,
|
|
38
|
+
PaginationConfig,
|
|
38
39
|
)
|
|
39
40
|
from .utils.http_client import HttpClient
|
|
40
41
|
from .utils.error_handler import FirecrawlError
|
|
@@ -356,12 +357,17 @@ class FirecrawlClient:
|
|
|
356
357
|
|
|
357
358
|
return crawl_module.start_crawl(self.http_client, request)
|
|
358
359
|
|
|
359
|
-
def get_crawl_status(self, job_id: str) -> CrawlJob:
|
|
360
|
+
def get_crawl_status(
|
|
361
|
+
self,
|
|
362
|
+
job_id: str,
|
|
363
|
+
pagination_config: Optional[PaginationConfig] = None
|
|
364
|
+
) -> CrawlJob:
|
|
360
365
|
"""
|
|
361
366
|
Get the status of a crawl job.
|
|
362
367
|
|
|
363
368
|
Args:
|
|
364
369
|
job_id: ID of the crawl job
|
|
370
|
+
pagination_config: Optional configuration for pagination behavior
|
|
365
371
|
|
|
366
372
|
Returns:
|
|
367
373
|
CrawlJob with current status and data
|
|
@@ -369,7 +375,11 @@ class FirecrawlClient:
|
|
|
369
375
|
Raises:
|
|
370
376
|
Exception: If the status check fails
|
|
371
377
|
"""
|
|
372
|
-
return crawl_module.get_crawl_status(self.http_client, job_id)
|
|
378
|
+
return crawl_module.get_crawl_status(
|
|
379
|
+
self.http_client,
|
|
380
|
+
job_id,
|
|
381
|
+
pagination_config=pagination_config
|
|
382
|
+
)
|
|
373
383
|
|
|
374
384
|
def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
|
|
375
385
|
"""
|
|
@@ -651,16 +661,25 @@ class FirecrawlClient:
|
|
|
651
661
|
idempotency_key=idempotency_key,
|
|
652
662
|
)
|
|
653
663
|
|
|
654
|
-
def get_batch_scrape_status(self, job_id: str):
|
|
664
|
+
def get_batch_scrape_status(
|
|
665
|
+
self,
|
|
666
|
+
job_id: str,
|
|
667
|
+
pagination_config: Optional[PaginationConfig] = None
|
|
668
|
+
):
|
|
655
669
|
"""Get current status and any scraped data for a batch job.
|
|
656
670
|
|
|
657
671
|
Args:
|
|
658
672
|
job_id: Batch job ID
|
|
673
|
+
pagination_config: Optional configuration for pagination behavior
|
|
659
674
|
|
|
660
675
|
Returns:
|
|
661
676
|
Status payload including counts and partial data
|
|
662
677
|
"""
|
|
663
|
-
return batch_module.get_batch_scrape_status(self.http_client, job_id)
|
|
678
|
+
return batch_module.get_batch_scrape_status(
|
|
679
|
+
self.http_client,
|
|
680
|
+
job_id,
|
|
681
|
+
pagination_config=pagination_config
|
|
682
|
+
)
|
|
664
683
|
|
|
665
684
|
def cancel_batch_scrape(self, job_id: str) -> bool:
|
|
666
685
|
"""Cancel a running batch scrape job.
|