firecrawl 4.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
|
@@ -0,0 +1,671 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unit tests for Firecrawl v2 pagination functionality.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
import time
|
|
7
|
+
from unittest.mock import Mock, patch, AsyncMock
|
|
8
|
+
from typing import Dict, Any, List
|
|
9
|
+
|
|
10
|
+
from firecrawl.v2.types import (
|
|
11
|
+
PaginationConfig,
|
|
12
|
+
CrawlJob,
|
|
13
|
+
BatchScrapeJob,
|
|
14
|
+
Document,
|
|
15
|
+
DocumentMetadata
|
|
16
|
+
)
|
|
17
|
+
from firecrawl.v2.methods.crawl import get_crawl_status, _fetch_all_pages
|
|
18
|
+
from firecrawl.v2.methods.batch import get_batch_scrape_status, _fetch_all_batch_pages
|
|
19
|
+
from firecrawl.v2.methods.aio.crawl import get_crawl_status as get_crawl_status_async, _fetch_all_pages_async
|
|
20
|
+
from firecrawl.v2.methods.aio.batch import get_batch_scrape_status as get_batch_scrape_status_async, _fetch_all_batch_pages_async
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TestPaginationConfig:
    """Tests for the PaginationConfig model's defaults and explicit overrides."""

    def test_default_values(self):
        """A bare PaginationConfig auto-paginates and leaves every limit unset."""
        cfg = PaginationConfig()
        assert cfg.auto_paginate is True
        assert cfg.max_pages is None
        assert cfg.max_results is None
        assert cfg.max_wait_time is None

    def test_custom_values(self):
        """Explicitly supplied limits are stored verbatim on the model."""
        overrides = {
            "auto_paginate": False,
            "max_pages": 5,
            "max_results": 100,
            "max_wait_time": 30,
        }
        cfg = PaginationConfig(**overrides)
        assert cfg.auto_paginate is False
        assert cfg.max_pages == 5
        assert cfg.max_results == 100
        assert cfg.max_wait_time == 30
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class TestCrawlPagination:
    """Test crawl pagination functionality.

    All HTTP traffic is faked through ``self.mock_client``; each test builds
    the exact JSON payload(s) the v2 crawl-status endpoint would return and
    asserts how ``get_crawl_status`` / ``_fetch_all_pages`` aggregate them.
    """

    def setup_method(self):
        """Set up test fixtures."""
        self.mock_client = Mock()
        self.job_id = "test-crawl-123"

        # Sample document data in the raw (camelCase) wire format.
        self.sample_doc = {
            "url": "https://example.com",
            "markdown": "# Test Content",
            "metadata": {
                "title": "Test Page",
                "statusCode": 200
            }
        }

    def test_get_crawl_status_no_pagination(self):
        """Test get_crawl_status with auto_paginate=False.

        The raw ``next`` URL must be preserved so callers can paginate manually.
        """
        # Mock response with next URL
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 10,
            "total": 10,
            "creditsUsed": 5,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
            "data": [self.sample_doc]
        }

        self.mock_client.get.return_value = mock_response

        # Test with auto_paginate=False
        pagination_config = PaginationConfig(auto_paginate=False)
        result = get_crawl_status(self.mock_client, self.job_id, pagination_config)

        assert result.status == "completed"
        assert result.next == "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
        assert len(result.data) == 1
        assert isinstance(result.data[0], Document)

    def test_get_crawl_status_propagates_request_timeout(self):
        """Ensure request_timeout is forwarded to the HTTP client."""
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 1,
            "total": 1,
            "creditsUsed": 1,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": None,
            "data": [self.sample_doc],
        }

        self.mock_client.get.return_value = mock_response

        timeout_seconds = 5.5
        import firecrawl.v2.methods.crawl as crawl_module

        # Guard against a stale/shadowed module: verify we imported the real
        # implementation and that it declares a keyword-only request_timeout.
        assert crawl_module.__file__.endswith("firecrawl/v2/methods/crawl.py")
        assert crawl_module.get_crawl_status.__kwdefaults__ is not None
        assert "request_timeout" in crawl_module.get_crawl_status.__kwdefaults__
        result = get_crawl_status(
            self.mock_client,
            self.job_id,
            request_timeout=timeout_seconds,
        )

        assert result.status == "completed"
        # The timeout must reach the transport layer unchanged.
        self.mock_client.get.assert_called_with(
            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
        )

    def test_get_crawl_status_with_pagination(self):
        """Test get_crawl_status with auto_paginate=True."""
        # Mock first page response
        mock_response1 = Mock()
        mock_response1.ok = True
        mock_response1.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 10,
            "total": 20,
            "creditsUsed": 5,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
            "data": [self.sample_doc]
        }

        # Mock second page response
        mock_response2 = Mock()
        mock_response2.ok = True
        mock_response2.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 20,
            "total": 20,
            "creditsUsed": 10,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": None,
            "data": [self.sample_doc]
        }

        # side_effect ordering matters: page 1 first, then page 2.
        self.mock_client.get.side_effect = [mock_response1, mock_response2]

        # Test with auto_paginate=True
        pagination_config = PaginationConfig(auto_paginate=True)
        result = get_crawl_status(self.mock_client, self.job_id, pagination_config)

        assert result.status == "completed"
        assert result.next is None  # Should be None when auto_paginate=True
        assert len(result.data) == 2
        assert self.mock_client.get.call_count == 2

    def test_get_crawl_status_max_pages_limit(self):
        """Test get_crawl_status with max_pages limit."""
        # Mock responses for multiple pages
        mock_responses = []
        for i in range(5):  # 5 pages available
            mock_response = Mock()
            mock_response.ok = True
            mock_response.json.return_value = {
                "success": True,
                "status": "completed",
                "completed": (i + 1) * 10,
                "total": 50,
                "creditsUsed": (i + 1) * 5,
                "expiresAt": "2024-01-01T00:00:00Z",
                # Last page (i == 4) has no "next" link.
                "next": f"https://api.firecrawl.dev/v2/crawl/test-crawl-123?page={i+2}" if i < 4 else None,
                "data": [self.sample_doc]
            }
            mock_responses.append(mock_response)

        self.mock_client.get.side_effect = mock_responses

        # Test with max_pages=3: pagination must stop early even though
        # more "next" links are available.
        pagination_config = PaginationConfig(auto_paginate=True, max_pages=3)
        result = get_crawl_status(self.mock_client, self.job_id, pagination_config)

        assert len(result.data) == 4  # 1 initial + 3 from pages
        assert self.mock_client.get.call_count == 4  # 1 initial + 3 pagination calls

    def test_get_crawl_status_max_results_limit(self):
        """Test get_crawl_status with max_results limit."""
        # Mock responses with multiple documents per page
        mock_response1 = Mock()
        mock_response1.ok = True
        mock_response1.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 10,
            "total": 20,
            "creditsUsed": 5,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
            "data": [self.sample_doc, self.sample_doc, self.sample_doc]  # 3 docs
        }

        mock_response2 = Mock()
        mock_response2.ok = True
        mock_response2.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 20,
            "total": 20,
            "creditsUsed": 10,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=3",
            "data": [self.sample_doc, self.sample_doc]  # 2 more docs
        }

        self.mock_client.get.side_effect = [mock_response1, mock_response2]

        # Test with max_results=4: must truncate mid-page rather than
        # fetching the advertised page 3.
        pagination_config = PaginationConfig(auto_paginate=True, max_results=4)
        result = get_crawl_status(self.mock_client, self.job_id, pagination_config)

        assert len(result.data) == 4  # Should stop at 4 results
        assert self.mock_client.get.call_count == 2  # Should fetch 2 pages

    def test_get_crawl_status_max_wait_time_limit(self):
        """Test get_crawl_status with max_wait_time limit."""
        # Mock slow response
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 10,
            "total": 20,
            "creditsUsed": 5,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
            "data": [self.sample_doc]
        }

        self.mock_client.get.return_value = mock_response

        # Test with max_wait_time=1 second
        pagination_config = PaginationConfig(auto_paginate=True, max_wait_time=1)

        # Patch the monotonic clock as seen by the crawl module so the
        # elapsed time appears to exceed max_wait_time after the first page.
        with patch('firecrawl.v2.methods.crawl.time.monotonic', side_effect=[0, 2]):  # Simulate 2 seconds elapsed
            result = get_crawl_status(self.mock_client, self.job_id, pagination_config)

        assert len(result.data) == 1  # Should stop due to timeout
        assert self.mock_client.get.call_count == 1

    def test_fetch_all_pages_error_handling(self):
        """Test _fetch_all_pages with API errors.

        A failed (non-ok) page should not raise; pagination stops and the
        documents gathered so far are returned.
        """
        # Mock first page success, second page error
        mock_response1 = Mock()
        mock_response1.ok = True
        mock_response1.json.return_value = {
            "success": True,
            "data": [self.sample_doc],
            "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
        }

        mock_response2 = Mock()
        mock_response2.ok = False
        mock_response2.status_code = 500

        self.mock_client.get.side_effect = [mock_response1, mock_response2]

        # Should continue with what we have
        result = _fetch_all_pages(self.mock_client, "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2", [], None)

        assert len(result) == 1  # Should have the first page data
        assert self.mock_client.get.call_count == 2
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
class TestBatchScrapePagination:
    """Test batch scrape pagination functionality.

    Mirrors the crawl pagination tests but exercises the batch-scrape
    endpoints (``get_batch_scrape_status`` / ``_fetch_all_batch_pages``).
    """

    def setup_method(self):
        """Set up test fixtures."""
        self.mock_client = Mock()
        self.job_id = "test-batch-123"

        # Sample document data in the raw (camelCase) wire format.
        self.sample_doc = {
            "url": "https://example.com",
            "markdown": "# Test Content",
            "metadata": {
                "title": "Test Page",
                "statusCode": 200
            }
        }

    def test_get_batch_scrape_status_no_pagination(self):
        """Test get_batch_scrape_status with auto_paginate=False."""
        # Mock response with next URL
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 10,
            "total": 10,
            "creditsUsed": 5,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2",
            "data": [self.sample_doc]
        }

        self.mock_client.get.return_value = mock_response

        # Test with auto_paginate=False: the "next" URL is surfaced to the caller.
        pagination_config = PaginationConfig(auto_paginate=False)
        result = get_batch_scrape_status(self.mock_client, self.job_id, pagination_config)

        assert result.status == "completed"
        assert result.next == "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2"
        assert len(result.data) == 1
        assert isinstance(result.data[0], Document)

    def test_get_batch_scrape_status_with_pagination(self):
        """Test get_batch_scrape_status with auto_paginate=True."""
        # Mock first page response
        mock_response1 = Mock()
        mock_response1.ok = True
        mock_response1.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 10,
            "total": 20,
            "creditsUsed": 5,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2",
            "data": [self.sample_doc]
        }

        # Mock second page response
        mock_response2 = Mock()
        mock_response2.ok = True
        mock_response2.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 20,
            "total": 20,
            "creditsUsed": 10,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": None,
            "data": [self.sample_doc]
        }

        # side_effect ordering matters: page 1 first, then page 2.
        self.mock_client.get.side_effect = [mock_response1, mock_response2]

        # Test with auto_paginate=True
        pagination_config = PaginationConfig(auto_paginate=True)
        result = get_batch_scrape_status(self.mock_client, self.job_id, pagination_config)

        assert result.status == "completed"
        assert result.next is None  # Should be None when auto_paginate=True
        assert len(result.data) == 2
        assert self.mock_client.get.call_count == 2

    def test_fetch_all_batch_pages_limits(self):
        """Test _fetch_all_batch_pages with various limits.

        max_pages and max_results are combined; whichever limit bites first
        should stop pagination.
        """
        # Mock responses for multiple pages
        mock_responses = []
        for i in range(5):  # 5 pages available
            mock_response = Mock()
            mock_response.ok = True
            mock_response.json.return_value = {
                "success": True,
                "data": [self.sample_doc, self.sample_doc],  # 2 docs per page
                # Last page (i == 4) has no "next" link.
                "next": f"https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page={i+2}" if i < 4 else None
            }
            mock_responses.append(mock_response)

        self.mock_client.get.side_effect = mock_responses

        # Test with max_pages=2, max_results=4 (total docs we want)
        pagination_config = PaginationConfig(max_pages=2, max_results=4)
        result = _fetch_all_batch_pages(
            self.mock_client,
            "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2",
            [Document(**self.sample_doc)],  # 1 initial doc
            pagination_config
        )

        # Should have 1 initial + 3 from pages (limited by max_results=4)
        assert len(result) == 4
        assert self.mock_client.get.call_count == 2  # Should fetch 2 pages
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
class TestAsyncPagination:
    """Test async pagination functionality.

    Uses AsyncMock for the client so ``await client.get(...)`` resolves to the
    prepared responses. NOTE(review): the async HTTP layer appears to signal
    success via ``status_code`` rather than ``ok`` — the mocks set
    ``status_code = 200`` accordingly.
    """

    def setup_method(self):
        """Set up test fixtures."""
        self.mock_client = AsyncMock()
        self.job_id = "test-async-123"

        # Sample document data in the raw (camelCase) wire format.
        self.sample_doc = {
            "url": "https://example.com",
            "markdown": "# Test Content",
            "metadata": {
                "title": "Test Page",
                "statusCode": 200
            }
        }

    @pytest.mark.asyncio
    async def test_get_crawl_status_async_with_pagination(self):
        """Test async get_crawl_status with pagination."""
        # Mock first page response
        mock_response1 = Mock()
        mock_response1.status_code = 200
        mock_response1.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 10,
            "total": 20,
            "creditsUsed": 5,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": "https://api.firecrawl.dev/v2/crawl/test-async-123?page=2",
            "data": [self.sample_doc]
        }

        # Mock second page response
        mock_response2 = Mock()
        mock_response2.status_code = 200
        mock_response2.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 20,
            "total": 20,
            "creditsUsed": 10,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": None,
            "data": [self.sample_doc]
        }

        # side_effect ordering matters: page 1 first, then page 2.
        self.mock_client.get.side_effect = [mock_response1, mock_response2]

        # Test with auto_paginate=True
        pagination_config = PaginationConfig(auto_paginate=True)
        result = await get_crawl_status_async(self.mock_client, self.job_id, pagination_config)

        assert result.status == "completed"
        assert result.next is None
        assert len(result.data) == 2
        assert self.mock_client.get.call_count == 2

    @pytest.mark.asyncio
    async def test_get_crawl_status_async_propagates_request_timeout(self):
        """Ensure async request_timeout is forwarded to the HTTP client."""
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 1,
            "total": 1,
            "creditsUsed": 1,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": None,
            "data": [self.sample_doc],
        }

        self.mock_client.get.return_value = mock_response

        timeout_seconds = 3.3
        import firecrawl.v2.methods.aio.crawl as crawl_module_async

        # Guard against a stale/shadowed module: verify we imported the real
        # async implementation and that it declares keyword-only request_timeout.
        assert crawl_module_async.__file__.endswith("firecrawl/v2/methods/aio/crawl.py")
        assert crawl_module_async.get_crawl_status.__kwdefaults__ is not None
        assert "request_timeout" in crawl_module_async.get_crawl_status.__kwdefaults__
        result = await get_crawl_status_async(
            self.mock_client,
            self.job_id,
            request_timeout=timeout_seconds,
        )

        assert result.status == "completed"
        # The timeout must reach the awaited transport call unchanged.
        self.mock_client.get.assert_awaited_with(
            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
        )

    @pytest.mark.asyncio
    async def test_get_batch_scrape_status_async_with_pagination(self):
        """Test async get_batch_scrape_status with pagination."""
        # Mock first page response
        mock_response1 = Mock()
        mock_response1.status_code = 200
        mock_response1.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 10,
            "total": 20,
            "creditsUsed": 5,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": "https://api.firecrawl.dev/v2/batch/scrape/test-async-123?page=2",
            "data": [self.sample_doc]
        }

        # Mock second page response
        mock_response2 = Mock()
        mock_response2.status_code = 200
        mock_response2.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 20,
            "total": 20,
            "creditsUsed": 10,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": None,
            "data": [self.sample_doc]
        }

        self.mock_client.get.side_effect = [mock_response1, mock_response2]

        # Test with auto_paginate=True
        pagination_config = PaginationConfig(auto_paginate=True)
        result = await get_batch_scrape_status_async(self.mock_client, self.job_id, pagination_config)

        assert result.status == "completed"
        assert result.next is None
        assert len(result.data) == 2
        assert self.mock_client.get.call_count == 2

    @pytest.mark.asyncio
    async def test_fetch_all_pages_async_limits(self):
        """Test async _fetch_all_pages_async with limits."""
        # Mock responses for multiple pages
        mock_responses = []
        for i in range(3):  # 3 pages available
            mock_response = Mock()
            mock_response.status_code = 200
            mock_response.json.return_value = {
                "success": True,
                "data": [self.sample_doc],
                # Last page (i == 2) has no "next" link.
                "next": f"https://api.firecrawl.dev/v2/crawl/test-async-123?page={i+2}" if i < 2 else None
            }
            mock_responses.append(mock_response)

        self.mock_client.get.side_effect = mock_responses

        # Test with max_pages=2: stop before the third available page.
        pagination_config = PaginationConfig(max_pages=2)
        result = await _fetch_all_pages_async(
            self.mock_client,
            "https://api.firecrawl.dev/v2/crawl/test-async-123?page=2",
            [Document(**self.sample_doc)],  # 1 initial doc
            pagination_config
        )

        assert len(result) == 3  # 1 initial + 2 from pages
        assert self.mock_client.get.call_count == 2
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
class TestPaginationEdgeCases:
    """Test pagination edge cases and error conditions."""

    def setup_method(self):
        """Set up test fixtures."""
        self.mock_client = Mock()
        self.sample_doc = {
            "url": "https://example.com",
            "markdown": "# Test Content",
            "metadata": {"title": "Test Page"}
        }

    def test_pagination_with_empty_data(self):
        """Test pagination when API returns empty data."""
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 0,
            "total": 0,
            "creditsUsed": 0,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": None,
            "data": []
        }

        self.mock_client.get.return_value = mock_response

        pagination_config = PaginationConfig(auto_paginate=True)
        result = get_crawl_status(self.mock_client, "test-123", pagination_config)

        # An empty job yields an empty document list, no extra requests.
        assert len(result.data) == 0
        assert result.next is None

    def test_pagination_with_string_data(self):
        """Test pagination when API returns string data (should be skipped)."""
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": True,
            "status": "completed",
            "completed": 2,
            "total": 2,
            "creditsUsed": 1,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": None,
            "data": ["https://example.com", self.sample_doc]  # String + dict
        }

        self.mock_client.get.return_value = mock_response

        pagination_config = PaginationConfig(auto_paginate=True)
        result = get_crawl_status(self.mock_client, "test-123", pagination_config)

        # Non-dict entries are silently dropped during Document conversion.
        assert len(result.data) == 1  # Only the dict should be processed
        assert isinstance(result.data[0], Document)

    def test_pagination_with_failed_response(self):
        """Test pagination when API response indicates failure."""
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": False,
            "error": "Job not found"
        }

        self.mock_client.get.return_value = mock_response

        pagination_config = PaginationConfig(auto_paginate=True)

        # A success=False body on the first request is a hard error,
        # surfaced with the API's own error message.
        with pytest.raises(Exception, match="Job not found"):
            get_crawl_status(self.mock_client, "test-123", pagination_config)

    def test_pagination_with_unsuccessful_page(self):
        """Test pagination when a subsequent page is unsuccessful.

        Unlike a failed first request, a failed *follow-up* page is tolerated:
        pagination stops and already-fetched documents are kept.
        """
        # Mock first page success
        mock_response1 = Mock()
        mock_response1.ok = True
        mock_response1.json.return_value = {
            "success": True,
            "data": [self.sample_doc],
            "next": "https://api.firecrawl.dev/v2/crawl/test-123?page=2"
        }

        # Mock second page failure
        mock_response2 = Mock()
        mock_response2.ok = True
        mock_response2.json.return_value = {
            "success": False,
            "error": "Page not found"
        }

        self.mock_client.get.side_effect = [mock_response1, mock_response2]

        # Should continue with what we have
        result = _fetch_all_pages(self.mock_client, "https://api.firecrawl.dev/v2/crawl/test-123?page=2", [], None)

        assert len(result) == 1  # Should have the first page data
        assert self.mock_client.get.call_count == 2
|
|
668
|
+
|
|
669
|
+
|
|
670
|
+
if __name__ == "__main__":
    # Allow running this module directly: delegate to pytest in verbose mode.
    pytest.main([__file__, "-v"])
|