firecrawl 4.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
@@ -0,0 +1,671 @@
1
+ """
2
+ Unit tests for Firecrawl v2 pagination functionality.
3
+ """
4
+
5
+ import pytest
6
+ import time
7
+ from unittest.mock import Mock, patch, AsyncMock
8
+ from typing import Dict, Any, List
9
+
10
+ from firecrawl.v2.types import (
11
+ PaginationConfig,
12
+ CrawlJob,
13
+ BatchScrapeJob,
14
+ Document,
15
+ DocumentMetadata
16
+ )
17
+ from firecrawl.v2.methods.crawl import get_crawl_status, _fetch_all_pages
18
+ from firecrawl.v2.methods.batch import get_batch_scrape_status, _fetch_all_batch_pages
19
+ from firecrawl.v2.methods.aio.crawl import get_crawl_status as get_crawl_status_async, _fetch_all_pages_async
20
+ from firecrawl.v2.methods.aio.batch import get_batch_scrape_status as get_batch_scrape_status_async, _fetch_all_batch_pages_async
21
+
22
+
23
class TestPaginationConfig:
    """Checks on how the PaginationConfig model stores its fields."""

    def test_default_values(self):
        """A config built with no arguments auto-paginates and sets no limits."""
        defaults = PaginationConfig()
        assert defaults.auto_paginate is True
        # None of the three optional limits is populated by default.
        for limit_name in ("max_pages", "max_results", "max_wait_time"):
            assert getattr(defaults, limit_name) is None

    def test_custom_values(self):
        """Explicitly supplied values are read back from the model unchanged."""
        overrides = {
            "auto_paginate": False,
            "max_pages": 5,
            "max_results": 100,
            "max_wait_time": 30,
        }
        custom = PaginationConfig(**overrides)
        assert custom.auto_paginate is False
        assert custom.max_pages == 5
        assert custom.max_results == 100
        assert custom.max_wait_time == 30
46
+
47
+
48
class TestCrawlPagination:
    """Unit tests for synchronous crawl-status pagination.

    Each test drives ``get_crawl_status`` or ``_fetch_all_pages`` against a
    ``Mock`` HTTP client whose ``get`` results stand in for API pages.
    """

    def setup_method(self):
        """Create a fresh mock client, job id and sample document per test."""
        self.mock_client = Mock()
        self.job_id = "test-crawl-123"

        # Sample document data
        self.sample_doc = {
            "url": "https://example.com",
            "markdown": "# Test Content",
            "metadata": {
                "title": "Test Page",
                "statusCode": 200
            }
        }

    @staticmethod
    def _ok_response(payload):
        """Return a mock response with ``ok=True`` whose ``json()`` yields *payload*."""
        response = Mock()
        response.ok = True
        response.json.return_value = payload
        return response

    def _status_payload(self, *, completed, total, credits_used, next_url, doc_count=1):
        """Build a successful "completed" status body holding *doc_count* sample docs."""
        return {
            "success": True,
            "status": "completed",
            "completed": completed,
            "total": total,
            "creditsUsed": credits_used,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": next_url,
            "data": [self.sample_doc] * doc_count,
        }

    def test_get_crawl_status_no_pagination(self):
        """Test get_crawl_status with auto_paginate=False."""
        # The response advertises a next page that must be left untouched.
        next_url = "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
        self.mock_client.get.return_value = self._ok_response(
            self._status_payload(
                completed=10, total=10, credits_used=5, next_url=next_url
            )
        )

        pagination_config = PaginationConfig(auto_paginate=False)
        result = get_crawl_status(self.mock_client, self.job_id, pagination_config)

        assert result.status == "completed"
        assert result.next == next_url
        assert len(result.data) == 1
        assert isinstance(result.data[0], Document)

    def test_get_crawl_status_propagates_request_timeout(self):
        """Ensure request_timeout is forwarded to the HTTP client."""
        from pathlib import PurePath

        self.mock_client.get.return_value = self._ok_response(
            self._status_payload(
                completed=1, total=1, credits_used=1, next_url=None
            )
        )

        timeout_seconds = 5.5
        import firecrawl.v2.methods.crawl as crawl_module

        # Normalise separators first: on Windows ``__file__`` uses backslashes,
        # so a raw ``endswith`` on a forward-slash suffix would fail there.
        module_path = PurePath(crawl_module.__file__).as_posix()
        assert module_path.endswith("firecrawl/v2/methods/crawl.py")
        # request_timeout must be declared as a keyword-only parameter with a default.
        assert crawl_module.get_crawl_status.__kwdefaults__ is not None
        assert "request_timeout" in crawl_module.get_crawl_status.__kwdefaults__
        result = get_crawl_status(
            self.mock_client,
            self.job_id,
            request_timeout=timeout_seconds,
        )

        assert result.status == "completed"
        self.mock_client.get.assert_called_with(
            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
        )

    def test_get_crawl_status_with_pagination(self):
        """Test get_crawl_status with auto_paginate=True."""
        first_page = self._ok_response(
            self._status_payload(
                completed=10,
                total=20,
                credits_used=5,
                next_url="https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
            )
        )
        second_page = self._ok_response(
            self._status_payload(
                completed=20, total=20, credits_used=10, next_url=None
            )
        )
        self.mock_client.get.side_effect = [first_page, second_page]

        pagination_config = PaginationConfig(auto_paginate=True)
        result = get_crawl_status(self.mock_client, self.job_id, pagination_config)

        assert result.status == "completed"
        assert result.next is None  # Should be None when auto_paginate=True
        assert len(result.data) == 2
        assert self.mock_client.get.call_count == 2

    def test_get_crawl_status_max_pages_limit(self):
        """Test get_crawl_status with max_pages limit."""
        # Five pages are available; every page except the last links to the next.
        self.mock_client.get.side_effect = [
            self._ok_response(
                self._status_payload(
                    completed=(i + 1) * 10,
                    total=50,
                    credits_used=(i + 1) * 5,
                    next_url=(
                        f"https://api.firecrawl.dev/v2/crawl/test-crawl-123?page={i+2}"
                        if i < 4
                        else None
                    ),
                )
            )
            for i in range(5)
        ]

        pagination_config = PaginationConfig(auto_paginate=True, max_pages=3)
        result = get_crawl_status(self.mock_client, self.job_id, pagination_config)

        assert len(result.data) == 4  # 1 initial + 3 from pages
        assert self.mock_client.get.call_count == 4  # 1 initial + 3 pagination calls

    def test_get_crawl_status_max_results_limit(self):
        """Test get_crawl_status with max_results limit."""
        first_page = self._ok_response(
            self._status_payload(
                completed=10,
                total=20,
                credits_used=5,
                next_url="https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
                doc_count=3,  # 3 docs
            )
        )
        second_page = self._ok_response(
            self._status_payload(
                completed=20,
                total=20,
                credits_used=10,
                next_url="https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=3",
                doc_count=2,  # 2 more docs
            )
        )
        self.mock_client.get.side_effect = [first_page, second_page]

        pagination_config = PaginationConfig(auto_paginate=True, max_results=4)
        result = get_crawl_status(self.mock_client, self.job_id, pagination_config)

        assert len(result.data) == 4  # Should stop at 4 results
        assert self.mock_client.get.call_count == 2  # Should fetch 2 pages

    def test_get_crawl_status_max_wait_time_limit(self):
        """Test get_crawl_status with max_wait_time limit."""
        self.mock_client.get.return_value = self._ok_response(
            self._status_payload(
                completed=10,
                total=20,
                credits_used=5,
                next_url="https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
            )
        )

        # Test with max_wait_time=1 second
        pagination_config = PaginationConfig(auto_paginate=True, max_wait_time=1)

        # The patched clock reports 0 then 2, i.e. two seconds appear to elapse.
        with patch('firecrawl.v2.methods.crawl.time.monotonic', side_effect=[0, 2]):
            result = get_crawl_status(self.mock_client, self.job_id, pagination_config)

        assert len(result.data) == 1  # Should stop due to timeout
        assert self.mock_client.get.call_count == 1

    def test_fetch_all_pages_error_handling(self):
        """Test _fetch_all_pages with API errors."""
        next_url = "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"

        # First fetch succeeds and links onward; the second is an HTTP 500.
        good_page = self._ok_response({
            "success": True,
            "data": [self.sample_doc],
            "next": next_url
        })
        failed_page = Mock()
        failed_page.ok = False
        failed_page.status_code = 500

        self.mock_client.get.side_effect = [good_page, failed_page]

        # Should continue with what we have
        result = _fetch_all_pages(self.mock_client, next_url, [], None)

        assert len(result) == 1  # Should have the first page data
        assert self.mock_client.get.call_count == 2
283
+
284
+
285
class TestBatchScrapePagination:
    """Pagination behaviour of the synchronous batch-scrape status helpers."""

    def setup_method(self):
        """Give each test its own mock client, job id and sample document."""
        self.job_id = "test-batch-123"
        self.mock_client = Mock()
        doc_metadata = {"title": "Test Page", "statusCode": 200}
        self.sample_doc = {
            "url": "https://example.com",
            "markdown": "# Test Content",
            "metadata": doc_metadata,
        }

    @staticmethod
    def _reply(body):
        """Wrap *body* in a mock response that reports ``ok`` and returns it from ``json()``."""
        reply = Mock()
        reply.ok = True
        reply.json.return_value = body
        return reply

    def test_get_batch_scrape_status_no_pagination(self):
        """With auto_paginate=False the single page and its next link come back as-is."""
        second_page_url = "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2"
        body = {
            "success": True,
            "status": "completed",
            "completed": 10,
            "total": 10,
            "creditsUsed": 5,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": second_page_url,
            "data": [self.sample_doc],
        }
        self.mock_client.get.return_value = self._reply(body)

        status = get_batch_scrape_status(
            self.mock_client,
            self.job_id,
            PaginationConfig(auto_paginate=False),
        )

        assert status.status == "completed"
        assert status.next == second_page_url
        assert len(status.data) == 1
        assert isinstance(status.data[0], Document)

    def test_get_batch_scrape_status_with_pagination(self):
        """With auto_paginate=True both pages are fetched and merged."""
        shared_fields = {
            "success": True,
            "status": "completed",
            "total": 20,
            "expiresAt": "2024-01-01T00:00:00Z",
            "data": [self.sample_doc],
        }
        page_one = dict(
            shared_fields,
            completed=10,
            creditsUsed=5,
            next="https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2",
        )
        page_two = dict(shared_fields, completed=20, creditsUsed=10, next=None)
        self.mock_client.get.side_effect = [
            self._reply(page_one),
            self._reply(page_two),
        ]

        status = get_batch_scrape_status(
            self.mock_client,
            self.job_id,
            PaginationConfig(auto_paginate=True),
        )

        assert status.status == "completed"
        assert status.next is None  # Should be None when auto_paginate=True
        assert len(status.data) == 2
        assert self.mock_client.get.call_count == 2

    def test_fetch_all_batch_pages_limits(self):
        """_fetch_all_batch_pages honours max_pages and max_results together."""
        # Five pages exist, two documents each; only the last has no next link.
        replies = []
        for i in range(5):
            if i < 4:
                following = f"https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page={i+2}"
            else:
                following = None
            replies.append(
                self._reply({
                    "success": True,
                    "data": [self.sample_doc, self.sample_doc],
                    "next": following,
                })
            )
        self.mock_client.get.side_effect = replies

        # max_pages=2 and max_results=4 (the total number of docs wanted)
        limits = PaginationConfig(max_pages=2, max_results=4)
        already_fetched = [Document(**self.sample_doc)]  # 1 initial doc
        collected = _fetch_all_batch_pages(
            self.mock_client,
            "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2",
            already_fetched,
            limits,
        )

        # 1 initial + 3 from pages (capped by max_results=4)
        assert len(collected) == 4
        assert self.mock_client.get.call_count == 2  # Should fetch 2 pages
399
+
400
+
401
class TestAsyncPagination:
    """Unit tests for the async crawl and batch-scrape pagination helpers.

    The HTTP client is an ``AsyncMock``; each awaited ``get`` yields a plain
    ``Mock`` response carrying ``status_code`` and a ``json()`` payload.
    """

    def setup_method(self):
        """Create a fresh async mock client, job id and sample document per test."""
        self.mock_client = AsyncMock()
        self.job_id = "test-async-123"

        # Sample document data
        self.sample_doc = {
            "url": "https://example.com",
            "markdown": "# Test Content",
            "metadata": {
                "title": "Test Page",
                "statusCode": 200
            }
        }

    @staticmethod
    def _http_200(payload):
        """Return a mock response with ``status_code`` 200 whose ``json()`` yields *payload*."""
        response = Mock()
        response.status_code = 200
        response.json.return_value = payload
        return response

    def _status_payload(self, *, completed, total, credits_used, next_url):
        """Build a successful "completed" status body holding one sample document."""
        return {
            "success": True,
            "status": "completed",
            "completed": completed,
            "total": total,
            "creditsUsed": credits_used,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": next_url,
            "data": [self.sample_doc],
        }

    @pytest.mark.asyncio
    async def test_get_crawl_status_async_with_pagination(self):
        """Test async get_crawl_status with pagination."""
        first_page = self._http_200(
            self._status_payload(
                completed=10,
                total=20,
                credits_used=5,
                next_url="https://api.firecrawl.dev/v2/crawl/test-async-123?page=2",
            )
        )
        second_page = self._http_200(
            self._status_payload(
                completed=20, total=20, credits_used=10, next_url=None
            )
        )
        self.mock_client.get.side_effect = [first_page, second_page]

        pagination_config = PaginationConfig(auto_paginate=True)
        result = await get_crawl_status_async(self.mock_client, self.job_id, pagination_config)

        assert result.status == "completed"
        assert result.next is None
        assert len(result.data) == 2
        assert self.mock_client.get.call_count == 2

    @pytest.mark.asyncio
    async def test_get_crawl_status_async_propagates_request_timeout(self):
        """Ensure async request_timeout is forwarded to the HTTP client."""
        from pathlib import PurePath

        self.mock_client.get.return_value = self._http_200(
            self._status_payload(
                completed=1, total=1, credits_used=1, next_url=None
            )
        )

        timeout_seconds = 3.3
        import firecrawl.v2.methods.aio.crawl as crawl_module_async

        # Normalise separators first: on Windows ``__file__`` uses backslashes,
        # so a raw ``endswith`` on a forward-slash suffix would fail there.
        module_path = PurePath(crawl_module_async.__file__).as_posix()
        assert module_path.endswith("firecrawl/v2/methods/aio/crawl.py")
        # request_timeout must be declared as a keyword-only parameter with a default.
        assert crawl_module_async.get_crawl_status.__kwdefaults__ is not None
        assert "request_timeout" in crawl_module_async.get_crawl_status.__kwdefaults__
        result = await get_crawl_status_async(
            self.mock_client,
            self.job_id,
            request_timeout=timeout_seconds,
        )

        assert result.status == "completed"
        self.mock_client.get.assert_awaited_with(
            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
        )

    @pytest.mark.asyncio
    async def test_get_batch_scrape_status_async_with_pagination(self):
        """Test async get_batch_scrape_status with pagination."""
        first_page = self._http_200(
            self._status_payload(
                completed=10,
                total=20,
                credits_used=5,
                next_url="https://api.firecrawl.dev/v2/batch/scrape/test-async-123?page=2",
            )
        )
        second_page = self._http_200(
            self._status_payload(
                completed=20, total=20, credits_used=10, next_url=None
            )
        )
        self.mock_client.get.side_effect = [first_page, second_page]

        pagination_config = PaginationConfig(auto_paginate=True)
        result = await get_batch_scrape_status_async(self.mock_client, self.job_id, pagination_config)

        assert result.status == "completed"
        assert result.next is None
        assert len(result.data) == 2
        assert self.mock_client.get.call_count == 2

    @pytest.mark.asyncio
    async def test_fetch_all_pages_async_limits(self):
        """Test async _fetch_all_pages_async with limits."""
        # Three pages are available; only the last one has no next link.
        self.mock_client.get.side_effect = [
            self._http_200({
                "success": True,
                "data": [self.sample_doc],
                "next": (
                    f"https://api.firecrawl.dev/v2/crawl/test-async-123?page={i+2}"
                    if i < 2
                    else None
                ),
            })
            for i in range(3)
        ]

        # Test with max_pages=2
        pagination_config = PaginationConfig(max_pages=2)
        result = await _fetch_all_pages_async(
            self.mock_client,
            "https://api.firecrawl.dev/v2/crawl/test-async-123?page=2",
            [Document(**self.sample_doc)],  # 1 initial doc
            pagination_config
        )

        assert len(result) == 3  # 1 initial + 2 from pages
        assert self.mock_client.get.call_count == 2
566
+
567
+
568
class TestPaginationEdgeCases:
    """Unusual payloads and failure responses seen while paginating crawl results."""

    def setup_method(self):
        """Prepare a fresh mock client and a minimal sample document."""
        self.sample_doc = {
            "url": "https://example.com",
            "markdown": "# Test Content",
            "metadata": {"title": "Test Page"}
        }
        self.mock_client = Mock()

    @staticmethod
    def _reply(body):
        """Wrap *body* in a mock response that reports ``ok`` and returns it from ``json()``."""
        reply = Mock()
        reply.ok = True
        reply.json.return_value = body
        return reply

    @staticmethod
    def _completed_body(count, credits, items):
        """Build a finished status body with no next link and *items* as its data."""
        return {
            "success": True,
            "status": "completed",
            "completed": count,
            "total": count,
            "creditsUsed": credits,
            "expiresAt": "2024-01-01T00:00:00Z",
            "next": None,
            "data": items,
        }

    def test_pagination_with_empty_data(self):
        """An empty data list yields an empty result with no next link."""
        self.mock_client.get.return_value = self._reply(
            self._completed_body(0, 0, [])
        )

        status = get_crawl_status(
            self.mock_client, "test-123", PaginationConfig(auto_paginate=True)
        )

        assert len(status.data) == 0
        assert status.next is None

    def test_pagination_with_string_data(self):
        """String entries in the data list are skipped; dict entries become Documents."""
        mixed_items = ["https://example.com", self.sample_doc]  # String + dict
        self.mock_client.get.return_value = self._reply(
            self._completed_body(2, 1, mixed_items)
        )

        status = get_crawl_status(
            self.mock_client, "test-123", PaginationConfig(auto_paginate=True)
        )

        assert len(status.data) == 1  # Only the dict should be processed
        assert isinstance(status.data[0], Document)

    def test_pagination_with_failed_response(self):
        """A body with success=False makes get_crawl_status raise with the API error text."""
        failure_body = {"success": False, "error": "Job not found"}
        self.mock_client.get.return_value = self._reply(failure_body)

        config = PaginationConfig(auto_paginate=True)

        with pytest.raises(Exception, match="Job not found"):
            get_crawl_status(self.mock_client, "test-123", config)

    def test_pagination_with_unsuccessful_page(self):
        """When a later page reports success=False, the data gathered so far is kept."""
        page_two_url = "https://api.firecrawl.dev/v2/crawl/test-123?page=2"

        # First fetch succeeds and links onward; the second reports failure.
        good_page = self._reply({
            "success": True,
            "data": [self.sample_doc],
            "next": page_two_url
        })
        bad_page = self._reply({
            "success": False,
            "error": "Page not found"
        })
        self.mock_client.get.side_effect = [good_page, bad_page]

        # Should continue with what we have
        collected = _fetch_all_pages(self.mock_client, page_two_url, [], None)

        assert len(collected) == 1  # Should have the first page data
        assert self.mock_client.get.call_count == 2
668
+
669
+
670
if __name__ == "__main__":
    # pytest.main returns the session's exit code; hand it to the interpreter so
    # running this file directly exits non-zero when any test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))