firecrawl 3.3.3__tar.gz → 4.0.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl might be problematic. Click here for more details.

Files changed (87)
  1. {firecrawl-3.3.3 → firecrawl-4.0.0}/PKG-INFO +1 -1
  2. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__init__.py +1 -1
  3. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/test_crawl.py +1 -1
  4. firecrawl-4.0.0/firecrawl/__tests__/unit/v2/methods/test_pagination.py +602 -0
  5. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/client.py +23 -4
  6. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/client_async.py +21 -4
  7. firecrawl-4.0.0/firecrawl/v2/methods/aio/batch.py +184 -0
  8. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/aio/crawl.py +172 -3
  9. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/batch.py +90 -5
  10. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/crawl.py +95 -6
  11. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/types.py +7 -0
  12. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/utils/http_client.py +26 -3
  13. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl.egg-info/PKG-INFO +1 -1
  14. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl.egg-info/SOURCES.txt +1 -0
  15. firecrawl-3.3.3/firecrawl/v2/methods/aio/batch.py +0 -85
  16. {firecrawl-3.3.3 → firecrawl-4.0.0}/LICENSE +0 -0
  17. {firecrawl-3.3.3 → firecrawl-4.0.0}/README.md +0 -0
  18. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -0
  19. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -0
  20. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -0
  21. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -0
  22. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -0
  23. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -0
  24. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
  25. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
  26. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
  27. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
  28. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -0
  29. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/test_extract.py +0 -0
  30. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/test_map.py +0 -0
  31. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -0
  32. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/test_search.py +0 -0
  33. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
  34. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
  35. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
  36. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -0
  37. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
  38. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -0
  39. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
  40. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -0
  41. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
  42. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
  43. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -0
  44. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
  45. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
  46. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
  47. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -0
  48. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -0
  49. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -0
  50. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -0
  51. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
  52. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
  53. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
  54. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
  55. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/client.py +0 -0
  56. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/firecrawl.backup.py +0 -0
  57. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/types.py +0 -0
  58. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v1/__init__.py +0 -0
  59. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v1/client.py +0 -0
  60. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/__init__.py +0 -0
  61. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/aio/__init__.py +0 -0
  62. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/aio/extract.py +0 -0
  63. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/aio/map.py +0 -0
  64. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/aio/scrape.py +0 -0
  65. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/aio/search.py +0 -0
  66. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/aio/usage.py +0 -0
  67. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/extract.py +0 -0
  68. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/map.py +0 -0
  69. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/scrape.py +0 -0
  70. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/search.py +0 -0
  71. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/methods/usage.py +0 -0
  72. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/utils/__init__.py +0 -0
  73. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/utils/error_handler.py +0 -0
  74. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/utils/get_version.py +0 -0
  75. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/utils/http_client_async.py +0 -0
  76. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/utils/normalize.py +0 -0
  77. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/utils/validation.py +0 -0
  78. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/watcher.py +0 -0
  79. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl/v2/watcher_async.py +0 -0
  80. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl.egg-info/dependency_links.txt +0 -0
  81. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl.egg-info/requires.txt +0 -0
  82. {firecrawl-3.3.3 → firecrawl-4.0.0}/firecrawl.egg-info/top_level.txt +0 -0
  83. {firecrawl-3.3.3 → firecrawl-4.0.0}/pyproject.toml +0 -0
  84. {firecrawl-3.3.3 → firecrawl-4.0.0}/setup.cfg +0 -0
  85. {firecrawl-3.3.3 → firecrawl-4.0.0}/setup.py +0 -0
  86. {firecrawl-3.3.3 → firecrawl-4.0.0}/tests/test_change_tracking.py +0 -0
  87. {firecrawl-3.3.3 → firecrawl-4.0.0}/tests/test_timeout_conversion.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: firecrawl
3
- Version: 3.3.3
3
+ Version: 4.0.0
4
4
  Summary: Python SDK for Firecrawl API
5
5
  Home-page: https://github.com/firecrawl/firecrawl
6
6
  Author: Mendable.ai
@@ -17,7 +17,7 @@ from .v1 import (
17
17
  V1ChangeTrackingOptions,
18
18
  )
19
19
 
20
- __version__ = "3.3.3"
20
+ __version__ = "4.0.0"
21
21
 
22
22
  # Define the logger for the Firecrawl project
23
23
  logger: logging.Logger = logging.getLogger("firecrawl")
@@ -63,7 +63,7 @@ class TestCrawlE2E:
63
63
  assert status_job.status in ["scraping", "completed", "failed"]
64
64
  assert status_job.completed >= 0
65
65
  assert status_job.expires_at is not None
66
- assert status_job.next is not None
66
+ assert status_job.next is None
67
67
  assert isinstance(status_job.data, list)
68
68
 
69
69
  def test_cancel_crawl(self):
@@ -0,0 +1,602 @@
1
+ """
2
+ Unit tests for Firecrawl v2 pagination functionality.
3
+ """
4
+
5
+ import pytest
6
+ import time
7
+ from unittest.mock import Mock, patch, AsyncMock
8
+ from typing import Dict, Any, List
9
+
10
+ from firecrawl.v2.types import (
11
+ PaginationConfig,
12
+ CrawlJob,
13
+ BatchScrapeJob,
14
+ Document,
15
+ DocumentMetadata
16
+ )
17
+ from firecrawl.v2.methods.crawl import get_crawl_status, _fetch_all_pages
18
+ from firecrawl.v2.methods.batch import get_batch_scrape_status, _fetch_all_batch_pages
19
+ from firecrawl.v2.methods.aio.crawl import get_crawl_status as get_crawl_status_async, _fetch_all_pages_async
20
+ from firecrawl.v2.methods.aio.batch import get_batch_scrape_status as get_batch_scrape_status_async, _fetch_all_batch_pages_async
21
+
22
+
23
+ class TestPaginationConfig:
24
+ """Test PaginationConfig model."""
25
+
26
+ def test_default_values(self):
27
+ """Test default values for PaginationConfig."""
28
+ config = PaginationConfig()
29
+ assert config.auto_paginate is True
30
+ assert config.max_pages is None
31
+ assert config.max_results is None
32
+ assert config.max_wait_time is None
33
+
34
+ def test_custom_values(self):
35
+ """Test custom values for PaginationConfig."""
36
+ config = PaginationConfig(
37
+ auto_paginate=False,
38
+ max_pages=5,
39
+ max_results=100,
40
+ max_wait_time=30
41
+ )
42
+ assert config.auto_paginate is False
43
+ assert config.max_pages == 5
44
+ assert config.max_results == 100
45
+ assert config.max_wait_time == 30
46
+
47
+
48
+ class TestCrawlPagination:
49
+ """Test crawl pagination functionality."""
50
+
51
+ def setup_method(self):
52
+ """Set up test fixtures."""
53
+ self.mock_client = Mock()
54
+ self.job_id = "test-crawl-123"
55
+
56
+ # Sample document data
57
+ self.sample_doc = {
58
+ "url": "https://example.com",
59
+ "markdown": "# Test Content",
60
+ "metadata": {
61
+ "title": "Test Page",
62
+ "statusCode": 200
63
+ }
64
+ }
65
+
66
+ def test_get_crawl_status_no_pagination(self):
67
+ """Test get_crawl_status with auto_paginate=False."""
68
+ # Mock response with next URL
69
+ mock_response = Mock()
70
+ mock_response.ok = True
71
+ mock_response.json.return_value = {
72
+ "success": True,
73
+ "status": "completed",
74
+ "completed": 10,
75
+ "total": 10,
76
+ "creditsUsed": 5,
77
+ "expiresAt": "2024-01-01T00:00:00Z",
78
+ "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
79
+ "data": [self.sample_doc]
80
+ }
81
+
82
+ self.mock_client.get.return_value = mock_response
83
+
84
+ # Test with auto_paginate=False
85
+ pagination_config = PaginationConfig(auto_paginate=False)
86
+ result = get_crawl_status(self.mock_client, self.job_id, pagination_config)
87
+
88
+ assert result.status == "completed"
89
+ assert result.next == "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
90
+ assert len(result.data) == 1
91
+ assert isinstance(result.data[0], Document)
92
+
93
+ def test_get_crawl_status_with_pagination(self):
94
+ """Test get_crawl_status with auto_paginate=True."""
95
+ # Mock first page response
96
+ mock_response1 = Mock()
97
+ mock_response1.ok = True
98
+ mock_response1.json.return_value = {
99
+ "success": True,
100
+ "status": "completed",
101
+ "completed": 10,
102
+ "total": 20,
103
+ "creditsUsed": 5,
104
+ "expiresAt": "2024-01-01T00:00:00Z",
105
+ "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
106
+ "data": [self.sample_doc]
107
+ }
108
+
109
+ # Mock second page response
110
+ mock_response2 = Mock()
111
+ mock_response2.ok = True
112
+ mock_response2.json.return_value = {
113
+ "success": True,
114
+ "status": "completed",
115
+ "completed": 20,
116
+ "total": 20,
117
+ "creditsUsed": 10,
118
+ "expiresAt": "2024-01-01T00:00:00Z",
119
+ "next": None,
120
+ "data": [self.sample_doc]
121
+ }
122
+
123
+ self.mock_client.get.side_effect = [mock_response1, mock_response2]
124
+
125
+ # Test with auto_paginate=True
126
+ pagination_config = PaginationConfig(auto_paginate=True)
127
+ result = get_crawl_status(self.mock_client, self.job_id, pagination_config)
128
+
129
+ assert result.status == "completed"
130
+ assert result.next is None # Should be None when auto_paginate=True
131
+ assert len(result.data) == 2
132
+ assert self.mock_client.get.call_count == 2
133
+
134
+ def test_get_crawl_status_max_pages_limit(self):
135
+ """Test get_crawl_status with max_pages limit."""
136
+ # Mock responses for multiple pages
137
+ mock_responses = []
138
+ for i in range(5): # 5 pages available
139
+ mock_response = Mock()
140
+ mock_response.ok = True
141
+ mock_response.json.return_value = {
142
+ "success": True,
143
+ "status": "completed",
144
+ "completed": (i + 1) * 10,
145
+ "total": 50,
146
+ "creditsUsed": (i + 1) * 5,
147
+ "expiresAt": "2024-01-01T00:00:00Z",
148
+ "next": f"https://api.firecrawl.dev/v2/crawl/test-crawl-123?page={i+2}" if i < 4 else None,
149
+ "data": [self.sample_doc]
150
+ }
151
+ mock_responses.append(mock_response)
152
+
153
+ self.mock_client.get.side_effect = mock_responses
154
+
155
+ # Test with max_pages=3
156
+ pagination_config = PaginationConfig(auto_paginate=True, max_pages=3)
157
+ result = get_crawl_status(self.mock_client, self.job_id, pagination_config)
158
+
159
+ assert len(result.data) == 4 # 1 initial + 3 from pages
160
+ assert self.mock_client.get.call_count == 4 # 1 initial + 3 pagination calls
161
+
162
+ def test_get_crawl_status_max_results_limit(self):
163
+ """Test get_crawl_status with max_results limit."""
164
+ # Mock responses with multiple documents per page
165
+ mock_response1 = Mock()
166
+ mock_response1.ok = True
167
+ mock_response1.json.return_value = {
168
+ "success": True,
169
+ "status": "completed",
170
+ "completed": 10,
171
+ "total": 20,
172
+ "creditsUsed": 5,
173
+ "expiresAt": "2024-01-01T00:00:00Z",
174
+ "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
175
+ "data": [self.sample_doc, self.sample_doc, self.sample_doc] # 3 docs
176
+ }
177
+
178
+ mock_response2 = Mock()
179
+ mock_response2.ok = True
180
+ mock_response2.json.return_value = {
181
+ "success": True,
182
+ "status": "completed",
183
+ "completed": 20,
184
+ "total": 20,
185
+ "creditsUsed": 10,
186
+ "expiresAt": "2024-01-01T00:00:00Z",
187
+ "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=3",
188
+ "data": [self.sample_doc, self.sample_doc] # 2 more docs
189
+ }
190
+
191
+ self.mock_client.get.side_effect = [mock_response1, mock_response2]
192
+
193
+ # Test with max_results=4
194
+ pagination_config = PaginationConfig(auto_paginate=True, max_results=4)
195
+ result = get_crawl_status(self.mock_client, self.job_id, pagination_config)
196
+
197
+ assert len(result.data) == 4 # Should stop at 4 results
198
+ assert self.mock_client.get.call_count == 2 # Should fetch 2 pages
199
+
200
+ def test_get_crawl_status_max_wait_time_limit(self):
201
+ """Test get_crawl_status with max_wait_time limit."""
202
+ # Mock slow response
203
+ mock_response = Mock()
204
+ mock_response.ok = True
205
+ mock_response.json.return_value = {
206
+ "success": True,
207
+ "status": "completed",
208
+ "completed": 10,
209
+ "total": 20,
210
+ "creditsUsed": 5,
211
+ "expiresAt": "2024-01-01T00:00:00Z",
212
+ "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2",
213
+ "data": [self.sample_doc]
214
+ }
215
+
216
+ self.mock_client.get.return_value = mock_response
217
+
218
+ # Test with max_wait_time=1 second
219
+ pagination_config = PaginationConfig(auto_paginate=True, max_wait_time=1)
220
+
221
+ with patch('firecrawl.v2.methods.crawl.time.monotonic', side_effect=[0, 2]): # Simulate 2 seconds elapsed
222
+ result = get_crawl_status(self.mock_client, self.job_id, pagination_config)
223
+
224
+ assert len(result.data) == 1 # Should stop due to timeout
225
+ assert self.mock_client.get.call_count == 1
226
+
227
+ def test_fetch_all_pages_error_handling(self):
228
+ """Test _fetch_all_pages with API errors."""
229
+ # Mock first page success, second page error
230
+ mock_response1 = Mock()
231
+ mock_response1.ok = True
232
+ mock_response1.json.return_value = {
233
+ "success": True,
234
+ "data": [self.sample_doc],
235
+ "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
236
+ }
237
+
238
+ mock_response2 = Mock()
239
+ mock_response2.ok = False
240
+ mock_response2.status_code = 500
241
+
242
+ self.mock_client.get.side_effect = [mock_response1, mock_response2]
243
+
244
+ # Should continue with what we have
245
+ result = _fetch_all_pages(self.mock_client, "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2", [], None)
246
+
247
+ assert len(result) == 1 # Should have the first page data
248
+ assert self.mock_client.get.call_count == 2
249
+
250
+
251
+ class TestBatchScrapePagination:
252
+ """Test batch scrape pagination functionality."""
253
+
254
+ def setup_method(self):
255
+ """Set up test fixtures."""
256
+ self.mock_client = Mock()
257
+ self.job_id = "test-batch-123"
258
+
259
+ # Sample document data
260
+ self.sample_doc = {
261
+ "url": "https://example.com",
262
+ "markdown": "# Test Content",
263
+ "metadata": {
264
+ "title": "Test Page",
265
+ "statusCode": 200
266
+ }
267
+ }
268
+
269
+ def test_get_batch_scrape_status_no_pagination(self):
270
+ """Test get_batch_scrape_status with auto_paginate=False."""
271
+ # Mock response with next URL
272
+ mock_response = Mock()
273
+ mock_response.ok = True
274
+ mock_response.json.return_value = {
275
+ "success": True,
276
+ "status": "completed",
277
+ "completed": 10,
278
+ "total": 10,
279
+ "creditsUsed": 5,
280
+ "expiresAt": "2024-01-01T00:00:00Z",
281
+ "next": "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2",
282
+ "data": [self.sample_doc]
283
+ }
284
+
285
+ self.mock_client.get.return_value = mock_response
286
+
287
+ # Test with auto_paginate=False
288
+ pagination_config = PaginationConfig(auto_paginate=False)
289
+ result = get_batch_scrape_status(self.mock_client, self.job_id, pagination_config)
290
+
291
+ assert result.status == "completed"
292
+ assert result.next == "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2"
293
+ assert len(result.data) == 1
294
+ assert isinstance(result.data[0], Document)
295
+
296
+ def test_get_batch_scrape_status_with_pagination(self):
297
+ """Test get_batch_scrape_status with auto_paginate=True."""
298
+ # Mock first page response
299
+ mock_response1 = Mock()
300
+ mock_response1.ok = True
301
+ mock_response1.json.return_value = {
302
+ "success": True,
303
+ "status": "completed",
304
+ "completed": 10,
305
+ "total": 20,
306
+ "creditsUsed": 5,
307
+ "expiresAt": "2024-01-01T00:00:00Z",
308
+ "next": "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2",
309
+ "data": [self.sample_doc]
310
+ }
311
+
312
+ # Mock second page response
313
+ mock_response2 = Mock()
314
+ mock_response2.ok = True
315
+ mock_response2.json.return_value = {
316
+ "success": True,
317
+ "status": "completed",
318
+ "completed": 20,
319
+ "total": 20,
320
+ "creditsUsed": 10,
321
+ "expiresAt": "2024-01-01T00:00:00Z",
322
+ "next": None,
323
+ "data": [self.sample_doc]
324
+ }
325
+
326
+ self.mock_client.get.side_effect = [mock_response1, mock_response2]
327
+
328
+ # Test with auto_paginate=True
329
+ pagination_config = PaginationConfig(auto_paginate=True)
330
+ result = get_batch_scrape_status(self.mock_client, self.job_id, pagination_config)
331
+
332
+ assert result.status == "completed"
333
+ assert result.next is None # Should be None when auto_paginate=True
334
+ assert len(result.data) == 2
335
+ assert self.mock_client.get.call_count == 2
336
+
337
+ def test_fetch_all_batch_pages_limits(self):
338
+ """Test _fetch_all_batch_pages with various limits."""
339
+ # Mock responses for multiple pages
340
+ mock_responses = []
341
+ for i in range(5): # 5 pages available
342
+ mock_response = Mock()
343
+ mock_response.ok = True
344
+ mock_response.json.return_value = {
345
+ "success": True,
346
+ "data": [self.sample_doc, self.sample_doc], # 2 docs per page
347
+ "next": f"https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page={i+2}" if i < 4 else None
348
+ }
349
+ mock_responses.append(mock_response)
350
+
351
+ self.mock_client.get.side_effect = mock_responses
352
+
353
+ # Test with max_pages=2, max_results=4 (total docs we want)
354
+ pagination_config = PaginationConfig(max_pages=2, max_results=4)
355
+ result = _fetch_all_batch_pages(
356
+ self.mock_client,
357
+ "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2",
358
+ [Document(**self.sample_doc)], # 1 initial doc
359
+ pagination_config
360
+ )
361
+
362
+ # Should have 1 initial + 3 from pages (limited by max_results=4)
363
+ assert len(result) == 4
364
+ assert self.mock_client.get.call_count == 2 # Should fetch 2 pages
365
+
366
+
367
+ class TestAsyncPagination:
368
+ """Test async pagination functionality."""
369
+
370
+ def setup_method(self):
371
+ """Set up test fixtures."""
372
+ self.mock_client = AsyncMock()
373
+ self.job_id = "test-async-123"
374
+
375
+ # Sample document data
376
+ self.sample_doc = {
377
+ "url": "https://example.com",
378
+ "markdown": "# Test Content",
379
+ "metadata": {
380
+ "title": "Test Page",
381
+ "statusCode": 200
382
+ }
383
+ }
384
+
385
+ @pytest.mark.asyncio
386
+ async def test_get_crawl_status_async_with_pagination(self):
387
+ """Test async get_crawl_status with pagination."""
388
+ # Mock first page response
389
+ mock_response1 = Mock()
390
+ mock_response1.status_code = 200
391
+ mock_response1.json.return_value = {
392
+ "success": True,
393
+ "status": "completed",
394
+ "completed": 10,
395
+ "total": 20,
396
+ "creditsUsed": 5,
397
+ "expiresAt": "2024-01-01T00:00:00Z",
398
+ "next": "https://api.firecrawl.dev/v2/crawl/test-async-123?page=2",
399
+ "data": [self.sample_doc]
400
+ }
401
+
402
+ # Mock second page response
403
+ mock_response2 = Mock()
404
+ mock_response2.status_code = 200
405
+ mock_response2.json.return_value = {
406
+ "success": True,
407
+ "status": "completed",
408
+ "completed": 20,
409
+ "total": 20,
410
+ "creditsUsed": 10,
411
+ "expiresAt": "2024-01-01T00:00:00Z",
412
+ "next": None,
413
+ "data": [self.sample_doc]
414
+ }
415
+
416
+ self.mock_client.get.side_effect = [mock_response1, mock_response2]
417
+
418
+ # Test with auto_paginate=True
419
+ pagination_config = PaginationConfig(auto_paginate=True)
420
+ result = await get_crawl_status_async(self.mock_client, self.job_id, pagination_config)
421
+
422
+ assert result.status == "completed"
423
+ assert result.next is None
424
+ assert len(result.data) == 2
425
+ assert self.mock_client.get.call_count == 2
426
+
427
+ @pytest.mark.asyncio
428
+ async def test_get_batch_scrape_status_async_with_pagination(self):
429
+ """Test async get_batch_scrape_status with pagination."""
430
+ # Mock first page response
431
+ mock_response1 = Mock()
432
+ mock_response1.status_code = 200
433
+ mock_response1.json.return_value = {
434
+ "success": True,
435
+ "status": "completed",
436
+ "completed": 10,
437
+ "total": 20,
438
+ "creditsUsed": 5,
439
+ "expiresAt": "2024-01-01T00:00:00Z",
440
+ "next": "https://api.firecrawl.dev/v2/batch/scrape/test-async-123?page=2",
441
+ "data": [self.sample_doc]
442
+ }
443
+
444
+ # Mock second page response
445
+ mock_response2 = Mock()
446
+ mock_response2.status_code = 200
447
+ mock_response2.json.return_value = {
448
+ "success": True,
449
+ "status": "completed",
450
+ "completed": 20,
451
+ "total": 20,
452
+ "creditsUsed": 10,
453
+ "expiresAt": "2024-01-01T00:00:00Z",
454
+ "next": None,
455
+ "data": [self.sample_doc]
456
+ }
457
+
458
+ self.mock_client.get.side_effect = [mock_response1, mock_response2]
459
+
460
+ # Test with auto_paginate=True
461
+ pagination_config = PaginationConfig(auto_paginate=True)
462
+ result = await get_batch_scrape_status_async(self.mock_client, self.job_id, pagination_config)
463
+
464
+ assert result.status == "completed"
465
+ assert result.next is None
466
+ assert len(result.data) == 2
467
+ assert self.mock_client.get.call_count == 2
468
+
469
+ @pytest.mark.asyncio
470
+ async def test_fetch_all_pages_async_limits(self):
471
+ """Test async _fetch_all_pages_async with limits."""
472
+ # Mock responses for multiple pages
473
+ mock_responses = []
474
+ for i in range(3): # 3 pages available
475
+ mock_response = Mock()
476
+ mock_response.status_code = 200
477
+ mock_response.json.return_value = {
478
+ "success": True,
479
+ "data": [self.sample_doc],
480
+ "next": f"https://api.firecrawl.dev/v2/crawl/test-async-123?page={i+2}" if i < 2 else None
481
+ }
482
+ mock_responses.append(mock_response)
483
+
484
+ self.mock_client.get.side_effect = mock_responses
485
+
486
+ # Test with max_pages=2
487
+ pagination_config = PaginationConfig(max_pages=2)
488
+ result = await _fetch_all_pages_async(
489
+ self.mock_client,
490
+ "https://api.firecrawl.dev/v2/crawl/test-async-123?page=2",
491
+ [Document(**self.sample_doc)], # 1 initial doc
492
+ pagination_config
493
+ )
494
+
495
+ assert len(result) == 3 # 1 initial + 2 from pages
496
+ assert self.mock_client.get.call_count == 2
497
+
498
+
499
+ class TestPaginationEdgeCases:
500
+ """Test pagination edge cases and error conditions."""
501
+
502
+ def setup_method(self):
503
+ """Set up test fixtures."""
504
+ self.mock_client = Mock()
505
+ self.sample_doc = {
506
+ "url": "https://example.com",
507
+ "markdown": "# Test Content",
508
+ "metadata": {"title": "Test Page"}
509
+ }
510
+
511
+ def test_pagination_with_empty_data(self):
512
+ """Test pagination when API returns empty data."""
513
+ mock_response = Mock()
514
+ mock_response.ok = True
515
+ mock_response.json.return_value = {
516
+ "success": True,
517
+ "status": "completed",
518
+ "completed": 0,
519
+ "total": 0,
520
+ "creditsUsed": 0,
521
+ "expiresAt": "2024-01-01T00:00:00Z",
522
+ "next": None,
523
+ "data": []
524
+ }
525
+
526
+ self.mock_client.get.return_value = mock_response
527
+
528
+ pagination_config = PaginationConfig(auto_paginate=True)
529
+ result = get_crawl_status(self.mock_client, "test-123", pagination_config)
530
+
531
+ assert len(result.data) == 0
532
+ assert result.next is None
533
+
534
+ def test_pagination_with_string_data(self):
535
+ """Test pagination when API returns string data (should be skipped)."""
536
+ mock_response = Mock()
537
+ mock_response.ok = True
538
+ mock_response.json.return_value = {
539
+ "success": True,
540
+ "status": "completed",
541
+ "completed": 2,
542
+ "total": 2,
543
+ "creditsUsed": 1,
544
+ "expiresAt": "2024-01-01T00:00:00Z",
545
+ "next": None,
546
+ "data": ["https://example.com", self.sample_doc] # String + dict
547
+ }
548
+
549
+ self.mock_client.get.return_value = mock_response
550
+
551
+ pagination_config = PaginationConfig(auto_paginate=True)
552
+ result = get_crawl_status(self.mock_client, "test-123", pagination_config)
553
+
554
+ assert len(result.data) == 1 # Only the dict should be processed
555
+ assert isinstance(result.data[0], Document)
556
+
557
+ def test_pagination_with_failed_response(self):
558
+ """Test pagination when API response indicates failure."""
559
+ mock_response = Mock()
560
+ mock_response.ok = True
561
+ mock_response.json.return_value = {
562
+ "success": False,
563
+ "error": "Job not found"
564
+ }
565
+
566
+ self.mock_client.get.return_value = mock_response
567
+
568
+ pagination_config = PaginationConfig(auto_paginate=True)
569
+
570
+ with pytest.raises(Exception, match="Job not found"):
571
+ get_crawl_status(self.mock_client, "test-123", pagination_config)
572
+
573
+ def test_pagination_with_unsuccessful_page(self):
574
+ """Test pagination when a subsequent page is unsuccessful."""
575
+ # Mock first page success
576
+ mock_response1 = Mock()
577
+ mock_response1.ok = True
578
+ mock_response1.json.return_value = {
579
+ "success": True,
580
+ "data": [self.sample_doc],
581
+ "next": "https://api.firecrawl.dev/v2/crawl/test-123?page=2"
582
+ }
583
+
584
+ # Mock second page failure
585
+ mock_response2 = Mock()
586
+ mock_response2.ok = True
587
+ mock_response2.json.return_value = {
588
+ "success": False,
589
+ "error": "Page not found"
590
+ }
591
+
592
+ self.mock_client.get.side_effect = [mock_response1, mock_response2]
593
+
594
+ # Should continue with what we have
595
+ result = _fetch_all_pages(self.mock_client, "https://api.firecrawl.dev/v2/crawl/test-123?page=2", [], None)
596
+
597
+ assert len(result) == 1 # Should have the first page data
598
+ assert self.mock_client.get.call_count == 2
599
+
600
+
601
+ if __name__ == "__main__":
602
+ pytest.main([__file__, "-v"])
@@ -35,6 +35,7 @@ from .types import (
35
35
  ExecuteJavascriptAction,
36
36
  PDFAction,
37
37
  Location,
38
+ PaginationConfig,
38
39
  )
39
40
  from .utils.http_client import HttpClient
40
41
  from .utils.error_handler import FirecrawlError
@@ -356,12 +357,17 @@ class FirecrawlClient:
356
357
 
357
358
  return crawl_module.start_crawl(self.http_client, request)
358
359
 
359
- def get_crawl_status(self, job_id: str) -> CrawlJob:
360
+ def get_crawl_status(
361
+ self,
362
+ job_id: str,
363
+ pagination_config: Optional[PaginationConfig] = None
364
+ ) -> CrawlJob:
360
365
  """
361
366
  Get the status of a crawl job.
362
367
 
363
368
  Args:
364
369
  job_id: ID of the crawl job
370
+ pagination_config: Optional configuration for pagination behavior
365
371
 
366
372
  Returns:
367
373
  CrawlJob with current status and data
@@ -369,7 +375,11 @@ class FirecrawlClient:
369
375
  Raises:
370
376
  Exception: If the status check fails
371
377
  """
372
- return crawl_module.get_crawl_status(self.http_client, job_id)
378
+ return crawl_module.get_crawl_status(
379
+ self.http_client,
380
+ job_id,
381
+ pagination_config=pagination_config
382
+ )
373
383
 
374
384
  def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
375
385
  """
@@ -651,16 +661,25 @@ class FirecrawlClient:
651
661
  idempotency_key=idempotency_key,
652
662
  )
653
663
 
654
- def get_batch_scrape_status(self, job_id: str):
664
+ def get_batch_scrape_status(
665
+ self,
666
+ job_id: str,
667
+ pagination_config: Optional[PaginationConfig] = None
668
+ ):
655
669
  """Get current status and any scraped data for a batch job.
656
670
 
657
671
  Args:
658
672
  job_id: Batch job ID
673
+ pagination_config: Optional configuration for pagination behavior
659
674
 
660
675
  Returns:
661
676
  Status payload including counts and partial data
662
677
  """
663
- return batch_module.get_batch_scrape_status(self.http_client, job_id)
678
+ return batch_module.get_batch_scrape_status(
679
+ self.http_client,
680
+ job_id,
681
+ pagination_config=pagination_config
682
+ )
664
683
 
665
684
  def cancel_batch_scrape(self, job_id: str) -> bool:
666
685
  """Cancel a running batch scrape job.