firecrawl 4.4.0__tar.gz → 4.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl might be problematic.

Files changed (88)
  1. {firecrawl-4.4.0 → firecrawl-4.6.0}/PKG-INFO +1 -1
  2. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__init__.py +1 -1
  3. firecrawl-4.6.0/firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  4. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_pagination.py +70 -1
  5. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/client.py +30 -17
  6. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/client_async.py +77 -14
  7. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/crawl.py +18 -9
  8. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/crawl.py +68 -37
  9. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/types.py +21 -2
  10. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/http_client.py +5 -3
  11. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/http_client_async.py +9 -5
  12. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/normalize.py +7 -0
  13. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/PKG-INFO +1 -1
  14. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/SOURCES.txt +2 -0
  15. firecrawl-4.6.0/tests/test_api_key_handling.py +44 -0
  16. {firecrawl-4.4.0 → firecrawl-4.6.0}/LICENSE +0 -0
  17. {firecrawl-4.4.0 → firecrawl-4.6.0}/README.md +0 -0
  18. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -0
  19. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -0
  20. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -0
  21. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -0
  22. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -0
  23. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -0
  24. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
  25. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
  26. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
  27. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
  28. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -0
  29. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -0
  30. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_extract.py +0 -0
  31. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_map.py +0 -0
  32. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -0
  33. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_search.py +0 -0
  34. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
  35. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
  36. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
  37. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -0
  38. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
  39. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -0
  40. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
  41. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -0
  42. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
  43. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
  44. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -0
  45. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
  46. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
  47. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
  48. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -0
  49. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -0
  50. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -0
  51. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -0
  52. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
  53. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
  54. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
  55. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
  56. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/client.py +0 -0
  57. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/firecrawl.backup.py +0 -0
  58. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/types.py +0 -0
  59. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v1/__init__.py +0 -0
  60. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v1/client.py +0 -0
  61. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/__init__.py +0 -0
  62. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/__init__.py +0 -0
  63. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/batch.py +0 -0
  64. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/extract.py +0 -0
  65. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/map.py +0 -0
  66. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/scrape.py +0 -0
  67. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/search.py +0 -0
  68. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/usage.py +0 -0
  69. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/batch.py +0 -0
  70. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/extract.py +0 -0
  71. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/map.py +0 -0
  72. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/scrape.py +0 -0
  73. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/search.py +0 -0
  74. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/usage.py +0 -0
  75. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/__init__.py +0 -0
  76. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/error_handler.py +0 -0
  77. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/get_version.py +0 -0
  78. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/validation.py +0 -0
  79. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/watcher.py +0 -0
  80. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/watcher_async.py +0 -0
  81. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/dependency_links.txt +0 -0
  82. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/requires.txt +0 -0
  83. {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/top_level.txt +0 -0
  84. {firecrawl-4.4.0 → firecrawl-4.6.0}/pyproject.toml +0 -0
  85. {firecrawl-4.4.0 → firecrawl-4.6.0}/setup.cfg +0 -0
  86. {firecrawl-4.4.0 → firecrawl-4.6.0}/setup.py +0 -0
  87. {firecrawl-4.4.0 → firecrawl-4.6.0}/tests/test_change_tracking.py +0 -0
  88. {firecrawl-4.4.0 → firecrawl-4.6.0}/tests/test_timeout_conversion.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: firecrawl
-Version: 4.4.0
+Version: 4.6.0
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/firecrawl/firecrawl
 Author: Mendable.ai
firecrawl/__init__.py
@@ -17,7 +17,7 @@ from .v1 import (
     V1ChangeTrackingOptions,
 )
 
-__version__ = "4.4.0"
+__version__ = "4.6.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/__tests__/unit/v2/methods/test_branding.py (new file)
@@ -0,0 +1,214 @@
+import pytest
+from unittest.mock import Mock, MagicMock
+from firecrawl.v2.methods.scrape import scrape
+from firecrawl.v2.types import ScrapeOptions, Document
+
+
+class TestBrandingFormat:
+    """Unit tests for branding format support."""
+
+    def test_scrape_with_branding_format_returns_branding_data(self):
+        """Test that scraping with branding format returns branding data."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "markdown": "# Example",
+                "branding": {
+                    "colorScheme": "light",
+                    "colors": {
+                        "primary": "#E11D48",
+                        "secondary": "#3B82F6",
+                        "accent": "#F59E0B"
+                    },
+                    "typography": {
+                        "fontFamilies": {
+                            "primary": "Inter",
+                            "heading": "Poppins"
+                        },
+                        "fontSizes": {
+                            "h1": "2.5rem",
+                            "body": "1rem"
+                        }
+                    },
+                    "spacing": {
+                        "baseUnit": 8
+                    },
+                    "components": {
+                        "buttonPrimary": {
+                            "background": "#E11D48",
+                            "textColor": "#FFFFFF",
+                            "borderRadius": "0.5rem"
+                        }
+                    }
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))
+
+        assert result.branding is not None
+        assert result.branding.color_scheme == "light"
+        assert result.branding.colors["primary"] == "#E11D48"
+        assert result.branding.typography["fontFamilies"]["primary"] == "Inter"
+        assert result.branding.spacing["baseUnit"] == 8
+        assert result.branding.components["buttonPrimary"]["background"] == "#E11D48"
+
+    def test_scrape_with_branding_and_markdown_formats_returns_both(self):
+        """Test that scraping with both branding and markdown formats returns both."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "markdown": "# Example Content",
+                "branding": {
+                    "colorScheme": "dark",
+                    "colors": {
+                        "primary": "#10B981"
+                    },
+                    "typography": {
+                        "fontFamilies": {
+                            "primary": "Roboto"
+                        }
+                    }
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["markdown", "branding"]))
+
+        assert result.markdown == "# Example Content"
+        assert result.branding is not None
+        assert result.branding.color_scheme == "dark"
+        assert result.branding.colors["primary"] == "#10B981"
+
+    def test_scrape_without_branding_format_does_not_return_branding(self):
+        """Test that scraping without branding format does not return branding."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "markdown": "# Example"
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["markdown"]))
+
+        assert result.markdown == "# Example"
+        assert result.branding is None
+
+    def test_branding_format_with_all_nested_fields(self):
+        """Test branding format with all nested fields populated."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "branding": {
+                    "colorScheme": "light",
+                    "logo": "https://example.com/logo.png",
+                    "fonts": [
+                        {"family": "Inter", "weight": 400},
+                        {"family": "Poppins", "weight": 700}
+                    ],
+                    "colors": {
+                        "primary": "#E11D48",
+                        "background": "#FFFFFF"
+                    },
+                    "typography": {
+                        "fontFamilies": {"primary": "Inter"},
+                        "fontStacks": {"body": ["Inter", "sans-serif"]},
+                        "fontSizes": {"h1": "2.5rem"},
+                        "lineHeights": {"body": 1.5},
+                        "fontWeights": {"regular": 400}
+                    },
+                    "spacing": {
+                        "baseUnit": 8,
+                        "padding": {"sm": 8, "md": 16}
+                    },
+                    "components": {
+                        "buttonPrimary": {
+                            "background": "#E11D48",
+                            "textColor": "#FFFFFF"
+                        }
+                    },
+                    "icons": {
+                        "style": "outline",
+                        "primaryColor": "#E11D48"
+                    },
+                    "images": {
+                        "logo": "https://example.com/logo.png",
+                        "favicon": "https://example.com/favicon.ico"
+                    },
+                    "animations": {
+                        "transitionDuration": "200ms",
+                        "easing": "ease-in-out"
+                    },
+                    "layout": {
+                        "grid": {"columns": 12, "maxWidth": "1200px"},
+                        "headerHeight": "64px"
+                    },
+                    "tone": {
+                        "voice": "professional",
+                        "emojiUsage": "minimal"
+                    },
+                    "personality": {
+                        "tone": "professional",
+                        "energy": "medium",
+                        "targetAudience": "developers"
+                    }
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))
+
+        assert result.branding is not None
+        assert result.branding.color_scheme == "light"
+        assert result.branding.logo == "https://example.com/logo.png"
+        assert len(result.branding.fonts) == 2
+        assert result.branding.typography["fontStacks"]["body"] == ["Inter", "sans-serif"]
+        assert result.branding.spacing["padding"] == {"sm": 8, "md": 16}
+        assert result.branding.icons["style"] == "outline"
+        assert result.branding.images["favicon"] == "https://example.com/favicon.ico"
+        assert result.branding.animations["easing"] == "ease-in-out"
+        assert result.branding.layout["grid"]["columns"] == 12
+        assert result.branding.personality["tone"] == "professional"
+
+    def test_branding_colorscheme_normalization(self):
+        """Test that colorScheme is normalized to color_scheme."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "branding": {
+                    "colorScheme": "dark",
+                    "colors": {"primary": "#000000"}
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))
+
+        assert result.branding is not None
+        assert result.branding.color_scheme == "dark"
+        assert not hasattr(result.branding, "colorScheme")
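Note: judging from these new tests, the v2 scrape path accepts "branding" as a format entry and exposes the parsed result on the document's branding attribute with snake_case fields (colorScheme becomes color_scheme). A minimal usage sketch; the HttpClient import path and positional constructor arguments are inferred from elsewhere in this diff, and the API key and URL are placeholders:

    from firecrawl.v2.utils.http_client import HttpClient  # path assumed from the file list above
    from firecrawl.v2.methods.scrape import scrape          # same import the new tests use
    from firecrawl.v2.types import ScrapeOptions

    # Placeholder credentials; the cloud API at api.firecrawl.dev requires an API key.
    http = HttpClient("fc-YOUR-KEY", "https://api.firecrawl.dev")
    doc = scrape(http, "https://example.com", ScrapeOptions(formats=["markdown", "branding"]))

    if doc.branding is not None:
        print(doc.branding.color_scheme)           # e.g. "light" or "dark"
        print(doc.branding.colors.get("primary"))  # colors is a plain dict keyed by role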
firecrawl/__tests__/unit/v2/methods/test_pagination.py
@@ -89,6 +89,40 @@ class TestCrawlPagination:
         assert result.next == "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
         assert len(result.data) == 1
         assert isinstance(result.data[0], Document)
+
+    def test_get_crawl_status_propagates_request_timeout(self):
+        """Ensure request_timeout is forwarded to the HTTP client."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        timeout_seconds = 5.5
+        import firecrawl.v2.methods.crawl as crawl_module
+
+        assert crawl_module.__file__.endswith("firecrawl/v2/methods/crawl.py")
+        assert crawl_module.get_crawl_status.__kwdefaults__ is not None
+        assert "request_timeout" in crawl_module.get_crawl_status.__kwdefaults__
+        result = get_crawl_status(
+            self.mock_client,
+            self.job_id,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_called_with(
+            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
+        )
 
     def test_get_crawl_status_with_pagination(self):
         """Test get_crawl_status with auto_paginate=True."""
@@ -423,7 +457,42 @@ class TestAsyncPagination:
         assert result.next is None
         assert len(result.data) == 2
         assert self.mock_client.get.call_count == 2
-
+
+    @pytest.mark.asyncio
+    async def test_get_crawl_status_async_propagates_request_timeout(self):
+        """Ensure async request_timeout is forwarded to the HTTP client."""
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        timeout_seconds = 3.3
+        import firecrawl.v2.methods.aio.crawl as crawl_module_async
+
+        assert crawl_module_async.__file__.endswith("firecrawl/v2/methods/aio/crawl.py")
+        assert crawl_module_async.get_crawl_status.__kwdefaults__ is not None
+        assert "request_timeout" in crawl_module_async.get_crawl_status.__kwdefaults__
+        result = await get_crawl_status_async(
+            self.mock_client,
+            self.job_id,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_awaited_with(
+            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
+        )
+
 
     @pytest.mark.asyncio
     async def test_get_batch_scrape_status_async_with_pagination(self):
         """Test async get_batch_scrape_status with pagination."""
firecrawl/v2/client.py
@@ -54,10 +54,14 @@ from .watcher import Watcher
 class FirecrawlClient:
     """
     Main Firecrawl v2 API client.
-
+
     This client provides a clean, modular interface to all Firecrawl functionality.
     """
-
+
+    @staticmethod
+    def _is_cloud_service(url: str) -> bool:
+        return "api.firecrawl.dev" in url.lower()
+
     def __init__(
         self,
         api_key: Optional[str] = None,
@@ -68,7 +72,7 @@ class FirecrawlClient:
     ):
         """
         Initialize the Firecrawl client.
-
+
         Args:
             api_key: Firecrawl API key (or set FIRECRAWL_API_KEY env var)
             api_url: Base URL for the Firecrawl API
@@ -78,13 +82,13 @@ class FirecrawlClient:
         """
         if api_key is None:
            api_key = os.getenv("FIRECRAWL_API_KEY")
-
-        if not api_key:
+
+        if self._is_cloud_service(api_url) and not api_key:
             raise ValueError(
-                "API key is required. Set FIRECRAWL_API_KEY environment variable "
+                "API key is required for the cloud API. Set FIRECRAWL_API_KEY environment variable "
                 "or pass api_key parameter."
             )
-
+
         self.config = ClientConfig(
             api_key=api_key,
             api_url=api_url,
@@ -92,7 +96,7 @@ class FirecrawlClient:
             max_retries=max_retries,
             backoff_factor=backoff_factor
         )
-
+
         self.http_client = HttpClient(api_key, api_url)
 
     def scrape(
@@ -236,6 +240,7 @@ class FirecrawlClient:
         zero_data_retention: bool = False,
         poll_interval: int = 2,
         timeout: Optional[int] = None,
+        request_timeout: Optional[float] = None,
         integration: Optional[str] = None,
     ) -> CrawlJob:
         """
@@ -259,7 +264,8 @@ class FirecrawlClient:
             scrape_options: Page scraping configuration
             zero_data_retention: Whether to delete data after 24 hours
             poll_interval: Seconds between status checks
-            timeout: Maximum seconds to wait (None for no timeout)
+            timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
+            request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout
 
         Returns:
             CrawlJob when job completes
@@ -290,10 +296,11 @@ class FirecrawlClient:
         )
 
         return crawl_module.crawl(
-            self.http_client,
-            request,
-            poll_interval=poll_interval,
-            timeout=timeout
+            self.http_client,
+            request,
+            poll_interval=poll_interval,
+            timeout=timeout,
+            request_timeout=request_timeout,
         )
 
     def start_crawl(
@@ -368,9 +375,11 @@ class FirecrawlClient:
         return crawl_module.start_crawl(self.http_client, request)
 
     def get_crawl_status(
-        self,
+        self,
         job_id: str,
-        pagination_config: Optional[PaginationConfig] = None
+        pagination_config: Optional[PaginationConfig] = None,
+        *,
+        request_timeout: Optional[float] = None,
     ) -> CrawlJob:
         """
         Get the status of a crawl job.
@@ -378,6 +387,9 @@ class FirecrawlClient:
         Args:
             job_id: ID of the crawl job
             pagination_config: Optional configuration for pagination behavior
+            request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+                is enabled (default) and there are multiple pages of results, this timeout applies to
+                each page request separately, not to the entire operation
 
         Returns:
             CrawlJob with current status and data
@@ -386,9 +398,10 @@ class FirecrawlClient:
             Exception: If the status check fails
         """
         return crawl_module.get_crawl_status(
-            self.http_client,
+            self.http_client,
             job_id,
-            pagination_config=pagination_config
+            pagination_config=pagination_config,
+            request_timeout=request_timeout,
         )
 
     def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
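Note: two behavioral changes land in the sync client here: the API key is now only required when api_url points at the cloud service (api.firecrawl.dev), and crawl/get_crawl_status accept a per-request request_timeout alongside the existing whole-job timeout. A hedged sketch; the url keyword on crawl and the self-hosted port are assumptions, not shown in this hunk:

    from firecrawl.v2.client import FirecrawlClient

    # Self-hosted instance: no API key needed after this change (URL and port are placeholders).
    client = FirecrawlClient(api_url="http://localhost:3002")

    # timeout caps the whole crawl job; request_timeout caps each HTTP request,
    # including every pagination request made while collecting results.
    job = client.crawl(
        url="https://example.com",  # assumed keyword; the first crawl parameter is not visible in this diff
        poll_interval=2,
        timeout=300,
        request_timeout=30,
    )
    status = client.get_crawl_status(job_id="YOUR-CRAWL-JOB-ID", request_timeout=10)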
firecrawl/v2/client_async.py
@@ -4,6 +4,7 @@ Async v2 client mirroring the regular client surface using true async HTTP trans
 
 import os
 import asyncio
+import time
 from typing import Optional, List, Dict, Any, Union, Callable, Literal
 from .types import (
     ScrapeOptions,
@@ -47,11 +48,15 @@ from .methods.aio import extract as async_extract  # type: ignore[attr-defined]
 from .watcher_async import AsyncWatcher
 
 class AsyncFirecrawlClient:
+    @staticmethod
+    def _is_cloud_service(url: str) -> bool:
+        return "api.firecrawl.dev" in url.lower()
+
     def __init__(self, api_key: Optional[str] = None, api_url: str = "https://api.firecrawl.dev"):
         if api_key is None:
             api_key = os.getenv("FIRECRAWL_API_KEY")
-        if not api_key:
-            raise ValueError("API key is required. Set FIRECRAWL_API_KEY or pass api_key.")
+        if self._is_cloud_service(api_url) and not api_key:
+            raise ValueError("API key is required for the cloud API. Set FIRECRAWL_API_KEY or pass api_key.")
         self.http_client = HttpClient(api_key, api_url)
         self.async_http_client = AsyncHttpClient(api_key, api_url)
 
@@ -77,33 +82,91 @@ class AsyncFirecrawlClient:
         request = CrawlRequest(url=url, **kwargs)
         return await async_crawl.start_crawl(self.async_http_client, request)
 
-    async def wait_crawl(self, job_id: str, poll_interval: int = 2, timeout: Optional[int] = None) -> CrawlJob:
-        # simple polling loop using blocking get (ok for test-level async)
-        start = asyncio.get_event_loop().time()
+    async def wait_crawl(
+        self,
+        job_id: str,
+        poll_interval: int = 2,
+        timeout: Optional[int] = None,
+        *,
+        request_timeout: Optional[float] = None,
+    ) -> CrawlJob:
+        """
+        Polls the status of a crawl job until it reaches a terminal state.
+
+        Args:
+            job_id (str): The ID of the crawl job to poll.
+            poll_interval (int, optional): Number of seconds to wait between polling attempts. Defaults to 2.
+            timeout (Optional[int], optional): Maximum number of seconds to wait for the entire crawl job to complete before timing out. If None, waits indefinitely. Defaults to None.
+            request_timeout (Optional[float], optional): Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout. If None, no per-request timeout is set. Defaults to None.
+
+        Returns:
+            CrawlJob: The final status of the crawl job when it reaches a terminal state.
+
+        Raises:
+            TimeoutError: If the crawl does not reach a terminal state within the specified timeout.
+
+        Terminal states:
+            - "completed": The crawl finished successfully.
+            - "failed": The crawl finished with an error.
+            - "cancelled": The crawl was cancelled.
+        """
+        start = time.monotonic()
         while True:
-            status = await async_crawl.get_crawl_status(self.async_http_client, job_id)
-            if status.status in ["completed", "failed"]:
+            status = await async_crawl.get_crawl_status(
+                self.async_http_client,
+                job_id,
+                request_timeout=request_timeout,
+            )
+            if status.status in ["completed", "failed", "cancelled"]:
                 return status
-            if timeout and (asyncio.get_event_loop().time() - start) > timeout:
+            if timeout and (time.monotonic() - start) > timeout:
                 raise TimeoutError("Crawl wait timed out")
             await asyncio.sleep(poll_interval)
 
     async def crawl(self, **kwargs) -> CrawlJob:
         # wrapper combining start and wait
-        resp = await self.start_crawl(**{k: v for k, v in kwargs.items() if k not in ("poll_interval", "timeout")})
+        resp = await self.start_crawl(
+            **{k: v for k, v in kwargs.items() if k not in ("poll_interval", "timeout", "request_timeout")}
+        )
         poll_interval = kwargs.get("poll_interval", 2)
         timeout = kwargs.get("timeout")
-        return await self.wait_crawl(resp.id, poll_interval=poll_interval, timeout=timeout)
+        request_timeout = kwargs.get("request_timeout")
+        effective_request_timeout = request_timeout if request_timeout is not None else timeout
+        return await self.wait_crawl(
+            resp.id,
+            poll_interval=poll_interval,
+            timeout=timeout,
+            request_timeout=effective_request_timeout,
+        )
 
     async def get_crawl_status(
-        self,
+        self,
         job_id: str,
-        pagination_config: Optional[PaginationConfig] = None
+        pagination_config: Optional[PaginationConfig] = None,
+        *,
+        request_timeout: Optional[float] = None,
     ) -> CrawlJob:
+        """
+        Get the status of a crawl job.
+
+        Args:
+            job_id: ID of the crawl job
+            pagination_config: Optional configuration for pagination behavior
+            request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+                is enabled (default) and there are multiple pages of results, this timeout applies to
+                each page request separately, not to the entire operation
+
+        Returns:
+            CrawlJob with current status and data
+
+        Raises:
+            Exception: If the status check fails
+        """
         return await async_crawl.get_crawl_status(
-            self.async_http_client,
+            self.async_http_client,
             job_id,
-            pagination_config=pagination_config
+            pagination_config=pagination_config,
+            request_timeout=request_timeout,
         )
 
     async def cancel_crawl(self, job_id: str) -> bool:
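Note: the async client gains the same cloud-only API key check, a documented wait_crawl with a "cancelled" terminal state, monotonic-clock timing, and request_timeout plumbed through to get_crawl_status. A usage sketch; start_crawl's accepted keyword arguments beyond url are not shown here, so only url is passed, and the key is a placeholder:

    import asyncio
    from firecrawl.v2.client_async import AsyncFirecrawlClient

    async def main():
        # Cloud usage still requires a key; self-hosted api_url values may omit it.
        client = AsyncFirecrawlClient(api_key="fc-YOUR-KEY")
        resp = await client.start_crawl(url="https://example.com")
        # Poll every 2 s, allow 120 s for the whole job, cap each status request at 15 s.
        job = await client.wait_crawl(resp.id, poll_interval=2, timeout=120, request_timeout=15)
        print(job.status)

    asyncio.run(main())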
firecrawl/v2/methods/aio/crawl.py
@@ -87,9 +87,11 @@ async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlRe
 
 
 async def get_crawl_status(
-    client: AsyncHttpClient,
+    client: AsyncHttpClient,
     job_id: str,
-    pagination_config: Optional[PaginationConfig] = None
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> CrawlJob:
     """
     Get the status of a crawl job.
@@ -98,6 +100,9 @@ async def get_crawl_status(
         client: Async HTTP client instance
         job_id: ID of the crawl job
         pagination_config: Optional configuration for pagination limits
+        request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+            is enabled (default) and there are multiple pages of results, this timeout applies to
+            each page request separately, not to the entire operation
 
     Returns:
         CrawlJob with job information
@@ -105,7 +110,7 @@ async def get_crawl_status(
     Raises:
         Exception: If the status check fails
     """
-    response = await client.get(f"/v2/crawl/{job_id}")
+    response = await client.get(f"/v2/crawl/{job_id}", timeout=request_timeout)
     if response.status_code >= 400:
         handle_response_error(response, "get crawl status")
     body = response.json()
@@ -120,10 +125,11 @@ async def get_crawl_status(
     auto_paginate = pagination_config.auto_paginate if pagination_config else True
     if auto_paginate and body.get("next"):
         documents = await _fetch_all_pages_async(
-            client,
-            body.get("next"),
-            documents,
-            pagination_config
+            client,
+            body.get("next"),
+            documents,
+            pagination_config,
+            request_timeout=request_timeout,
         )
 
     return CrawlJob(
@@ -142,7 +148,9 @@ async def _fetch_all_pages_async(
     client: AsyncHttpClient,
     next_url: str,
     initial_documents: List[Document],
-    pagination_config: Optional[PaginationConfig] = None
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> List[Document]:
     """
     Fetch all pages of crawl results asynchronously.
@@ -152,6 +160,7 @@ async def _fetch_all_pages_async(
         next_url: URL for the next page
         initial_documents: Documents from the first page
         pagination_config: Optional configuration for pagination limits
+        request_timeout: Optional timeout (in seconds) for the underlying HTTP request
 
     Returns:
         List of all documents from all pages
@@ -176,7 +185,7 @@ async def _fetch_all_pages_async(
             break
 
         # Fetch next page
-        response = await client.get(current_url)
+        response = await client.get(current_url, timeout=request_timeout)
 
         if response.status_code >= 400:
             # Log error but continue with what we have