firecrawl 4.4.0__tar.gz → 4.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl might be problematic.
- {firecrawl-4.4.0 → firecrawl-4.6.0}/PKG-INFO +1 -1
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__init__.py +1 -1
- firecrawl-4.6.0/firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_pagination.py +70 -1
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/client.py +30 -17
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/client_async.py +77 -14
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/crawl.py +18 -9
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/crawl.py +68 -37
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/types.py +21 -2
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/http_client.py +5 -3
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/http_client_async.py +9 -5
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/normalize.py +7 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/PKG-INFO +1 -1
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/SOURCES.txt +2 -0
- firecrawl-4.6.0/tests/test_api_key_handling.py +44 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/LICENSE +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/README.md +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_extract.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_map.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_search.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/client.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/firecrawl.backup.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/types.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v1/__init__.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v1/client.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/__init__.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/__init__.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/batch.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/extract.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/map.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/search.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/usage.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/batch.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/extract.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/map.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/search.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/usage.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/__init__.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/error_handler.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/get_version.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/validation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/watcher.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/watcher_async.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/dependency_links.txt +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/requires.txt +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/top_level.txt +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/pyproject.toml +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/setup.cfg +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/setup.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/tests/test_change_tracking.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/tests/test_timeout_conversion.py +0 -0
firecrawl-4.6.0/firecrawl/__tests__/unit/v2/methods/test_branding.py (new file)
@@ -0,0 +1,214 @@
+import pytest
+from unittest.mock import Mock, MagicMock
+from firecrawl.v2.methods.scrape import scrape
+from firecrawl.v2.types import ScrapeOptions, Document
+
+
+class TestBrandingFormat:
+    """Unit tests for branding format support."""
+
+    def test_scrape_with_branding_format_returns_branding_data(self):
+        """Test that scraping with branding format returns branding data."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "markdown": "# Example",
+                "branding": {
+                    "colorScheme": "light",
+                    "colors": {
+                        "primary": "#E11D48",
+                        "secondary": "#3B82F6",
+                        "accent": "#F59E0B"
+                    },
+                    "typography": {
+                        "fontFamilies": {
+                            "primary": "Inter",
+                            "heading": "Poppins"
+                        },
+                        "fontSizes": {
+                            "h1": "2.5rem",
+                            "body": "1rem"
+                        }
+                    },
+                    "spacing": {
+                        "baseUnit": 8
+                    },
+                    "components": {
+                        "buttonPrimary": {
+                            "background": "#E11D48",
+                            "textColor": "#FFFFFF",
+                            "borderRadius": "0.5rem"
+                        }
+                    }
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))
+
+        assert result.branding is not None
+        assert result.branding.color_scheme == "light"
+        assert result.branding.colors["primary"] == "#E11D48"
+        assert result.branding.typography["fontFamilies"]["primary"] == "Inter"
+        assert result.branding.spacing["baseUnit"] == 8
+        assert result.branding.components["buttonPrimary"]["background"] == "#E11D48"
+
+    def test_scrape_with_branding_and_markdown_formats_returns_both(self):
+        """Test that scraping with both branding and markdown formats returns both."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "markdown": "# Example Content",
+                "branding": {
+                    "colorScheme": "dark",
+                    "colors": {
+                        "primary": "#10B981"
+                    },
+                    "typography": {
+                        "fontFamilies": {
+                            "primary": "Roboto"
+                        }
+                    }
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["markdown", "branding"]))
+
+        assert result.markdown == "# Example Content"
+        assert result.branding is not None
+        assert result.branding.color_scheme == "dark"
+        assert result.branding.colors["primary"] == "#10B981"
+
+    def test_scrape_without_branding_format_does_not_return_branding(self):
+        """Test that scraping without branding format does not return branding."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "markdown": "# Example"
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["markdown"]))
+
+        assert result.markdown == "# Example"
+        assert result.branding is None
+
+    def test_branding_format_with_all_nested_fields(self):
+        """Test branding format with all nested fields populated."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "branding": {
+                    "colorScheme": "light",
+                    "logo": "https://example.com/logo.png",
+                    "fonts": [
+                        {"family": "Inter", "weight": 400},
+                        {"family": "Poppins", "weight": 700}
+                    ],
+                    "colors": {
+                        "primary": "#E11D48",
+                        "background": "#FFFFFF"
+                    },
+                    "typography": {
+                        "fontFamilies": {"primary": "Inter"},
+                        "fontStacks": {"body": ["Inter", "sans-serif"]},
+                        "fontSizes": {"h1": "2.5rem"},
+                        "lineHeights": {"body": 1.5},
+                        "fontWeights": {"regular": 400}
+                    },
+                    "spacing": {
+                        "baseUnit": 8,
+                        "padding": {"sm": 8, "md": 16}
+                    },
+                    "components": {
+                        "buttonPrimary": {
+                            "background": "#E11D48",
+                            "textColor": "#FFFFFF"
+                        }
+                    },
+                    "icons": {
+                        "style": "outline",
+                        "primaryColor": "#E11D48"
+                    },
+                    "images": {
+                        "logo": "https://example.com/logo.png",
+                        "favicon": "https://example.com/favicon.ico"
+                    },
+                    "animations": {
+                        "transitionDuration": "200ms",
+                        "easing": "ease-in-out"
+                    },
+                    "layout": {
+                        "grid": {"columns": 12, "maxWidth": "1200px"},
+                        "headerHeight": "64px"
+                    },
+                    "tone": {
+                        "voice": "professional",
+                        "emojiUsage": "minimal"
+                    },
+                    "personality": {
+                        "tone": "professional",
+                        "energy": "medium",
+                        "targetAudience": "developers"
+                    }
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))
+
+        assert result.branding is not None
+        assert result.branding.color_scheme == "light"
+        assert result.branding.logo == "https://example.com/logo.png"
+        assert len(result.branding.fonts) == 2
+        assert result.branding.typography["fontStacks"]["body"] == ["Inter", "sans-serif"]
+        assert result.branding.spacing["padding"] == {"sm": 8, "md": 16}
+        assert result.branding.icons["style"] == "outline"
+        assert result.branding.images["favicon"] == "https://example.com/favicon.ico"
+        assert result.branding.animations["easing"] == "ease-in-out"
+        assert result.branding.layout["grid"]["columns"] == 12
+        assert result.branding.personality["tone"] == "professional"
+
+    def test_branding_colorscheme_normalization(self):
+        """Test that colorScheme is normalized to color_scheme."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "branding": {
+                    "colorScheme": "dark",
+                    "colors": {"primary": "#000000"}
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))
+
+        assert result.branding is not None
+        assert result.branding.color_scheme == "dark"
+        assert not hasattr(result.branding, "colorScheme")
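
The tests above exercise the new "branding" scrape format. A minimal usage sketch, assuming the public client exposes the same format name and the normalized snake_case fields these tests assert on (the URL and printed fields are illustrative only):

    # Sketch only: scrape with the "branding" format and read the normalized fields.
    from firecrawl import Firecrawl  # assumed public entry point for the v2 client

    client = Firecrawl(api_key="fc-YOUR-KEY")
    doc = client.scrape("https://example.com", formats=["markdown", "branding"])

    if doc.branding is not None:
        print(doc.branding.color_scheme)                     # normalized from "colorScheme"
        print(doc.branding.colors.get("primary"))            # colors/typography/spacing are plain dicts in the tests
        print(doc.branding.typography.get("fontFamilies"))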

firecrawl/__tests__/unit/v2/methods/test_pagination.py
@@ -89,6 +89,40 @@ class TestCrawlPagination:
         assert result.next == "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
         assert len(result.data) == 1
         assert isinstance(result.data[0], Document)
+
+    def test_get_crawl_status_propagates_request_timeout(self):
+        """Ensure request_timeout is forwarded to the HTTP client."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        timeout_seconds = 5.5
+        import firecrawl.v2.methods.crawl as crawl_module
+
+        assert crawl_module.__file__.endswith("firecrawl/v2/methods/crawl.py")
+        assert crawl_module.get_crawl_status.__kwdefaults__ is not None
+        assert "request_timeout" in crawl_module.get_crawl_status.__kwdefaults__
+        result = get_crawl_status(
+            self.mock_client,
+            self.job_id,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_called_with(
+            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
+        )

     def test_get_crawl_status_with_pagination(self):
         """Test get_crawl_status with auto_paginate=True."""
@@ -423,7 +457,42 @@ class TestAsyncPagination:
         assert result.next is None
         assert len(result.data) == 2
         assert self.mock_client.get.call_count == 2
-
+
+    @pytest.mark.asyncio
+    async def test_get_crawl_status_async_propagates_request_timeout(self):
+        """Ensure async request_timeout is forwarded to the HTTP client."""
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        timeout_seconds = 3.3
+        import firecrawl.v2.methods.aio.crawl as crawl_module_async
+
+        assert crawl_module_async.__file__.endswith("firecrawl/v2/methods/aio/crawl.py")
+        assert crawl_module_async.get_crawl_status.__kwdefaults__ is not None
+        assert "request_timeout" in crawl_module_async.get_crawl_status.__kwdefaults__
+        result = await get_crawl_status_async(
+            self.mock_client,
+            self.job_id,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_awaited_with(
+            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
+        )
+
     @pytest.mark.asyncio
     async def test_get_batch_scrape_status_async_with_pagination(self):
         """Test async get_batch_scrape_status with pagination."""

firecrawl/v2/client.py
@@ -54,10 +54,14 @@ from .watcher import Watcher
 class FirecrawlClient:
     """
     Main Firecrawl v2 API client.
-
+
     This client provides a clean, modular interface to all Firecrawl functionality.
     """
-
+
+    @staticmethod
+    def _is_cloud_service(url: str) -> bool:
+        return "api.firecrawl.dev" in url.lower()
+
     def __init__(
         self,
         api_key: Optional[str] = None,
@@ -68,7 +72,7 @@ class FirecrawlClient:
     ):
         """
         Initialize the Firecrawl client.
-
+
         Args:
             api_key: Firecrawl API key (or set FIRECRAWL_API_KEY env var)
             api_url: Base URL for the Firecrawl API
@@ -78,13 +82,13 @@ class FirecrawlClient:
         """
         if api_key is None:
             api_key = os.getenv("FIRECRAWL_API_KEY")
-
-        if not api_key:
+
+        if self._is_cloud_service(api_url) and not api_key:
             raise ValueError(
-                "API key is required. Set FIRECRAWL_API_KEY environment variable "
+                "API key is required for the cloud API. Set FIRECRAWL_API_KEY environment variable "
                 "or pass api_key parameter."
             )
-
+
         self.config = ClientConfig(
             api_key=api_key,
             api_url=api_url,
@@ -92,7 +96,7 @@ class FirecrawlClient:
             max_retries=max_retries,
             backoff_factor=backoff_factor
         )
-
+
         self.http_client = HttpClient(api_key, api_url)

     def scrape(
@@ -236,6 +240,7 @@ class FirecrawlClient:
         zero_data_retention: bool = False,
         poll_interval: int = 2,
         timeout: Optional[int] = None,
+        request_timeout: Optional[float] = None,
         integration: Optional[str] = None,
     ) -> CrawlJob:
         """
@@ -259,7 +264,8 @@ class FirecrawlClient:
             scrape_options: Page scraping configuration
             zero_data_retention: Whether to delete data after 24 hours
             poll_interval: Seconds between status checks
-            timeout: Maximum seconds to wait (None for no timeout)
+            timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
+            request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout

         Returns:
             CrawlJob when job completes
@@ -290,10 +296,11 @@ class FirecrawlClient:
         )

         return crawl_module.crawl(
-            self.http_client,
-            request,
-            poll_interval=poll_interval,
-            timeout=timeout
+            self.http_client,
+            request,
+            poll_interval=poll_interval,
+            timeout=timeout,
+            request_timeout=request_timeout,
         )

     def start_crawl(
@@ -368,9 +375,11 @@ class FirecrawlClient:
         return crawl_module.start_crawl(self.http_client, request)

     def get_crawl_status(
-        self,
+        self,
         job_id: str,
-        pagination_config: Optional[PaginationConfig] = None
+        pagination_config: Optional[PaginationConfig] = None,
+        *,
+        request_timeout: Optional[float] = None,
     ) -> CrawlJob:
         """
         Get the status of a crawl job.
@@ -378,6 +387,9 @@ class FirecrawlClient:
         Args:
             job_id: ID of the crawl job
             pagination_config: Optional configuration for pagination behavior
+            request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+                is enabled (default) and there are multiple pages of results, this timeout applies to
+                each page request separately, not to the entire operation

         Returns:
             CrawlJob with current status and data
@@ -386,9 +398,10 @@ class FirecrawlClient:
             Exception: If the status check fails
         """
         return crawl_module.get_crawl_status(
-            self.http_client,
+            self.http_client,
             job_id,
-            pagination_config=pagination_config
+            pagination_config=pagination_config,
+            request_timeout=request_timeout,
         )

     def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
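
Taken together, the synchronous client changes above relax the API-key requirement for self-hosted instances and thread a per-request timeout through crawl status calls. A hedged sketch of both, using the constructor and method signatures shown in this diff (base URL and job ID are placeholders):

    from firecrawl.v2.client import FirecrawlClient  # module path as shown above

    # Self-hosted API URLs no longer require an API key; api.firecrawl.dev still does.
    client = FirecrawlClient(api_url="http://localhost:3002")

    # request_timeout bounds each HTTP call, including every pagination page fetch.
    status = client.get_crawl_status("some-job-id", request_timeout=5.5)

    # On a blocking crawl, timeout bounds the whole job; request_timeout bounds each request.
    job = client.crawl(
        url="https://example.com",
        poll_interval=2,
        timeout=300,
        request_timeout=30.0,
    )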

firecrawl/v2/client_async.py
@@ -4,6 +4,7 @@ Async v2 client mirroring the regular client surface using true async HTTP trans

 import os
 import asyncio
+import time
 from typing import Optional, List, Dict, Any, Union, Callable, Literal
 from .types import (
     ScrapeOptions,
@@ -47,11 +48,15 @@ from .methods.aio import extract as async_extract  # type: ignore[attr-defined]
 from .watcher_async import AsyncWatcher

 class AsyncFirecrawlClient:
+    @staticmethod
+    def _is_cloud_service(url: str) -> bool:
+        return "api.firecrawl.dev" in url.lower()
+
     def __init__(self, api_key: Optional[str] = None, api_url: str = "https://api.firecrawl.dev"):
         if api_key is None:
             api_key = os.getenv("FIRECRAWL_API_KEY")
-        if not api_key:
-            raise ValueError("API key is required. Set FIRECRAWL_API_KEY or pass api_key.")
+        if self._is_cloud_service(api_url) and not api_key:
+            raise ValueError("API key is required for the cloud API. Set FIRECRAWL_API_KEY or pass api_key.")
         self.http_client = HttpClient(api_key, api_url)
         self.async_http_client = AsyncHttpClient(api_key, api_url)

@@ -77,33 +82,91 @@ class AsyncFirecrawlClient:
         request = CrawlRequest(url=url, **kwargs)
         return await async_crawl.start_crawl(self.async_http_client, request)

-    async def wait_crawl(
-
-
+    async def wait_crawl(
+        self,
+        job_id: str,
+        poll_interval: int = 2,
+        timeout: Optional[int] = None,
+        *,
+        request_timeout: Optional[float] = None,
+    ) -> CrawlJob:
+        """
+        Polls the status of a crawl job until it reaches a terminal state.
+
+        Args:
+            job_id (str): The ID of the crawl job to poll.
+            poll_interval (int, optional): Number of seconds to wait between polling attempts. Defaults to 2.
+            timeout (Optional[int], optional): Maximum number of seconds to wait for the entire crawl job to complete before timing out. If None, waits indefinitely. Defaults to None.
+            request_timeout (Optional[float], optional): Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout. If None, no per-request timeout is set. Defaults to None.
+
+        Returns:
+            CrawlJob: The final status of the crawl job when it reaches a terminal state.
+
+        Raises:
+            TimeoutError: If the crawl does not reach a terminal state within the specified timeout.
+
+        Terminal states:
+            - "completed": The crawl finished successfully.
+            - "failed": The crawl finished with an error.
+            - "cancelled": The crawl was cancelled.
+        """
+        start = time.monotonic()
         while True:
-            status = await async_crawl.get_crawl_status(
-
+            status = await async_crawl.get_crawl_status(
+                self.async_http_client,
+                job_id,
+                request_timeout=request_timeout,
+            )
+            if status.status in ["completed", "failed", "cancelled"]:
                 return status
-            if timeout and (
+            if timeout and (time.monotonic() - start) > timeout:
                 raise TimeoutError("Crawl wait timed out")
             await asyncio.sleep(poll_interval)

     async def crawl(self, **kwargs) -> CrawlJob:
         # wrapper combining start and wait
-        resp = await self.start_crawl(
+        resp = await self.start_crawl(
+            **{k: v for k, v in kwargs.items() if k not in ("poll_interval", "timeout", "request_timeout")}
+        )
         poll_interval = kwargs.get("poll_interval", 2)
         timeout = kwargs.get("timeout")
-
+        request_timeout = kwargs.get("request_timeout")
+        effective_request_timeout = request_timeout if request_timeout is not None else timeout
+        return await self.wait_crawl(
+            resp.id,
+            poll_interval=poll_interval,
+            timeout=timeout,
+            request_timeout=effective_request_timeout,
+        )

     async def get_crawl_status(
-        self,
+        self,
         job_id: str,
-        pagination_config: Optional[PaginationConfig] = None
+        pagination_config: Optional[PaginationConfig] = None,
+        *,
+        request_timeout: Optional[float] = None,
     ) -> CrawlJob:
+        """
+        Get the status of a crawl job.
+
+        Args:
+            job_id: ID of the crawl job
+            pagination_config: Optional configuration for pagination behavior
+            request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+                is enabled (default) and there are multiple pages of results, this timeout applies to
+                each page request separately, not to the entire operation
+
+        Returns:
+            CrawlJob with current status and data
+
+        Raises:
+            Exception: If the status check fails
+        """
         return await async_crawl.get_crawl_status(
-            self.async_http_client,
+            self.async_http_client,
             job_id,
-            pagination_config=pagination_config
+            pagination_config=pagination_config,
+            request_timeout=request_timeout,
         )

     async def cancel_crawl(self, job_id: str) -> bool:
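
The async client mirrors the same surface. A sketch under the same assumptions (placeholder URL, illustrative values), also showing that crawl() falls back to timeout as the per-request timeout when request_timeout is not given:

    import asyncio
    from firecrawl.v2.client_async import AsyncFirecrawlClient  # module path as shown above

    async def main() -> None:
        # No API key needed when pointing at a self-hosted URL.
        client = AsyncFirecrawlClient(api_url="http://localhost:3002")

        # request_timeout is forwarded to every status/pagination request made by wait_crawl.
        job = await client.crawl(
            url="https://example.com",
            poll_interval=2,
            timeout=300,
            request_timeout=30.0,
        )
        print(job.status, len(job.data or []))

    asyncio.run(main())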

firecrawl/v2/methods/aio/crawl.py
@@ -87,9 +87,11 @@ async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlRe


 async def get_crawl_status(
-    client: AsyncHttpClient,
+    client: AsyncHttpClient,
     job_id: str,
-    pagination_config: Optional[PaginationConfig] = None
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> CrawlJob:
     """
     Get the status of a crawl job.
@@ -98,6 +100,9 @@ async def get_crawl_status(
         client: Async HTTP client instance
         job_id: ID of the crawl job
         pagination_config: Optional configuration for pagination limits
+        request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+            is enabled (default) and there are multiple pages of results, this timeout applies to
+            each page request separately, not to the entire operation

     Returns:
         CrawlJob with job information
@@ -105,7 +110,7 @@ async def get_crawl_status(
     Raises:
         Exception: If the status check fails
     """
-    response = await client.get(f"/v2/crawl/{job_id}")
+    response = await client.get(f"/v2/crawl/{job_id}", timeout=request_timeout)
     if response.status_code >= 400:
         handle_response_error(response, "get crawl status")
     body = response.json()
@@ -120,10 +125,11 @@ async def get_crawl_status(
     auto_paginate = pagination_config.auto_paginate if pagination_config else True
     if auto_paginate and body.get("next"):
         documents = await _fetch_all_pages_async(
-            client,
-            body.get("next"),
-            documents,
-            pagination_config
+            client,
+            body.get("next"),
+            documents,
+            pagination_config,
+            request_timeout=request_timeout,
         )

     return CrawlJob(
@@ -142,7 +148,9 @@ async def _fetch_all_pages_async(
     client: AsyncHttpClient,
     next_url: str,
     initial_documents: List[Document],
-    pagination_config: Optional[PaginationConfig] = None
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> List[Document]:
     """
     Fetch all pages of crawl results asynchronously.
@@ -152,6 +160,7 @@ async def _fetch_all_pages_async(
         next_url: URL for the next page
         initial_documents: Documents from the first page
         pagination_config: Optional configuration for pagination limits
+        request_timeout: Optional timeout (in seconds) for the underlying HTTP request

     Returns:
         List of all documents from all pages
@@ -176,7 +185,7 @@ async def _fetch_all_pages_async(
             break

         # Fetch next page
-        response = await client.get(current_url)
+        response = await client.get(current_url, timeout=request_timeout)

         if response.status_code >= 400:
             # Log error but continue with what we have