firecrawl-4.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
firecrawl/__tests__/unit/v2/methods/test_branding.py
@@ -0,0 +1,214 @@
import pytest
from unittest.mock import Mock, MagicMock
from firecrawl.v2.methods.scrape import scrape
from firecrawl.v2.types import ScrapeOptions, Document


class TestBrandingFormat:
    """Unit tests for branding format support."""

    def test_scrape_with_branding_format_returns_branding_data(self):
        """Test that scraping with branding format returns branding data."""
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": True,
            "data": {
                "markdown": "# Example",
                "branding": {
                    "colorScheme": "light",
                    "colors": {
                        "primary": "#E11D48",
                        "secondary": "#3B82F6",
                        "accent": "#F59E0B"
                    },
                    "typography": {
                        "fontFamilies": {
                            "primary": "Inter",
                            "heading": "Poppins"
                        },
                        "fontSizes": {
                            "h1": "2.5rem",
                            "body": "1rem"
                        }
                    },
                    "spacing": {
                        "baseUnit": 8
                    },
                    "components": {
                        "buttonPrimary": {
                            "background": "#E11D48",
                            "textColor": "#FFFFFF",
                            "borderRadius": "0.5rem"
                        }
                    }
                }
            }
        }

        mock_client = Mock()
        mock_client.post.return_value = mock_response

        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))

        assert result.branding is not None
        assert result.branding.color_scheme == "light"
        assert result.branding.colors["primary"] == "#E11D48"
        assert result.branding.typography["fontFamilies"]["primary"] == "Inter"
        assert result.branding.spacing["baseUnit"] == 8
        assert result.branding.components["buttonPrimary"]["background"] == "#E11D48"

    def test_scrape_with_branding_and_markdown_formats_returns_both(self):
        """Test that scraping with both branding and markdown formats returns both."""
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": True,
            "data": {
                "markdown": "# Example Content",
                "branding": {
                    "colorScheme": "dark",
                    "colors": {
                        "primary": "#10B981"
                    },
                    "typography": {
                        "fontFamilies": {
                            "primary": "Roboto"
                        }
                    }
                }
            }
        }

        mock_client = Mock()
        mock_client.post.return_value = mock_response

        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["markdown", "branding"]))

        assert result.markdown == "# Example Content"
        assert result.branding is not None
        assert result.branding.color_scheme == "dark"
        assert result.branding.colors["primary"] == "#10B981"

    def test_scrape_without_branding_format_does_not_return_branding(self):
        """Test that scraping without branding format does not return branding."""
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": True,
            "data": {
                "markdown": "# Example"
            }
        }

        mock_client = Mock()
        mock_client.post.return_value = mock_response

        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["markdown"]))

        assert result.markdown == "# Example"
        assert result.branding is None

    def test_branding_format_with_all_nested_fields(self):
        """Test branding format with all nested fields populated."""
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": True,
            "data": {
                "branding": {
                    "colorScheme": "light",
                    "logo": "https://example.com/logo.png",
                    "fonts": [
                        {"family": "Inter", "weight": 400},
                        {"family": "Poppins", "weight": 700}
                    ],
                    "colors": {
                        "primary": "#E11D48",
                        "background": "#FFFFFF"
                    },
                    "typography": {
                        "fontFamilies": {"primary": "Inter"},
                        "fontStacks": {"body": ["Inter", "sans-serif"]},
                        "fontSizes": {"h1": "2.5rem"},
                        "lineHeights": {"body": 1.5},
                        "fontWeights": {"regular": 400}
                    },
                    "spacing": {
                        "baseUnit": 8,
                        "padding": {"sm": 8, "md": 16}
                    },
                    "components": {
                        "buttonPrimary": {
                            "background": "#E11D48",
                            "textColor": "#FFFFFF"
                        }
                    },
                    "icons": {
                        "style": "outline",
                        "primaryColor": "#E11D48"
                    },
                    "images": {
                        "logo": "https://example.com/logo.png",
                        "favicon": "https://example.com/favicon.ico"
                    },
                    "animations": {
                        "transitionDuration": "200ms",
                        "easing": "ease-in-out"
                    },
                    "layout": {
                        "grid": {"columns": 12, "maxWidth": "1200px"},
                        "headerHeight": "64px"
                    },
                    "tone": {
                        "voice": "professional",
                        "emojiUsage": "minimal"
                    },
                    "personality": {
                        "tone": "professional",
                        "energy": "medium",
                        "targetAudience": "developers"
                    }
                }
            }
        }

        mock_client = Mock()
        mock_client.post.return_value = mock_response

        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))

        assert result.branding is not None
        assert result.branding.color_scheme == "light"
        assert result.branding.logo == "https://example.com/logo.png"
        assert len(result.branding.fonts) == 2
        assert result.branding.typography["fontStacks"]["body"] == ["Inter", "sans-serif"]
        assert result.branding.spacing["padding"] == {"sm": 8, "md": 16}
        assert result.branding.icons["style"] == "outline"
        assert result.branding.images["favicon"] == "https://example.com/favicon.ico"
        assert result.branding.animations["easing"] == "ease-in-out"
        assert result.branding.layout["grid"]["columns"] == 12
        assert result.branding.personality["tone"] == "professional"

    def test_branding_colorscheme_normalization(self):
        """Test that colorScheme is normalized to color_scheme."""
        mock_response = Mock()
        mock_response.ok = True
        mock_response.json.return_value = {
            "success": True,
            "data": {
                "branding": {
                    "colorScheme": "dark",
                    "colors": {"primary": "#000000"}
                }
            }
        }

        mock_client = Mock()
        mock_client.post.return_value = mock_response

        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))

        assert result.branding is not None
        assert result.branding.color_scheme == "dark"
        assert not hasattr(result.branding, "colorScheme")
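Taken together, these tests imply the call pattern for the v2 scrape helper: pass an HTTP client, a URL, and ScrapeOptions(formats=["branding"]), then read the branding data back through snake_case attributes on the returned document. The following is a minimal sketch of that pattern, assuming the firecrawl 4.12.0 package is installed; it reuses the mocked-client setup and payload shape from the tests above and is an illustration, not part of the published wheel.

from unittest.mock import Mock

from firecrawl.v2.methods.scrape import scrape
from firecrawl.v2.types import ScrapeOptions

# Fake HTTP response shaped like the API payloads used in the tests above.
mock_response = Mock()
mock_response.ok = True
mock_response.json.return_value = {
    "success": True,
    "data": {
        "markdown": "# Example",
        "branding": {"colorScheme": "light", "colors": {"primary": "#E11D48"}},
    },
}

mock_client = Mock()
mock_client.post.return_value = mock_response

doc = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))
print(doc.branding.color_scheme)       # "light" (camelCase key normalized to snake_case)
print(doc.branding.colors["primary"])  # "#E11D48"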
firecrawl/__tests__/unit/v2/methods/test_crawl_params.py
@@ -0,0 +1,70 @@
"""
Unit tests for crawl params functionality in Firecrawl v2 SDK.
"""

import pytest
from firecrawl.v2.types import CrawlParamsRequest, CrawlParamsData


class TestCrawlParamsRequest:
    """Unit tests for CrawlParamsRequest."""

    def test_crawl_params_request_creation(self):
        """Test creating CrawlParamsRequest with valid data."""
        request = CrawlParamsRequest(
            url="https://example.com",
            prompt="Extract all blog posts"
        )

        assert request.url == "https://example.com"
        assert request.prompt == "Extract all blog posts"

    def test_crawl_params_request_serialization(self):
        """Test that CrawlParamsRequest serializes correctly."""
        request = CrawlParamsRequest(
            url="https://example.com",
            prompt="Extract all blog posts and documentation"
        )

        data = request.model_dump()

        assert data["url"] == "https://example.com"
        assert data["prompt"] == "Extract all blog posts and documentation"


class TestCrawlParamsData:
    """Unit tests for CrawlParamsData."""

    def test_crawl_params_data_creation(self):
        """Test creating CrawlParamsData with minimal data."""
        data = CrawlParamsData()

        assert data.include_paths is None
        assert data.exclude_paths is None
        assert data.max_discovery_depth is None
        assert data.ignore_sitemap is False
        assert data.limit is None
        assert data.crawl_entire_domain is False
        assert data.allow_external_links is False
        assert data.scrape_options is None
        assert data.warning is None

    def test_crawl_params_data_with_values(self):
        """Test creating CrawlParamsData with values."""
        data = CrawlParamsData(
            include_paths=["/blog/*"],
            exclude_paths=["/admin/*"],
            max_discovery_depth=3,
            limit=50,
            crawl_entire_domain=True,
            allow_external_links=False,
            warning="Test warning"
        )

        assert data.include_paths == ["/blog/*"]
        assert data.exclude_paths == ["/admin/*"]
        assert data.max_discovery_depth == 3
        assert data.limit == 50
        assert data.crawl_entire_domain is True
        assert data.allow_external_links is False
        assert data.warning == "Test warning"
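Read together, CrawlParamsRequest and CrawlParamsData look like the two halves of a prompt-to-crawl-options flow: the request carries a URL plus a natural-language prompt, and the data object carries the derived crawl settings (paths, depth, limit, flags) plus an optional warning. Below is a small sketch of both models, assuming the package is installed; the field names are taken straight from the assertions above, and the pairing of the two types is an inference from these tests rather than documented behavior.

from firecrawl.v2.types import CrawlParamsData, CrawlParamsRequest

request = CrawlParamsRequest(url="https://example.com", prompt="Extract all blog posts")
print(request.model_dump())  # {"url": "https://example.com", "prompt": "Extract all blog posts", ...}

params = CrawlParamsData(
    include_paths=["/blog/*"],
    exclude_paths=["/admin/*"],
    max_discovery_depth=3,
    limit=50,
    crawl_entire_domain=True,
)
print(params.ignore_sitemap)  # False by default, per the tests above
print(params.warning)         # None unless a warning is attached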
firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py
@@ -0,0 +1,240 @@
import pytest
from firecrawl.v2.types import CrawlRequest, ScrapeOptions
from firecrawl.v2.methods.crawl import _prepare_crawl_request


class TestCrawlRequestPreparation:
    """Unit tests for crawl request preparation."""

    def test_basic_request_preparation(self):
        """Test basic request preparation with minimal fields."""
        request = CrawlRequest(url="https://example.com")
        data = _prepare_crawl_request(request)

        # Check basic fields
        assert data["url"] == "https://example.com"

        # Check that no options are present
        assert "limit" not in data
        assert "prompt" not in data

    def test_crawl_options_conversion(self):
        """Test that CrawlOptions fields are converted to camelCase."""
        request = CrawlRequest(
            url="https://example.com",
            limit=10,
            max_discovery_depth=3,
            sitemap="skip",
            crawl_entire_domain=False,
            allow_external_links=True
        )

        data = _prepare_crawl_request(request)

        # Check basic field
        assert data["url"] == "https://example.com"

        # Check snake_case to camelCase conversions
        assert "limit" in data
        assert data["limit"] == 10
        assert "maxDiscoveryDepth" in data
        assert data["maxDiscoveryDepth"] == 3
        assert "sitemap" in data
        assert data["sitemap"] == "skip"
        assert "crawlEntireDomain" in data
        assert data["crawlEntireDomain"] is False
        assert "allowExternalLinks" in data
        assert data["allowExternalLinks"] is True

        # Check that snake_case fields are not present
        assert "ignore_sitemap" not in data
        assert "crawl_entire_domain" not in data
        assert "allow_external_links" not in data

    def test_scrape_options_conversion(self):
        """Test that nested ScrapeOptions are converted to camelCase."""
        scrape_opts = ScrapeOptions(
            formats=["markdown", "html"],
            headers={"User-Agent": "Test"},
            include_tags=["h1", "h2"],
            exclude_tags=["nav"],
            only_main_content=False,
            timeout=15000,
            wait_for=2000,
            mobile=True,
            skip_tls_verification=True,
            remove_base64_images=False
        )

        request = CrawlRequest(
            url="https://example.com",
            scrape_options=scrape_opts
        )

        data = _prepare_crawl_request(request)

        assert "scrapeOptions" in data
        assert "scrape_options" not in data

        # Check nested conversions
        scrape_data = data["scrapeOptions"]
        assert "includeTags" in scrape_data
        assert scrape_data["includeTags"] == ["h1", "h2"]
        assert "excludeTags" in scrape_data
        assert scrape_data["excludeTags"] == ["nav"]
        assert "onlyMainContent" in scrape_data
        assert scrape_data["onlyMainContent"] is False
        assert "waitFor" in scrape_data
        assert scrape_data["waitFor"] == 2000
        assert "skipTlsVerification" in scrape_data
        assert scrape_data["skipTlsVerification"] is True
        assert "removeBase64Images" in scrape_data
        assert scrape_data["removeBase64Images"] is False

    def test_all_fields_conversion(self):
        """Test request preparation with all possible fields."""
        scrape_opts = ScrapeOptions(
            formats=["markdown"],
            headers={"User-Agent": "Test"},
            only_main_content=False,
            mobile=True
        )

        request = CrawlRequest(
            url="https://example.com",
            prompt="Extract all blog posts and documentation",
            include_paths=["/blog/*", "/docs/*"],
            exclude_paths=["/admin/*"],
            max_discovery_depth=3,
            sitemap="include",
            limit=100,
            crawl_entire_domain=True,
            allow_external_links=False,
            scrape_options=scrape_opts
        )

        data = _prepare_crawl_request(request)

        # Check basic fields
        assert data["url"] == "https://example.com"
        assert data["prompt"] == "Extract all blog posts and documentation"

        # Check all CrawlOptions fields
        assert "includePaths" in data
        assert data["includePaths"] == ["/blog/*", "/docs/*"]
        assert "excludePaths" in data
        assert data["excludePaths"] == ["/admin/*"]
        assert "maxDiscoveryDepth" in data
        assert data["maxDiscoveryDepth"] == 3
        assert "sitemap" in data
        assert data["sitemap"] == "include"
        assert "limit" in data
        assert data["limit"] == 100
        assert "crawlEntireDomain" in data
        assert data["crawlEntireDomain"] is True
        assert "allowExternalLinks" in data
        assert data["allowExternalLinks"] is False

        # Check nested scrape options
        assert "scrapeOptions" in data
        scrape_data = data["scrapeOptions"]
        assert "onlyMainContent" in scrape_data
        assert scrape_data["onlyMainContent"] is False
        assert "mobile" in scrape_data
        assert scrape_data["mobile"] is True

    def test_none_values_handling(self):
        """Test that None values are handled correctly."""
        request = CrawlRequest(
            url="https://example.com",
            prompt=None,
            limit=None,
            scrape_options=None
        )

        data = _prepare_crawl_request(request)

        # Only the required field should be present
        assert "url" in data
        assert len(data) == 1  # Only url should be present

    def test_prompt_parameter(self):
        """Test that prompt parameter is included when provided."""
        request = CrawlRequest(
            url="https://example.com",
            prompt="Extract all blog posts"
        )

        data = _prepare_crawl_request(request)

        assert "url" in data
        assert "prompt" in data
        assert data["prompt"] == "Extract all blog posts"

    def test_empty_options(self):
        """Test that empty options are handled correctly."""
        request = CrawlRequest(
            url="https://example.com"
        )

        data = _prepare_crawl_request(request)

        # Should only have the required url field
        assert "url" in data
        assert len(data) == 1  # Only url should be present

    def test_validation_integration(self):
        """Test that validation is called during preparation."""
        # This should raise an error due to validation
        with pytest.raises(ValueError, match="URL cannot be empty"):
            request = CrawlRequest(url="")
            _prepare_crawl_request(request)

        # This should raise an error due to validation
        with pytest.raises(ValueError, match="Limit must be positive"):
            request = CrawlRequest(
                url="https://example.com",
                limit=0
            )
            _prepare_crawl_request(request)

    def test_scrape_options_shared_function_integration(self):
        """Test that the shared prepare_scrape_options function is being used."""
        # Test with all snake_case fields to ensure conversion
        scrape_opts = ScrapeOptions(
            include_tags=["h1", "h2"],
            exclude_tags=["nav"],
            only_main_content=False,
            wait_for=2000,
            skip_tls_verification=True,
            remove_base64_images=False
        )

        request = CrawlRequest(
            url="https://example.com",
            scrape_options=scrape_opts
        )

        data = _prepare_crawl_request(request)

        # Check that scrapeOptions is present and converted
        assert "scrapeOptions" in data
        scrape_data = data["scrapeOptions"]

        # Check all conversions are working
        assert "includeTags" in scrape_data
        assert "excludeTags" in scrape_data
        assert "onlyMainContent" in scrape_data
        assert "waitFor" in scrape_data
        assert "skipTlsVerification" in scrape_data
        assert "removeBase64Images" in scrape_data

        # Check that snake_case fields are not present
        assert "include_tags" not in scrape_data
        assert "exclude_tags" not in scrape_data
        assert "only_main_content" not in scrape_data
        assert "wait_for" not in scrape_data
        assert "skip_tls_verification" not in scrape_data
        assert "remove_base64_images" not in scrape_data
        assert "raw_html" not in scrape_data
        assert "screenshot_full_page" not in scrape_data
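The net effect these tests pin down is that _prepare_crawl_request turns a snake_case CrawlRequest into the camelCase JSON body sent to the API, omits unset fields, and runs validation along the way. Below is a short sketch of that transformation, assuming the package is installed; the keys listed in the comment are only the ones the tests above assert, not a full specification of the payload.

from firecrawl.v2.methods.crawl import _prepare_crawl_request
from firecrawl.v2.types import CrawlRequest, ScrapeOptions

request = CrawlRequest(
    url="https://example.com",
    limit=10,
    max_discovery_depth=3,
    crawl_entire_domain=True,
    scrape_options=ScrapeOptions(only_main_content=False, wait_for=2000),
)

payload = _prepare_crawl_request(request)
print(payload)
# Expected keys (per the assertions above): "url", "limit", "maxDiscoveryDepth",
# "crawlEntireDomain", and "scrapeOptions" containing "onlyMainContent" / "waitFor".
# The snake_case spellings and fields left unset should not appear in the payload.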
firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py
@@ -0,0 +1,107 @@
import pytest
from firecrawl.v2.types import CrawlRequest, ScrapeOptions
from firecrawl.v2.methods.crawl import _validate_crawl_request


class TestCrawlRequestValidation:
    """Unit tests for crawl request validation."""

    def test_validate_empty_url(self):
        """Test validation with empty URL."""
        with pytest.raises(ValueError, match="URL cannot be empty"):
            request = CrawlRequest(url="")
            _validate_crawl_request(request)

    def test_validate_whitespace_url(self):
        """Test validation with whitespace-only URL."""
        with pytest.raises(ValueError, match="URL cannot be empty"):
            request = CrawlRequest(url=" ")
            _validate_crawl_request(request)

    def test_validate_valid_url(self):
        """Test validation with valid URL."""
        request = CrawlRequest(url="https://example.com")
        _validate_crawl_request(request)  # Should not raise

    def test_validate_invalid_limit(self):
        """Test validation with invalid limit."""
        with pytest.raises(ValueError, match="Limit must be positive"):
            request = CrawlRequest(
                url="https://example.com",
                limit=0
            )
            _validate_crawl_request(request)

    def test_validate_negative_limit(self):
        """Test validation with negative limit."""
        with pytest.raises(ValueError, match="Limit must be positive"):
            request = CrawlRequest(
                url="https://example.com",
                limit=-5
            )
            _validate_crawl_request(request)

    def test_validate_valid_limit(self):
        """Test validation with valid limit."""
        request = CrawlRequest(
            url="https://example.com",
            limit=10
        )
        _validate_crawl_request(request)  # Should not raise

    def test_validate_with_prompt(self):
        """Test validation with prompt."""
        request = CrawlRequest(
            url="https://example.com",
            prompt="Extract all blog posts"
        )
        _validate_crawl_request(request)  # Should not raise

    def test_validate_with_prompt_and_options(self):
        """Test validation with prompt and options."""
        request = CrawlRequest(
            url="https://example.com",
            prompt="Extract all blog posts",
            limit=10
        )
        _validate_crawl_request(request)  # Should not raise

    def test_validate_none_options(self):
        """Test validation with None options."""
        request = CrawlRequest(url="https://example.com")
        _validate_crawl_request(request)  # Should not raise

    def test_validate_complex_options(self):
        """Test validation with complex options."""
        scrape_opts = ScrapeOptions(
            formats=["markdown"],
            only_main_content=False,
            mobile=True
        )

        request = CrawlRequest(
            url="https://example.com",
            limit=50,
            max_discovery_depth=3,
            scrape_options=scrape_opts
        )
        _validate_crawl_request(request)  # Should not raise

    def test_validate_scrape_options_integration(self):
        """Test that scrape_options validation is integrated."""
        # Test with valid scrape options
        scrape_opts = ScrapeOptions(formats=["markdown"], timeout=30000)
        request = CrawlRequest(
            url="https://example.com",
            scrape_options=scrape_opts
        )
        _validate_crawl_request(request)  # Should not raise

        # Test with invalid scrape options (should raise error)
        invalid_scrape_opts = ScrapeOptions(timeout=-1000)
        request = CrawlRequest(
            url="https://example.com",
            scrape_options=invalid_scrape_opts
        )
        with pytest.raises(ValueError, match="Timeout must be positive"):
            _validate_crawl_request(request)
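In practice this validator is what surfaces these errors to a caller before any request is sent, including errors raised for the nested ScrapeOptions. A small sketch of the failure modes the tests cover, assuming the package is installed; the exact error strings are inferred from the regex matches above, so the comments only claim that the messages contain those phrases.

from firecrawl.v2.methods.crawl import _validate_crawl_request
from firecrawl.v2.types import CrawlRequest, ScrapeOptions

try:
    _validate_crawl_request(CrawlRequest(url="https://example.com", limit=0))
except ValueError as exc:
    print(exc)  # message contains "Limit must be positive"

try:
    _validate_crawl_request(
        CrawlRequest(url="https://example.com", scrape_options=ScrapeOptions(timeout=-1000))
    )
except ValueError as exc:
    print(exc)  # nested ScrapeOptions validation; message contains "Timeout must be positive"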
firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py
@@ -0,0 +1,54 @@
import pytest
from firecrawl.v2.types import MapOptions
from firecrawl.v2.methods.map import _prepare_map_request


class TestMapRequestPreparation:
    """Unit tests for map request preparation."""

    def test_basic_request_preparation(self):
        data = _prepare_map_request("https://example.com")
        assert data["url"] == "https://example.com"
        # Default sitemap handling should be "include" when no flags provided
        assert "sitemap" not in data  # we only send when options provided

    def test_sitemap_transformations(self):
        # sitemap -> "only"
        opts = MapOptions(sitemap="only")
        data = _prepare_map_request("https://example.com", opts)
        assert data["sitemap"] == "only"

        # sitemap -> "skip"
        opts = MapOptions(sitemap="skip")
        data = _prepare_map_request("https://example.com", opts)
        assert data["sitemap"] == "skip"

        # default when options present but sitemap left as default -> include
        opts = MapOptions(search="docs")
        data = _prepare_map_request("https://example.com", opts)
        assert data["sitemap"] == "include"

    def test_field_conversions(self):
        opts = MapOptions(
            search="docs",
            include_subdomains=True,
            limit=25,
            sitemap="only",
            timeout=15000,
            integration=" _unit-test ",
        )
        data = _prepare_map_request("https://example.com", opts)

        assert data["url"] == "https://example.com"
        assert data["search"] == "docs"
        assert data["includeSubdomains"] is True
        assert data["limit"] == 25
        assert data["sitemap"] == "only"
        assert data["timeout"] == 15000
        assert data["integration"] == "_unit-test"

    def test_invalid_url(self):
        with pytest.raises(ValueError):
            _prepare_map_request("")
        with pytest.raises(ValueError):
            _prepare_map_request(" ")
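Finally, the map tests pin down when the sitemap field is sent at all: never for a bare URL, and as an explicit "only" / "skip" / "include" value once MapOptions are provided (with "include" as the default, field names converted to camelCase, and the integration string trimmed). A brief sketch of that behavior, assuming the package is installed:

from firecrawl.v2.methods.map import _prepare_map_request
from firecrawl.v2.types import MapOptions

# Bare URL: only "url" is sent, no "sitemap" key.
print(_prepare_map_request("https://example.com"))

# Options present but sitemap left at its default: "include" is sent.
print(_prepare_map_request("https://example.com", MapOptions(search="docs")))

# Explicit choice plus field-name conversion and trimming.
print(_prepare_map_request(
    "https://example.com",
    MapOptions(sitemap="skip", include_subdomains=True, integration=" _unit-test "),
))
# -> includes "sitemap": "skip", "includeSubdomains": True, "integration": "_unit-test"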