firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic.
- firecrawl/__init__.py +27 -19
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- firecrawl/__tests__/e2e/v2/test_search.py +265 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +241 -0
- firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
- firecrawl/types.py +157 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +4653 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +802 -0
- firecrawl/v2/client_async.py +250 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/batch.py +85 -0
- firecrawl/v2/methods/aio/crawl.py +174 -0
- firecrawl/v2/methods/aio/extract.py +126 -0
- firecrawl/v2/methods/aio/map.py +59 -0
- firecrawl/v2/methods/aio/scrape.py +36 -0
- firecrawl/v2/methods/aio/search.py +58 -0
- firecrawl/v2/methods/aio/usage.py +42 -0
- firecrawl/v2/methods/batch.py +420 -0
- firecrawl/v2/methods/crawl.py +468 -0
- firecrawl/v2/methods/extract.py +131 -0
- firecrawl/v2/methods/map.py +77 -0
- firecrawl/v2/methods/scrape.py +68 -0
- firecrawl/v2/methods/search.py +173 -0
- firecrawl/v2/methods/usage.py +41 -0
- firecrawl/v2/types.py +546 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +153 -0
- firecrawl/v2/utils/http_client_async.py +64 -0
- firecrawl/v2/utils/validation.py +324 -0
- firecrawl/v2/watcher.py +312 -0
- firecrawl/v2/watcher_async.py +245 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
- firecrawl-3.0.3.dist-info/RECORD +78 -0
- tests/test_timeout_conversion.py +117 -0
- firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- firecrawl-2.16.5.dist-info/RECORD +0 -12
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
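The bulk of this release replaces the single `firecrawl.firecrawl` module with versioned v1/v2 clients, async (aio) method modules, job watchers, and a large new test suite. As a rough orientation only — a minimal sketch pieced together from the e2e tests included below, not from the package's documentation, with placeholder credentials — the new top-level client is exercised roughly like this:

from firecrawl import Firecrawl

# Placeholder key/URL; the bundled tests read these from environment variables instead.
firecrawl = Firecrawl(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")

# Search returns a SearchData object with web/news/images buckets (see the search e2e tests below).
results = firecrawl.search(query="What is the capital of France?", limit=3)

# Crawls can be started and observed with a polling watcher (see the watcher e2e tests below).
job = firecrawl.start_crawl("https://docs.firecrawl.dev", limit=2)
watcher = firecrawl.watcher(job.id, kind="crawl", poll_interval=1, timeout=120)
watcher.add_listener(lambda status: print(status.status))
watcher.start()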
@@ -0,0 +1,265 @@
+from firecrawl import Firecrawl
+import os
+from dotenv import load_dotenv
+from firecrawl.types import SearchData, SearchResult, Document, ScrapeFormats, ScrapeOptions
+
+load_dotenv()
+
+firecrawl = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+def _collect_texts(entries):
+    texts = []
+    for r in entries or []:
+        title = getattr(r, 'title', None) if hasattr(r, 'title') else None
+        desc = getattr(r, 'description', None) if hasattr(r, 'description') else None
+        if title:
+            texts.append(str(title).lower())
+        if desc:
+            texts.append(str(desc).lower())
+    return texts
+
+def _is_document(entry) -> bool:
+    try:
+        from firecrawl.v2.types import Document
+        return isinstance(entry, Document) or \
+               hasattr(entry, 'markdown') or \
+               hasattr(entry, 'html') or \
+               hasattr(entry, 'raw_html') or \
+               hasattr(entry, 'json') or \
+               hasattr(entry, 'screenshot') or \
+               hasattr(entry, 'change_tracking') or \
+               hasattr(entry, 'summary')
+    except Exception:
+        return hasattr(entry, 'markdown') or \
+               hasattr(entry, 'html') or \
+               hasattr(entry, 'raw_html') or \
+               hasattr(entry, 'json') or \
+               hasattr(entry, 'screenshot') or \
+               hasattr(entry, 'change_tracking') or \
+               hasattr(entry, 'summary')
+
+def test_search_minimal_request():
+    results = firecrawl.search(
+        query="What is the capital of France?"
+    )
+
+    assert isinstance(results, SearchData)
+    assert hasattr(results, 'web')
+    assert results.web is not None
+    assert len(results.web) > 0
+    assert hasattr(results, 'news')
+    assert results.news is None
+    assert hasattr(results, 'images')
+    assert results.images is None
+
+    for result in results.web:
+        assert isinstance(result, SearchResult)
+        assert hasattr(result, 'url')
+        assert hasattr(result, 'title')
+        assert hasattr(result, 'description')
+        assert result.url.startswith('http')
+        assert result.title is not None
+        assert result.description is not None
+
+    all_text = ' '.join(_collect_texts(results.web))
+
+    assert 'paris' in all_text
+
+    assert results.news is None
+    assert results.images is None
+
+
+def test_search_with_sources():
+    """Test search with specific sources."""
+    results = firecrawl.search(
+        query="firecrawl",
+        sources=["web", "news"],
+        limit=3
+    )
+
+    assert isinstance(results, SearchData)
+
+    assert results.web is not None
+    assert len(results.web) <= 3
+
+    if results.news is not None:
+        assert len(results.news) <= 3
+
+    assert results.images is None
+
+    web_titles = [result.title.lower() for result in results.web]
+    web_descriptions = [result.description.lower() for result in results.web]
+    all_web_text = ' '.join(web_titles + web_descriptions)
+
+    assert 'firecrawl' in all_web_text
+
+def test_search_result_structure():
+    """Test that SearchResult objects have the correct structure."""
+    results = firecrawl.search(
+        query="test query",
+        limit=1
+    )
+
+    if results.web and len(results.web) > 0:
+        result = results.web[0]
+
+        assert hasattr(result, 'url')
+        assert hasattr(result, 'title')
+        assert hasattr(result, 'description')
+
+        assert isinstance(result.url, str)
+        assert isinstance(result.title, str) or result.title is None
+        assert isinstance(result.description, str) or result.description is None
+
+        # Test URL format
+        assert result.url.startswith('http')
+
+def test_search_all_parameters():
+    """Test search with all available parameters (comprehensive e2e test)."""
+    from firecrawl.types import ScrapeOptions, JsonFormat, Location, WaitAction
+
+    # Define a schema for JSON extraction
+    schema = {
+        "type": "object",
+        "properties": {
+            "title": {"type": "string"},
+            "description": {"type": "string"},
+            "url": {"type": "string"}
+        },
+        "required": ["title", "description"]
+    }
+
+    results = firecrawl.search(
+        query="artificial intelligence",
+        sources=[
+            {"type": "web"},
+            {"type": "news"}
+        ],
+        limit=3,
+        tbs="qdr:m",  # Last month
+        location="US",
+        ignore_invalid_urls=True,
+        timeout=60000,
+        scrape_options=ScrapeOptions(
+            formats=[
+                "markdown",
+                "html",
+                {
+                    "type": "json",
+                    "prompt": "Extract the title and description from the page",
+                    "schema": schema
+                },
+                {"type": "summary"}
+            ],
+            headers={"User-Agent": "Firecrawl-Test/1.0"},
+            include_tags=["h1", "h2", "p"],
+            exclude_tags=["nav", "footer"],
+            only_main_content=True,
+            wait_for=2000,
+            mobile=False,
+            skip_tls_verification=False,
+            remove_base64_images=True,
+            block_ads=True,
+            proxy="basic",
+            max_age=3600000,  # 1 hour cache
+            store_in_cache=True,
+            location=Location(
+                country="US",
+                languages=["en"]
+            ),
+            actions=[
+                WaitAction(milliseconds=1000)
+            ]
+            # Note: raw_html and screenshot_full_page are not supported by v2 API yet
+        )
+    )
+
+    # Test structure
+    assert isinstance(results, SearchData)
+    assert hasattr(results, 'web')
+    assert hasattr(results, 'news')
+    assert hasattr(results, 'images')
+
+    # Test that web results exist
+    assert results.web is not None
+    assert len(results.web) <= 3  # Should respect limit
+
+    # Test that results contain expected content for non-document entries only
+    non_doc_entries = [r for r in (results.web or []) if not _is_document(r)]
+    if non_doc_entries:
+        all_web_text = ' '.join(_collect_texts(non_doc_entries))
+        ai_terms = ['artificial', 'intelligence', 'ai', 'machine', 'learning']
+        assert any(term in all_web_text for term in ai_terms)
+
+    # Test that each result has proper structure
+    for result in results.web:
+        assert isinstance(result, (SearchResult, Document))
+        if isinstance(result, Document):
+            # Document path: ensure content present
+            assert (result.markdown is not None) or (result.html is not None)
+        else:
+            # LinkResult path
+            assert hasattr(result, 'url')
+            assert isinstance(result.url, str) and result.url.startswith('http')
+
+    # Test that news results exist (if API supports it)
+    if results.news is not None:
+        assert len(results.news) <= 3
+        for result in results.news:
+            assert isinstance(result, (SearchResult, Document))
+            if isinstance(result, Document):
+                assert (result.markdown is not None) or (result.html is not None)
+            else:
+                assert hasattr(result, 'url')
+                assert isinstance(result.url, str) and result.url.startswith('http')
+
+    # Test that unspecified sources are None
+    assert results.images is None
+
+
+def test_search_formats_flexibility():
+    """Test that both list and ScrapeFormats work for formats."""
+    from firecrawl.types import ScrapeFormats
+
+    # Test with list format
+    results1 = firecrawl.search(
+        query="python programming",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=["markdown"]
+        )
+    )
+
+    # Test with ScrapeFormats object
+    results2 = firecrawl.search(
+        query="python programming",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=ScrapeFormats(markdown=True)
+        )
+    )
+
+    # Both should work without errors
+    assert isinstance(results1, SearchData)
+    assert isinstance(results2, SearchData)
+    assert results1.web is not None
+    assert results2.web is not None
+
+def test_search_with_json_format_object():
+    """Search with scrape_options including a JSON format object (prompt + schema)."""
+    json_schema = {
+        "type": "object",
+        "properties": {
+            "title": {"type": "string"}
+        },
+        "required": ["title"],
+    }
+    results = firecrawl.search(
+        query="site:docs.firecrawl.dev",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}]
+        ),
+    )
+    assert isinstance(results, SearchData)
+    assert results.web is not None and len(results.web) >= 0
@@ -0,0 +1,26 @@
+import os
+from dotenv import load_dotenv
+from firecrawl import Firecrawl
+
+load_dotenv()
+
+
+class TestUsageE2E:
+    def setup_method(self):
+        # Environment is exported by conftest at import time
+        self.client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+    def test_get_concurrency(self):
+        resp = self.client.get_concurrency()
+        # Shape assertions (endpoint not live yet, but types are defined)
+        assert hasattr(resp, "concurrency")
+        assert hasattr(resp, "max_concurrency")
+
+    def test_get_credit_usage(self):
+        resp = self.client.get_credit_usage()
+        assert hasattr(resp, "remaining_credits")
+
+    def test_get_token_usage(self):
+        resp = self.client.get_token_usage()
+        assert hasattr(resp, "remaining_tokens")
+
@@ -0,0 +1,65 @@
+import os
+import time
+from dotenv import load_dotenv
+from firecrawl import Firecrawl
+
+load_dotenv()
+
+if not os.getenv("API_KEY"):
+    raise ValueError("API_KEY is not set")
+
+if not os.getenv("API_URL"):
+    raise ValueError("API_URL is not set")
+
+
+class TestWatcherE2E:
+    def setup_method(self):
+        from firecrawl import Firecrawl
+        self.client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+    def test_crawl_watcher(self):
+        # Start a small crawl job
+        start_job = self.client.start_crawl("https://docs.firecrawl.dev", limit=2)
+        job_id = start_job.id
+
+        statuses = []
+        w = self.client.watcher(job_id, kind="crawl", poll_interval=1, timeout=120)
+        w.add_listener(lambda s: statuses.append(s.status))
+        w.start()
+
+        # Wait for terminal state up to 180 seconds
+        deadline = time.time() + 180
+        while time.time() < deadline:
+            if statuses and statuses[-1] in ["completed", "failed"]:
+                break
+            time.sleep(1)
+
+        w.stop()
+
+        assert len(statuses) > 0
+        assert statuses[-1] in ["completed", "failed"]
+
+    def test_batch_watcher(self):
+        urls = [
+            "https://docs.firecrawl.dev",
+            "https://firecrawl.dev",
+        ]
+        start_resp = self.client.start_batch_scrape(urls, formats=["markdown"], max_concurrency=1)
+        job_id = start_resp.id
+
+        statuses = []
+        w = self.client.watcher(job_id, kind="batch", poll_interval=1, timeout=180)
+        w.add_listener(lambda s: statuses.append(s.status))
+        w.start()
+
+        deadline = time.time() + 240
+        while time.time() < deadline:
+            if statuses and statuses[-1] in ["completed", "failed", "cancelled"]:
+                break
+            time.sleep(1)
+
+        w.stop()
+
+        assert len(statuses) > 0
+        assert statuses[-1] in ["completed", "failed", "cancelled"]
+
@@ -0,0 +1,12 @@
+import pytest
+from firecrawl.v2.types import CrawlParamsRequest
+from firecrawl.v2.methods.aio import crawl as aio_crawl
+
+
+@pytest.mark.asyncio
+async def test_crawl_params_request_validation():
+    with pytest.raises(ValueError):
+        await aio_crawl.crawl_params_preview(None, CrawlParamsRequest(url="", prompt="x"))
+    with pytest.raises(ValueError):
+        await aio_crawl.crawl_params_preview(None, CrawlParamsRequest(url="https://x", prompt=""))
+
@@ -0,0 +1,61 @@
+from firecrawl.v2.types import CrawlRequest, ScrapeOptions, WebhookConfig
+from firecrawl.v2.methods.aio.crawl import _prepare_crawl_request
+
+
+class TestAsyncCrawlRequestPreparation:
+    def test_basic_request(self):
+        req = CrawlRequest(url="https://example.com")
+        payload = _prepare_crawl_request(req)
+        assert payload["url"] == "https://example.com"
+
+    def test_field_mappings(self):
+        req = CrawlRequest(
+            url="https://example.com",
+            include_paths=["/docs/*"],
+            exclude_paths=["/admin/*"],
+            max_discovery_depth=2,
+            ignore_sitemap=True,
+            ignore_query_parameters=True,
+            crawl_entire_domain=True,
+            allow_external_links=False,
+            allow_subdomains=True,
+            max_concurrency=5,
+            zero_data_retention=True,
+        )
+        payload = _prepare_crawl_request(req)
+        assert payload["includePaths"] == ["/docs/*"]
+        assert payload["excludePaths"] == ["/admin/*"]
+        assert payload["maxDiscoveryDepth"] == 2
+        assert payload["ignoreSitemap"] is True
+        assert payload["ignoreQueryParameters"] is True
+        assert payload["crawlEntireDomain"] is True
+        assert payload["allowExternalLinks"] is False
+        assert payload["allowSubdomains"] is True
+        assert payload["maxConcurrency"] == 5
+        assert payload["zeroDataRetention"] is True
+
+    def test_webhook_preparation(self):
+        # string webhook
+        req = CrawlRequest(url="https://example.com", webhook="https://example.com/hook")
+        payload = _prepare_crawl_request(req)
+        assert payload["webhook"] == "https://example.com/hook"
+
+        # object webhook
+        req2 = CrawlRequest(url="https://example.com", webhook=WebhookConfig(url="https://x/h", headers={"X": "1"}, events=["completed"]))
+        payload2 = _prepare_crawl_request(req2)
+        assert isinstance(payload2["webhook"], dict)
+        assert payload2["webhook"]["url"] == "https://x/h"
+        assert payload2["webhook"]["headers"] == {"X": "1"}
+
+    def test_webhook_none_values_excluded(self):
+        req = CrawlRequest(
+            url="https://example.com",
+            webhook=WebhookConfig(url="https://example.com/webhook", headers=None, metadata=None, events=None),
+        )
+        payload = _prepare_crawl_request(req)
+        webhook = payload["webhook"]
+        assert webhook["url"] == "https://example.com/webhook"
+        assert "headers" not in webhook
+        assert "metadata" not in webhook
+        assert "events" not in webhook
+
@@ -0,0 +1,12 @@
+from firecrawl.v2.types import CrawlRequest, ScrapeOptions
+from firecrawl.v2.methods.aio.crawl import _prepare_crawl_request
+import pytest
+
+
+class TestAsyncCrawlValidation:
+    def test_invalid_url(self):
+        with pytest.raises(ValueError):
+            _prepare_crawl_request(CrawlRequest(url=""))
+        with pytest.raises(ValueError):
+            _prepare_crawl_request(CrawlRequest(url=" "))
+
@@ -0,0 +1,19 @@
+import pytest
+from firecrawl.v2.types import MapOptions
+from firecrawl.v2.methods.aio.map import _prepare_map_request
+
+
+class TestAsyncMapRequestPreparation:
+    def test_basic(self):
+        payload = _prepare_map_request("https://example.com")
+        assert payload["url"] == "https://example.com"
+
+    def test_fields(self):
+        opts = MapOptions(search="docs", include_subdomains=True, limit=10, sitemap="only", timeout=15000)
+        payload = _prepare_map_request("https://example.com", opts)
+        assert payload["search"] == "docs"
+        assert payload["includeSubdomains"] is True
+        assert payload["limit"] == 10
+        assert payload["sitemap"] == "only"
+        assert payload["timeout"] == 15000
+
@@ -0,0 +1,50 @@
+import pytest
+from firecrawl.v2.types import ScrapeOptions, Location
+from firecrawl.v2.methods.aio.scrape import _prepare_scrape_request
+
+
+class TestAsyncScrapeRequestPreparation:
+    @pytest.mark.asyncio
+    async def test_basic_request_preparation(self):
+        payload = await _prepare_scrape_request("https://example.com", None)
+        assert payload["url"] == "https://example.com"
+
+    @pytest.mark.asyncio
+    async def test_options_conversion(self):
+        opts = ScrapeOptions(
+            formats=["markdown", {"type": "screenshot", "full_page": True, "quality": 80}],
+            include_tags=["main"],
+            exclude_tags=["nav"],
+            only_main_content=True,
+            wait_for=500,
+            timeout=30000,
+            mobile=True,
+            parsers=["pdf"],
+            location=Location(country="us", languages=["en"]),
+            skip_tls_verification=False,
+            remove_base64_images=False,
+            fast_mode=True,
+            use_mock="test",
+            block_ads=False,
+            proxy="basic",
+            max_age=1000,
+            store_in_cache=False,
+        )
+        payload = await _prepare_scrape_request("https://example.com", opts)
+        assert payload["url"] == "https://example.com"
+        assert isinstance(payload.get("formats"), list) and "markdown" in payload["formats"]
+        assert payload["includeTags"] == ["main"]
+        assert payload["excludeTags"] == ["nav"]
+        assert payload["onlyMainContent"] is True
+        assert payload["waitFor"] == 500
+        assert payload["timeout"] == 30000
+        assert payload["mobile"] is True
+        assert payload["skipTlsVerification"] is False
+        assert payload["removeBase64Images"] is False
+        assert payload["fastMode"] is True
+        assert payload["useMock"] == "test"
+        assert payload["blockAds"] is False
+        assert payload["proxy"] == "basic"
+        assert payload["maxAge"] == 1000
+        assert payload["storeInCache"] is False
+
@@ -0,0 +1,63 @@
+import pytest
+from firecrawl.v2.types import SearchRequest, ScrapeOptions
+from firecrawl.v2.methods.aio.search import _prepare_search_request
+
+
+class TestAsyncSearchRequestPreparation:
+    def test_basic_request_preparation(self):
+        request = SearchRequest(query="test query")
+        data = _prepare_search_request(request)
+        assert data["query"] == "test query"
+        assert "ignore_invalid_urls" not in data
+        assert "scrape_options" not in data
+
+    def test_all_fields_conversion(self):
+        scrape_opts = ScrapeOptions(
+            formats=["markdown"],
+            headers={"User-Agent": "Test"},
+            include_tags=["h1", "h2"],
+            exclude_tags=["nav"],
+            only_main_content=False,
+            timeout=15000,
+            wait_for=2000,
+            mobile=True,
+            skip_tls_verification=True,
+            remove_base64_images=False,
+        )
+        request = SearchRequest(
+            query="test query",
+            sources=["web", "news"],
+            limit=10,
+            tbs="qdr:w",
+            location="US",
+            ignore_invalid_urls=False,
+            timeout=30000,
+            scrape_options=scrape_opts,
+        )
+        data = _prepare_search_request(request)
+        assert data["ignoreInvalidURLs"] is False
+        assert "scrapeOptions" in data
+
+    def test_exclude_none_behavior(self):
+        request = SearchRequest(
+            query="test",
+            sources=None,
+            limit=None,
+            tbs=None,
+            location=None,
+            ignore_invalid_urls=None,
+            timeout=None,
+            scrape_options=None,
+        )
+        data = _prepare_search_request(request)
+        assert "query" in data
+        assert len(data) == 1
+
+    def test_empty_scrape_options(self):
+        request = SearchRequest(query="test", scrape_options=ScrapeOptions())
+        data = _prepare_search_request(request)
+        assert "scrapeOptions" in data
+        scrape_data = data["scrapeOptions"]
+        assert "onlyMainContent" in scrape_data
+        assert "mobile" in scrape_data
+
@@ -0,0 +1,28 @@
+from firecrawl.v2.types import ScrapeOptions, Location
+from firecrawl.v2.methods.aio.batch import _prepare as _prepare_batch
+
+
+class TestAsyncBatchRequestPreparation:
+    def test_urls_validation_and_conversion(self):
+        payload = _prepare_batch(["https://example.com", "http://foo.bar"], options=None)
+        assert payload["urls"] == ["https://example.com", "http://foo.bar"]
+
+    def test_options_and_batch_fields(self):
+        opts = ScrapeOptions(formats=["markdown"], only_main_content=True)
+        payload = _prepare_batch(
+            ["https://example.com"],
+            options=opts,
+            webhook="https://hook.example",
+            append_to_id="00000000-0000-0000-0000-000000000000",
+            ignore_invalid_urls=True,
+            max_concurrency=3,
+            zero_data_retention=True,
+            integration="zapier",
+        )
+        assert payload["webhook"] == "https://hook.example"
+        assert payload["appendToId"] == "00000000-0000-0000-0000-000000000000"
+        assert payload["ignoreInvalidURLs"] is True
+        assert payload["maxConcurrency"] == 3
+        assert payload["zeroDataRetention"] is True
+        assert payload["integration"] == "zapier"
+