firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl might be problematic.

Files changed (82)
  1. firecrawl/__init__.py +27 -19
  2. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. firecrawl/__tests__/e2e/v2/test_search.py +265 -0
  18. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
  36. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. firecrawl/client.py +241 -0
  41. firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
  42. firecrawl/types.py +157 -0
  43. firecrawl/v1/__init__.py +14 -0
  44. firecrawl/v1/client.py +4653 -0
  45. firecrawl/v2/__init__.py +4 -0
  46. firecrawl/v2/client.py +802 -0
  47. firecrawl/v2/client_async.py +250 -0
  48. firecrawl/v2/methods/aio/__init__.py +1 -0
  49. firecrawl/v2/methods/aio/batch.py +85 -0
  50. firecrawl/v2/methods/aio/crawl.py +174 -0
  51. firecrawl/v2/methods/aio/extract.py +126 -0
  52. firecrawl/v2/methods/aio/map.py +59 -0
  53. firecrawl/v2/methods/aio/scrape.py +36 -0
  54. firecrawl/v2/methods/aio/search.py +58 -0
  55. firecrawl/v2/methods/aio/usage.py +42 -0
  56. firecrawl/v2/methods/batch.py +420 -0
  57. firecrawl/v2/methods/crawl.py +468 -0
  58. firecrawl/v2/methods/extract.py +131 -0
  59. firecrawl/v2/methods/map.py +77 -0
  60. firecrawl/v2/methods/scrape.py +68 -0
  61. firecrawl/v2/methods/search.py +173 -0
  62. firecrawl/v2/methods/usage.py +41 -0
  63. firecrawl/v2/types.py +546 -0
  64. firecrawl/v2/utils/__init__.py +9 -0
  65. firecrawl/v2/utils/error_handler.py +107 -0
  66. firecrawl/v2/utils/get_version.py +15 -0
  67. firecrawl/v2/utils/http_client.py +153 -0
  68. firecrawl/v2/utils/http_client_async.py +64 -0
  69. firecrawl/v2/utils/validation.py +324 -0
  70. firecrawl/v2/watcher.py +312 -0
  71. firecrawl/v2/watcher_async.py +245 -0
  72. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
  73. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
  74. firecrawl-3.0.3.dist-info/RECORD +78 -0
  75. tests/test_timeout_conversion.py +117 -0
  76. firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  77. firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  78. firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  79. firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
  80. firecrawl-2.16.5.dist-info/RECORD +0 -12
  81. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
  82. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
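The hunks below are the new v2 test suites added in 3.0.3. They exercise the reworked top-level client (`Firecrawl`), with the old single-module `firecrawl.py` kept as `firecrawl.backup.py` and the previous client moved under `firecrawl/v1/`. As a minimal sketch of the usage pattern these tests assume — credentials read from `API_KEY`/`API_URL` in the environment and a synchronous `search` call — illustrative only, derived from the test code in this diff rather than separate documentation:

import os
from firecrawl import Firecrawl

# Mirrors the e2e test setup below: API key and URL come from the environment.
client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))

# results.web is a list of SearchResult objects with url/title/description.
results = client.search(query="What is the capital of France?", limit=3)
for r in results.web or []:
    print(r.url, r.title)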
firecrawl/__tests__/e2e/v2/test_search.py
@@ -0,0 +1,265 @@
+ from firecrawl import Firecrawl
+ import os
+ from dotenv import load_dotenv
+ from firecrawl.types import SearchData, SearchResult, Document, ScrapeFormats, ScrapeOptions
+
+ load_dotenv()
+
+ firecrawl = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+ def _collect_texts(entries):
+     texts = []
+     for r in entries or []:
+         title = getattr(r, 'title', None) if hasattr(r, 'title') else None
+         desc = getattr(r, 'description', None) if hasattr(r, 'description') else None
+         if title:
+             texts.append(str(title).lower())
+         if desc:
+             texts.append(str(desc).lower())
+     return texts
+
+ def _is_document(entry) -> bool:
+     try:
+         from firecrawl.v2.types import Document
+         return isinstance(entry, Document) or \
+             hasattr(entry, 'markdown') or \
+             hasattr(entry, 'html') or \
+             hasattr(entry, 'raw_html') or \
+             hasattr(entry, 'json') or \
+             hasattr(entry, 'screenshot') or \
+             hasattr(entry, 'change_tracking') or \
+             hasattr(entry, 'summary')
+     except Exception:
+         return hasattr(entry, 'markdown') or \
+             hasattr(entry, 'html') or \
+             hasattr(entry, 'raw_html') or \
+             hasattr(entry, 'json') or \
+             hasattr(entry, 'screenshot') or \
+             hasattr(entry, 'change_tracking') or \
+             hasattr(entry, 'summary')
+
+ def test_search_minimal_request():
+     results = firecrawl.search(
+         query="What is the capital of France?"
+     )
+
+     assert isinstance(results, SearchData)
+     assert hasattr(results, 'web')
+     assert results.web is not None
+     assert len(results.web) > 0
+     assert hasattr(results, 'news')
+     assert results.news is None
+     assert hasattr(results, 'images')
+     assert results.images is None
+
+     for result in results.web:
+         assert isinstance(result, SearchResult)
+         assert hasattr(result, 'url')
+         assert hasattr(result, 'title')
+         assert hasattr(result, 'description')
+         assert result.url.startswith('http')
+         assert result.title is not None
+         assert result.description is not None
+
+     all_text = ' '.join(_collect_texts(results.web))
+
+     assert 'paris' in all_text
+
+     assert results.news is None
+     assert results.images is None
+
+
+ def test_search_with_sources():
+     """Test search with specific sources."""
+     results = firecrawl.search(
+         query="firecrawl",
+         sources=["web", "news"],
+         limit=3
+     )
+
+     assert isinstance(results, SearchData)
+
+     assert results.web is not None
+     assert len(results.web) <= 3
+
+     if results.news is not None:
+         assert len(results.news) <= 3
+
+     assert results.images is None
+
+     web_titles = [result.title.lower() for result in results.web]
+     web_descriptions = [result.description.lower() for result in results.web]
+     all_web_text = ' '.join(web_titles + web_descriptions)
+
+     assert 'firecrawl' in all_web_text
+
+ def test_search_result_structure():
+     """Test that SearchResult objects have the correct structure."""
+     results = firecrawl.search(
+         query="test query",
+         limit=1
+     )
+
+     if results.web and len(results.web) > 0:
+         result = results.web[0]
+
+         assert hasattr(result, 'url')
+         assert hasattr(result, 'title')
+         assert hasattr(result, 'description')
+
+         assert isinstance(result.url, str)
+         assert isinstance(result.title, str) or result.title is None
+         assert isinstance(result.description, str) or result.description is None
+
+         # Test URL format
+         assert result.url.startswith('http')
+
+ def test_search_all_parameters():
+     """Test search with all available parameters (comprehensive e2e test)."""
+     from firecrawl.types import ScrapeOptions, JsonFormat, Location, WaitAction
+
+     # Define a schema for JSON extraction
+     schema = {
+         "type": "object",
+         "properties": {
+             "title": {"type": "string"},
+             "description": {"type": "string"},
+             "url": {"type": "string"}
+         },
+         "required": ["title", "description"]
+     }
+
+     results = firecrawl.search(
+         query="artificial intelligence",
+         sources=[
+             {"type": "web"},
+             {"type": "news"}
+         ],
+         limit=3,
+         tbs="qdr:m",  # Last month
+         location="US",
+         ignore_invalid_urls=True,
+         timeout=60000,
+         scrape_options=ScrapeOptions(
+             formats=[
+                 "markdown",
+                 "html",
+                 {
+                     "type": "json",
+                     "prompt": "Extract the title and description from the page",
+                     "schema": schema
+                 },
+                 {"type": "summary"}
+             ],
+             headers={"User-Agent": "Firecrawl-Test/1.0"},
+             include_tags=["h1", "h2", "p"],
+             exclude_tags=["nav", "footer"],
+             only_main_content=True,
+             wait_for=2000,
+             mobile=False,
+             skip_tls_verification=False,
+             remove_base64_images=True,
+             block_ads=True,
+             proxy="basic",
+             max_age=3600000,  # 1 hour cache
+             store_in_cache=True,
+             location=Location(
+                 country="US",
+                 languages=["en"]
+             ),
+             actions=[
+                 WaitAction(milliseconds=1000)
+             ]
+             # Note: raw_html and screenshot_full_page are not supported by v2 API yet
+         )
+     )
+
+     # Test structure
+     assert isinstance(results, SearchData)
+     assert hasattr(results, 'web')
+     assert hasattr(results, 'news')
+     assert hasattr(results, 'images')
+
+     # Test that web results exist
+     assert results.web is not None
+     assert len(results.web) <= 3  # Should respect limit
+
+     # Test that results contain expected content for non-document entries only
+     non_doc_entries = [r for r in (results.web or []) if not _is_document(r)]
+     if non_doc_entries:
+         all_web_text = ' '.join(_collect_texts(non_doc_entries))
+         ai_terms = ['artificial', 'intelligence', 'ai', 'machine', 'learning']
+         assert any(term in all_web_text for term in ai_terms)
+
+     # Test that each result has proper structure
+     for result in results.web:
+         assert isinstance(result, (SearchResult, Document))
+         if isinstance(result, Document):
+             # Document path: ensure content present
+             assert (result.markdown is not None) or (result.html is not None)
+         else:
+             # LinkResult path
+             assert hasattr(result, 'url')
+             assert isinstance(result.url, str) and result.url.startswith('http')
+
+     # Test that news results exist (if API supports it)
+     if results.news is not None:
+         assert len(results.news) <= 3
+         for result in results.news:
+             assert isinstance(result, (SearchResult, Document))
+             if isinstance(result, Document):
+                 assert (result.markdown is not None) or (result.html is not None)
+             else:
+                 assert hasattr(result, 'url')
+                 assert isinstance(result.url, str) and result.url.startswith('http')
+
+     # Test that unspecified sources are None
+     assert results.images is None
+
+
+ def test_search_formats_flexibility():
+     """Test that both list and ScrapeFormats work for formats."""
+     from firecrawl.types import ScrapeFormats
+
+     # Test with list format
+     results1 = firecrawl.search(
+         query="python programming",
+         limit=1,
+         scrape_options=ScrapeOptions(
+             formats=["markdown"]
+         )
+     )
+
+     # Test with ScrapeFormats object
+     results2 = firecrawl.search(
+         query="python programming",
+         limit=1,
+         scrape_options=ScrapeOptions(
+             formats=ScrapeFormats(markdown=True)
+         )
+     )
+
+     # Both should work without errors
+     assert isinstance(results1, SearchData)
+     assert isinstance(results2, SearchData)
+     assert results1.web is not None
+     assert results2.web is not None
+
+ def test_search_with_json_format_object():
+     """Search with scrape_options including a JSON format object (prompt + schema)."""
+     json_schema = {
+         "type": "object",
+         "properties": {
+             "title": {"type": "string"}
+         },
+         "required": ["title"],
+     }
+     results = firecrawl.search(
+         query="site:docs.firecrawl.dev",
+         limit=1,
+         scrape_options=ScrapeOptions(
+             formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}]
+         ),
+     )
+     assert isinstance(results, SearchData)
+     assert results.web is not None and len(results.web) >= 0
firecrawl/__tests__/e2e/v2/test_usage.py
@@ -0,0 +1,26 @@
+ import os
+ from dotenv import load_dotenv
+ from firecrawl import Firecrawl
+
+ load_dotenv()
+
+
+ class TestUsageE2E:
+     def setup_method(self):
+         # Environment is exported by conftest at import time
+         self.client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+     def test_get_concurrency(self):
+         resp = self.client.get_concurrency()
+         # Shape assertions (endpoint not live yet, but types are defined)
+         assert hasattr(resp, "concurrency")
+         assert hasattr(resp, "max_concurrency")
+
+     def test_get_credit_usage(self):
+         resp = self.client.get_credit_usage()
+         assert hasattr(resp, "remaining_credits")
+
+     def test_get_token_usage(self):
+         resp = self.client.get_token_usage()
+         assert hasattr(resp, "remaining_tokens")
+
firecrawl/__tests__/e2e/v2/test_watcher.py
@@ -0,0 +1,65 @@
+ import os
+ import time
+ from dotenv import load_dotenv
+ from firecrawl import Firecrawl
+
+ load_dotenv()
+
+ if not os.getenv("API_KEY"):
+     raise ValueError("API_KEY is not set")
+
+ if not os.getenv("API_URL"):
+     raise ValueError("API_URL is not set")
+
+
+ class TestWatcherE2E:
+     def setup_method(self):
+         from firecrawl import Firecrawl
+         self.client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+     def test_crawl_watcher(self):
+         # Start a small crawl job
+         start_job = self.client.start_crawl("https://docs.firecrawl.dev", limit=2)
+         job_id = start_job.id
+
+         statuses = []
+         w = self.client.watcher(job_id, kind="crawl", poll_interval=1, timeout=120)
+         w.add_listener(lambda s: statuses.append(s.status))
+         w.start()
+
+         # Wait for terminal state up to 180 seconds
+         deadline = time.time() + 180
+         while time.time() < deadline:
+             if statuses and statuses[-1] in ["completed", "failed"]:
+                 break
+             time.sleep(1)
+
+         w.stop()
+
+         assert len(statuses) > 0
+         assert statuses[-1] in ["completed", "failed"]
+
+     def test_batch_watcher(self):
+         urls = [
+             "https://docs.firecrawl.dev",
+             "https://firecrawl.dev",
+         ]
+         start_resp = self.client.start_batch_scrape(urls, formats=["markdown"], max_concurrency=1)
+         job_id = start_resp.id
+
+         statuses = []
+         w = self.client.watcher(job_id, kind="batch", poll_interval=1, timeout=180)
+         w.add_listener(lambda s: statuses.append(s.status))
+         w.start()
+
+         deadline = time.time() + 240
+         while time.time() < deadline:
+             if statuses and statuses[-1] in ["completed", "failed", "cancelled"]:
+                 break
+             time.sleep(1)
+
+         w.stop()
+
+         assert len(statuses) > 0
+         assert statuses[-1] in ["completed", "failed", "cancelled"]
+
firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py
@@ -0,0 +1,12 @@
+ import pytest
+ from firecrawl.v2.types import CrawlParamsRequest
+ from firecrawl.v2.methods.aio import crawl as aio_crawl
+
+
+ @pytest.mark.asyncio
+ async def test_crawl_params_request_validation():
+     with pytest.raises(ValueError):
+         await aio_crawl.crawl_params_preview(None, CrawlParamsRequest(url="", prompt="x"))
+     with pytest.raises(ValueError):
+         await aio_crawl.crawl_params_preview(None, CrawlParamsRequest(url="https://x", prompt=""))
+
firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py
@@ -0,0 +1,61 @@
+ from firecrawl.v2.types import CrawlRequest, ScrapeOptions, WebhookConfig
+ from firecrawl.v2.methods.aio.crawl import _prepare_crawl_request
+
+
+ class TestAsyncCrawlRequestPreparation:
+     def test_basic_request(self):
+         req = CrawlRequest(url="https://example.com")
+         payload = _prepare_crawl_request(req)
+         assert payload["url"] == "https://example.com"
+
+     def test_field_mappings(self):
+         req = CrawlRequest(
+             url="https://example.com",
+             include_paths=["/docs/*"],
+             exclude_paths=["/admin/*"],
+             max_discovery_depth=2,
+             ignore_sitemap=True,
+             ignore_query_parameters=True,
+             crawl_entire_domain=True,
+             allow_external_links=False,
+             allow_subdomains=True,
+             max_concurrency=5,
+             zero_data_retention=True,
+         )
+         payload = _prepare_crawl_request(req)
+         assert payload["includePaths"] == ["/docs/*"]
+         assert payload["excludePaths"] == ["/admin/*"]
+         assert payload["maxDiscoveryDepth"] == 2
+         assert payload["ignoreSitemap"] is True
+         assert payload["ignoreQueryParameters"] is True
+         assert payload["crawlEntireDomain"] is True
+         assert payload["allowExternalLinks"] is False
+         assert payload["allowSubdomains"] is True
+         assert payload["maxConcurrency"] == 5
+         assert payload["zeroDataRetention"] is True
+
+     def test_webhook_preparation(self):
+         # string webhook
+         req = CrawlRequest(url="https://example.com", webhook="https://example.com/hook")
+         payload = _prepare_crawl_request(req)
+         assert payload["webhook"] == "https://example.com/hook"
+
+         # object webhook
+         req2 = CrawlRequest(url="https://example.com", webhook=WebhookConfig(url="https://x/h", headers={"X": "1"}, events=["completed"]))
+         payload2 = _prepare_crawl_request(req2)
+         assert isinstance(payload2["webhook"], dict)
+         assert payload2["webhook"]["url"] == "https://x/h"
+         assert payload2["webhook"]["headers"] == {"X": "1"}
+
+     def test_webhook_none_values_excluded(self):
+         req = CrawlRequest(
+             url="https://example.com",
+             webhook=WebhookConfig(url="https://example.com/webhook", headers=None, metadata=None, events=None),
+         )
+         payload = _prepare_crawl_request(req)
+         webhook = payload["webhook"]
+         assert webhook["url"] == "https://example.com/webhook"
+         assert "headers" not in webhook
+         assert "metadata" not in webhook
+         assert "events" not in webhook
+
firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py
@@ -0,0 +1,12 @@
+ from firecrawl.v2.types import CrawlRequest, ScrapeOptions
+ from firecrawl.v2.methods.aio.crawl import _prepare_crawl_request
+ import pytest
+
+
+ class TestAsyncCrawlValidation:
+     def test_invalid_url(self):
+         with pytest.raises(ValueError):
+             _prepare_crawl_request(CrawlRequest(url=""))
+         with pytest.raises(ValueError):
+             _prepare_crawl_request(CrawlRequest(url=" "))
+
firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py
@@ -0,0 +1,19 @@
+ import pytest
+ from firecrawl.v2.types import MapOptions
+ from firecrawl.v2.methods.aio.map import _prepare_map_request
+
+
+ class TestAsyncMapRequestPreparation:
+     def test_basic(self):
+         payload = _prepare_map_request("https://example.com")
+         assert payload["url"] == "https://example.com"
+
+     def test_fields(self):
+         opts = MapOptions(search="docs", include_subdomains=True, limit=10, sitemap="only", timeout=15000)
+         payload = _prepare_map_request("https://example.com", opts)
+         assert payload["search"] == "docs"
+         assert payload["includeSubdomains"] is True
+         assert payload["limit"] == 10
+         assert payload["sitemap"] == "only"
+         assert payload["timeout"] == 15000
+
firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py
@@ -0,0 +1,50 @@
+ import pytest
+ from firecrawl.v2.types import ScrapeOptions, Location
+ from firecrawl.v2.methods.aio.scrape import _prepare_scrape_request
+
+
+ class TestAsyncScrapeRequestPreparation:
+     @pytest.mark.asyncio
+     async def test_basic_request_preparation(self):
+         payload = await _prepare_scrape_request("https://example.com", None)
+         assert payload["url"] == "https://example.com"
+
+     @pytest.mark.asyncio
+     async def test_options_conversion(self):
+         opts = ScrapeOptions(
+             formats=["markdown", {"type": "screenshot", "full_page": True, "quality": 80}],
+             include_tags=["main"],
+             exclude_tags=["nav"],
+             only_main_content=True,
+             wait_for=500,
+             timeout=30000,
+             mobile=True,
+             parsers=["pdf"],
+             location=Location(country="us", languages=["en"]),
+             skip_tls_verification=False,
+             remove_base64_images=False,
+             fast_mode=True,
+             use_mock="test",
+             block_ads=False,
+             proxy="basic",
+             max_age=1000,
+             store_in_cache=False,
+         )
+         payload = await _prepare_scrape_request("https://example.com", opts)
+         assert payload["url"] == "https://example.com"
+         assert isinstance(payload.get("formats"), list) and "markdown" in payload["formats"]
+         assert payload["includeTags"] == ["main"]
+         assert payload["excludeTags"] == ["nav"]
+         assert payload["onlyMainContent"] is True
+         assert payload["waitFor"] == 500
+         assert payload["timeout"] == 30000
+         assert payload["mobile"] is True
+         assert payload["skipTlsVerification"] is False
+         assert payload["removeBase64Images"] is False
+         assert payload["fastMode"] is True
+         assert payload["useMock"] == "test"
+         assert payload["blockAds"] is False
+         assert payload["proxy"] == "basic"
+         assert payload["maxAge"] == 1000
+         assert payload["storeInCache"] is False
+
firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py
@@ -0,0 +1,63 @@
+ import pytest
+ from firecrawl.v2.types import SearchRequest, ScrapeOptions
+ from firecrawl.v2.methods.aio.search import _prepare_search_request
+
+
+ class TestAsyncSearchRequestPreparation:
+     def test_basic_request_preparation(self):
+         request = SearchRequest(query="test query")
+         data = _prepare_search_request(request)
+         assert data["query"] == "test query"
+         assert "ignore_invalid_urls" not in data
+         assert "scrape_options" not in data
+
+     def test_all_fields_conversion(self):
+         scrape_opts = ScrapeOptions(
+             formats=["markdown"],
+             headers={"User-Agent": "Test"},
+             include_tags=["h1", "h2"],
+             exclude_tags=["nav"],
+             only_main_content=False,
+             timeout=15000,
+             wait_for=2000,
+             mobile=True,
+             skip_tls_verification=True,
+             remove_base64_images=False,
+         )
+         request = SearchRequest(
+             query="test query",
+             sources=["web", "news"],
+             limit=10,
+             tbs="qdr:w",
+             location="US",
+             ignore_invalid_urls=False,
+             timeout=30000,
+             scrape_options=scrape_opts,
+         )
+         data = _prepare_search_request(request)
+         assert data["ignoreInvalidURLs"] is False
+         assert "scrapeOptions" in data
+
+     def test_exclude_none_behavior(self):
+         request = SearchRequest(
+             query="test",
+             sources=None,
+             limit=None,
+             tbs=None,
+             location=None,
+             ignore_invalid_urls=None,
+             timeout=None,
+             scrape_options=None,
+         )
+         data = _prepare_search_request(request)
+         assert "query" in data
+         assert len(data) == 1
+
+     def test_empty_scrape_options(self):
+         request = SearchRequest(query="test", scrape_options=ScrapeOptions())
+         data = _prepare_search_request(request)
+         assert "scrapeOptions" in data
+         scrape_data = data["scrapeOptions"]
+         assert "onlyMainContent" in scrape_data
+         assert "mobile" in scrape_data
+
firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py
@@ -0,0 +1,28 @@
+ from firecrawl.v2.types import ScrapeOptions, Location
+ from firecrawl.v2.methods.aio.batch import _prepare as _prepare_batch
+
+
+ class TestAsyncBatchRequestPreparation:
+     def test_urls_validation_and_conversion(self):
+         payload = _prepare_batch(["https://example.com", "http://foo.bar"], options=None)
+         assert payload["urls"] == ["https://example.com", "http://foo.bar"]
+
+     def test_options_and_batch_fields(self):
+         opts = ScrapeOptions(formats=["markdown"], only_main_content=True)
+         payload = _prepare_batch(
+             ["https://example.com"],
+             options=opts,
+             webhook="https://hook.example",
+             append_to_id="00000000-0000-0000-0000-000000000000",
+             ignore_invalid_urls=True,
+             max_concurrency=3,
+             zero_data_retention=True,
+             integration="zapier",
+         )
+         assert payload["webhook"] == "https://hook.example"
+         assert payload["appendToId"] == "00000000-0000-0000-0000-000000000000"
+         assert payload["ignoreInvalidURLs"] is True
+         assert payload["maxConcurrency"] == 3
+         assert payload["zeroDataRetention"] is True
+         assert payload["integration"] == "zapier"
+