firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic.
- firecrawl/__init__.py +27 -19
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- firecrawl/__tests__/e2e/v2/test_search.py +265 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +241 -0
- firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
- firecrawl/types.py +157 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +4653 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +802 -0
- firecrawl/v2/client_async.py +250 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/batch.py +85 -0
- firecrawl/v2/methods/aio/crawl.py +174 -0
- firecrawl/v2/methods/aio/extract.py +126 -0
- firecrawl/v2/methods/aio/map.py +59 -0
- firecrawl/v2/methods/aio/scrape.py +36 -0
- firecrawl/v2/methods/aio/search.py +58 -0
- firecrawl/v2/methods/aio/usage.py +42 -0
- firecrawl/v2/methods/batch.py +420 -0
- firecrawl/v2/methods/crawl.py +468 -0
- firecrawl/v2/methods/extract.py +131 -0
- firecrawl/v2/methods/map.py +77 -0
- firecrawl/v2/methods/scrape.py +68 -0
- firecrawl/v2/methods/search.py +173 -0
- firecrawl/v2/methods/usage.py +41 -0
- firecrawl/v2/types.py +546 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +153 -0
- firecrawl/v2/utils/http_client_async.py +64 -0
- firecrawl/v2/utils/validation.py +324 -0
- firecrawl/v2/watcher.py +312 -0
- firecrawl/v2/watcher_async.py +245 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
- firecrawl-3.0.3.dist-info/RECORD +78 -0
- tests/test_timeout_conversion.py +117 -0
- firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- firecrawl-2.16.5.dist-info/RECORD +0 -12
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
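For orientation, the new layout splits the SDK into a v1 client (firecrawl/v1/client.py), a v2 sync client (firecrawl/v2/client.py), a v2 async client (firecrawl/v2/client_async.py), per-endpoint modules under firecrawl/v2/methods/, and shared models in firecrawl/v2/types.py. The sketch below shows how the async client appears to be used, inferred only from the call patterns in the new tests further down in this diff; the API key, API URL, and result handling are illustrative placeholders, not part of the package contents.

import asyncio

from firecrawl.v2.client_async import AsyncFirecrawlClient


async def main() -> None:
    # Placeholder credentials/endpoint; the tests below construct the client the same way.
    client = AsyncFirecrawlClient(api_key="fc-...", api_url="https://api.firecrawl.dev")

    # Issue a scrape and a search concurrently, mirroring the concurrency tests in this diff.
    scrape_result, search_result = await asyncio.gather(
        client.scrape("https://firecrawl.dev"),
        client.search("firecrawl"),
    )
    print(scrape_result)
    print(search_result)


if __name__ == "__main__":
    asyncio.run(main())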
@@ -0,0 +1,117 @@
+import asyncio
+import time
+import httpx
+import pytest
+
+from firecrawl.v2.client_async import AsyncFirecrawlClient
+from firecrawl.v2.utils.http_client_async import AsyncHttpClient
+from firecrawl.v2.utils.http_client import HttpClient
+from firecrawl.v2.methods.aio import batch as aio_batch
+
+
+@pytest.mark.asyncio
+async def test_scrape_concurrency(monkeypatch):
+    async def fake_post(self, endpoint, data, headers=None, timeout=None):
+        await asyncio.sleep(0.1)
+        return httpx.Response(200, json={"success": True, "data": {}})
+
+    monkeypatch.setattr(AsyncHttpClient, "post", fake_post)
+
+    client = AsyncFirecrawlClient(api_key="test", api_url="http://localhost")
+
+    start = time.perf_counter()
+    await asyncio.gather(
+        client.scrape("https://firecrawl.dev"),
+        client.scrape("https://firecrawl.dev"),
+        client.scrape("https://firecrawl.dev")
+    )
+    elapsed = time.perf_counter() - start
+
+    # If calls run concurrently, total should be close to single 0.1s delay, not 0.3s
+    assert elapsed < 0.25
+
+
+@pytest.mark.asyncio
+async def test_event_loop_not_blocked(monkeypatch):
+    ticks = 0
+
+    async def ticker():
+        nonlocal ticks
+        for _ in range(5):
+            await asyncio.sleep(0.05)
+            ticks += 1
+
+    async def fake_post(self, endpoint, data, headers=None, timeout=None):
+        await asyncio.sleep(0.2)
+        return httpx.Response(200, json={"success": True, "data": {}})
+
+    monkeypatch.setattr(AsyncHttpClient, "post", fake_post)
+
+    client = AsyncFirecrawlClient(api_key="test", api_url="http://localhost")
+
+    await asyncio.gather(ticker(), client.scrape("https://a"))
+    # If scrape awaited properly, ticker should have progressed several steps
+    assert ticks >= 3
+
+
+@pytest.mark.asyncio
+async def test_wait_batch_scrape_polling_interval(monkeypatch):
+    # Simulate one scraping status then completed
+    class S: # simple status holder
+        def __init__(self, status):
+            self.status = status
+
+    states = ["scraping", "completed"]
+
+    async def fake_status(client, job_id):
+        state = states.pop(0)
+        return S(state)
+
+    monkeypatch.setattr(aio_batch, "get_batch_scrape_status", fake_status)
+
+    client = AsyncFirecrawlClient(api_key="test", api_url="http://localhost")
+
+    start = time.perf_counter()
+    await client.wait_batch_scrape("job-1", poll_interval=0.1, timeout=2)
+    elapsed = time.perf_counter() - start
+
+    # Should take roughly one poll interval to reach completed
+    assert 0.09 <= elapsed <= 0.5
+
+
+@pytest.mark.asyncio
+async def test_async_transport_used_no_threads(monkeypatch):
+    # Make any to_thread usage blow up
+    monkeypatch.setattr(asyncio, "to_thread", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("to_thread not allowed")))
+    # Make sync HttpClient unusable
+    def _boom(*a, **k):
+        raise RuntimeError("sync client should not be used")
+    monkeypatch.setattr(HttpClient, "post", _boom)
+    monkeypatch.setattr(HttpClient, "get", _boom)
+    monkeypatch.setattr(HttpClient, "delete", _boom)
+
+    # Track true async concurrency
+    active = 0
+    max_active = 0
+    async def fake_post(self, endpoint, data, headers=None, timeout=None):
+        nonlocal active, max_active
+        active += 1
+        max_active = max(max_active, active)
+        try:
+            await asyncio.sleep(0.1)
+            return httpx.Response(200, json={"success": True, "data": {}})
+        finally:
+            active -= 1
+
+    monkeypatch.setattr(AsyncHttpClient, "post", fake_post)
+
+    client = AsyncFirecrawlClient(api_key="test", api_url="http://localhost")
+
+    await asyncio.gather(
+        client.scrape("https://firecrawl.dev"),
+        client.scrape("https://firecrawl.dev"),
+        client.search("q"), # uses async search
+    )
+
+    assert max_active >= 2
+
@@ -0,0 +1,90 @@
+import pytest
+from firecrawl.v2.types import ScrapeOptions, Location, WebhookConfig
+from firecrawl.v2.methods.batch import prepare_batch_scrape_request
+
+
+class TestBatchScrapeRequestPreparation:
+    """Unit tests for batch scrape request preparation."""
+
+    def test_urls_validation(self):
+        # empty list
+        with pytest.raises(ValueError):
+            prepare_batch_scrape_request([])
+        # invalid protocol
+        with pytest.raises(ValueError):
+            prepare_batch_scrape_request(["example.com"]) # missing http(s)
+        # valid
+        data = prepare_batch_scrape_request(["https://example.com", "http://foo.bar"])
+        assert data["urls"] == ["https://example.com", "http://foo.bar"]
+
+    def test_flatten_scrape_options(self):
+        opts = ScrapeOptions(
+            formats=["markdown", "change_tracking", {"type": "screenshot", "full_page": True, "quality": 80}],
+            include_tags=["main"],
+            exclude_tags=["nav"],
+            only_main_content=True,
+            wait_for=500,
+            timeout=30000,
+            mobile=True,
+            parsers=["pdf"],
+            actions=[{"type": "screenshot", "full_page": True}],
+            location=Location(country="us", languages=["en"]),
+            skip_tls_verification=False,
+            remove_base64_images=False,
+            fast_mode=True,
+            use_mock="test",
+            block_ads=False,
+            proxy="basic",
+            max_age=1000,
+            store_in_cache=False,
+        )
+        data = prepare_batch_scrape_request(["https://example.com"], options=opts)
+
+        # Formats should be at top-level as list, with screenshot normalized to object w/ fullPage
+        assert isinstance(data.get("formats"), list)
+        assert "markdown" in data["formats"]
+        # snake_case format should be converted to camelCase
+        assert "changeTracking" in data["formats"]
+        found_obj = next((f for f in data["formats"] if isinstance(f, dict) and f.get("type") == "screenshot"), None)
+        assert found_obj is not None and found_obj.get("fullPage") is True and found_obj.get("quality") == 80
+
+        # Field conversions to camelCase
+        assert data["includeTags"] == ["main"]
+        assert data["excludeTags"] == ["nav"]
+        assert data["onlyMainContent"] is True
+        assert data["waitFor"] == 500
+        assert data["timeout"] == 30000
+        assert data["mobile"] is True
+        assert data["parsers"] == ["pdf"]
+        assert isinstance(data["actions"], list) and data["actions"][0]["type"] == "screenshot"
+        assert isinstance(data["location"], dict) and data["location"]["country"] == "us"
+        assert data["skipTlsVerification"] is False
+        assert data["removeBase64Images"] is False
+        assert data["fastMode"] is True
+        assert data["useMock"] == "test"
+        assert data["blockAds"] is False
+        assert data["proxy"] == "basic"
+        assert data["maxAge"] == 1000
+        assert data["storeInCache"] is False
+
+    def test_batch_specific_fields(self):
+        webhook = WebhookConfig(url="https://hook.test", headers={"X": "Y"}, events=["completed"])
+        data = prepare_batch_scrape_request(
+            ["https://example.com"],
+            webhook=webhook,
+            append_to_id="00000000-0000-0000-0000-000000000000",
+            ignore_invalid_urls=True,
+            max_concurrency=5,
+            zero_data_retention=True,
+            integration="test",
+        )
+        assert isinstance(data["webhook"], dict) and data["webhook"]["url"] == "https://hook.test"
+        assert data["appendToId"] == "00000000-0000-0000-0000-000000000000"
+        assert data["ignoreInvalidURLs"] is True
+        assert data["maxConcurrency"] == 5
+        assert data["zeroDataRetention"] is True
+        assert data["integration"] == "test"
+
+    def test_string_webhook_is_passed_verbatim(self):
+        data = prepare_batch_scrape_request(["https://example.com"], webhook="https://hook.simple")
+        assert data["webhook"] == "https://hook.simple"
@@ -0,0 +1,70 @@
+"""
+Unit tests for crawl params functionality in Firecrawl v2 SDK.
+"""
+
+import pytest
+from firecrawl.v2.types import CrawlParamsRequest, CrawlParamsData
+
+
+class TestCrawlParamsRequest:
+    """Unit tests for CrawlParamsRequest."""
+
+    def test_crawl_params_request_creation(self):
+        """Test creating CrawlParamsRequest with valid data."""
+        request = CrawlParamsRequest(
+            url="https://example.com",
+            prompt="Extract all blog posts"
+        )
+
+        assert request.url == "https://example.com"
+        assert request.prompt == "Extract all blog posts"
+
+    def test_crawl_params_request_serialization(self):
+        """Test that CrawlParamsRequest serializes correctly."""
+        request = CrawlParamsRequest(
+            url="https://example.com",
+            prompt="Extract all blog posts and documentation"
+        )
+
+        data = request.model_dump()
+
+        assert data["url"] == "https://example.com"
+        assert data["prompt"] == "Extract all blog posts and documentation"
+
+
+class TestCrawlParamsData:
+    """Unit tests for CrawlParamsData."""
+
+    def test_crawl_params_data_creation(self):
+        """Test creating CrawlParamsData with minimal data."""
+        data = CrawlParamsData()
+
+        assert data.include_paths is None
+        assert data.exclude_paths is None
+        assert data.max_discovery_depth is None
+        assert data.ignore_sitemap is False
+        assert data.limit is None
+        assert data.crawl_entire_domain is False
+        assert data.allow_external_links is False
+        assert data.scrape_options is None
+        assert data.warning is None
+
+    def test_crawl_params_data_with_values(self):
+        """Test creating CrawlParamsData with values."""
+        data = CrawlParamsData(
+            include_paths=["/blog/*"],
+            exclude_paths=["/admin/*"],
+            max_discovery_depth=3,
+            limit=50,
+            crawl_entire_domain=True,
+            allow_external_links=False,
+            warning="Test warning"
+        )
+
+        assert data.include_paths == ["/blog/*"]
+        assert data.exclude_paths == ["/admin/*"]
+        assert data.max_discovery_depth == 3
+        assert data.limit == 50
+        assert data.crawl_entire_domain is True
+        assert data.allow_external_links is False
+        assert data.warning == "Test warning"
@@ -0,0 +1,240 @@
+import pytest
+from firecrawl.v2.types import CrawlRequest, ScrapeOptions
+from firecrawl.v2.methods.crawl import _prepare_crawl_request
+
+
+class TestCrawlRequestPreparation:
+    """Unit tests for crawl request preparation."""
+
+    def test_basic_request_preparation(self):
+        """Test basic request preparation with minimal fields."""
+        request = CrawlRequest(url="https://example.com")
+        data = _prepare_crawl_request(request)
+
+        # Check basic fields
+        assert data["url"] == "https://example.com"
+
+        # Check that no options are present
+        assert "limit" not in data
+        assert "prompt" not in data
+
+    def test_crawl_options_conversion(self):
+        """Test that CrawlOptions fields are converted to camelCase."""
+        request = CrawlRequest(
+            url="https://example.com",
+            limit=10,
+            max_discovery_depth=3,
+            ignore_sitemap=True,
+            crawl_entire_domain=False,
+            allow_external_links=True
+        )
+
+        data = _prepare_crawl_request(request)
+
+        # Check basic field
+        assert data["url"] == "https://example.com"
+
+        # Check snake_case to camelCase conversions
+        assert "limit" in data
+        assert data["limit"] == 10
+        assert "maxDiscoveryDepth" in data
+        assert data["maxDiscoveryDepth"] == 3
+        assert "ignoreSitemap" in data
+        assert data["ignoreSitemap"] is True
+        assert "crawlEntireDomain" in data
+        assert data["crawlEntireDomain"] is False
+        assert "allowExternalLinks" in data
+        assert data["allowExternalLinks"] is True
+
+        # Check that snake_case fields are not present
+        assert "ignore_sitemap" not in data
+        assert "crawl_entire_domain" not in data
+        assert "allow_external_links" not in data
+
+    def test_scrape_options_conversion(self):
+        """Test that nested ScrapeOptions are converted to camelCase."""
+        scrape_opts = ScrapeOptions(
+            formats=["markdown", "html"],
+            headers={"User-Agent": "Test"},
+            include_tags=["h1", "h2"],
+            exclude_tags=["nav"],
+            only_main_content=False,
+            timeout=15000,
+            wait_for=2000,
+            mobile=True,
+            skip_tls_verification=True,
+            remove_base64_images=False
+        )
+
+        request = CrawlRequest(
+            url="https://example.com",
+            scrape_options=scrape_opts
+        )
+
+        data = _prepare_crawl_request(request)
+
+        assert "scrapeOptions" in data
+        assert "scrape_options" not in data
+
+        # Check nested conversions
+        scrape_data = data["scrapeOptions"]
+        assert "includeTags" in scrape_data
+        assert scrape_data["includeTags"] == ["h1", "h2"]
+        assert "excludeTags" in scrape_data
+        assert scrape_data["excludeTags"] == ["nav"]
+        assert "onlyMainContent" in scrape_data
+        assert scrape_data["onlyMainContent"] is False
+        assert "waitFor" in scrape_data
+        assert scrape_data["waitFor"] == 2000
+        assert "skipTlsVerification" in scrape_data
+        assert scrape_data["skipTlsVerification"] is True
+        assert "removeBase64Images" in scrape_data
+        assert scrape_data["removeBase64Images"] is False
+
+    def test_all_fields_conversion(self):
+        """Test request preparation with all possible fields."""
+        scrape_opts = ScrapeOptions(
+            formats=["markdown"],
+            headers={"User-Agent": "Test"},
+            only_main_content=False,
+            mobile=True
+        )
+
+        request = CrawlRequest(
+            url="https://example.com",
+            prompt="Extract all blog posts and documentation",
+            include_paths=["/blog/*", "/docs/*"],
+            exclude_paths=["/admin/*"],
+            max_discovery_depth=3,
+            ignore_sitemap=False,
+            limit=100,
+            crawl_entire_domain=True,
+            allow_external_links=False,
+            scrape_options=scrape_opts
+        )
+
+        data = _prepare_crawl_request(request)
+
+        # Check basic fields
+        assert data["url"] == "https://example.com"
+        assert data["prompt"] == "Extract all blog posts and documentation"
+
+        # Check all CrawlOptions fields
+        assert "includePaths" in data
+        assert data["includePaths"] == ["/blog/*", "/docs/*"]
+        assert "excludePaths" in data
+        assert data["excludePaths"] == ["/admin/*"]
+        assert "maxDiscoveryDepth" in data
+        assert data["maxDiscoveryDepth"] == 3
+        assert "ignoreSitemap" in data
+        assert data["ignoreSitemap"] is False
+        assert "limit" in data
+        assert data["limit"] == 100
+        assert "crawlEntireDomain" in data
+        assert data["crawlEntireDomain"] is True
+        assert "allowExternalLinks" in data
+        assert data["allowExternalLinks"] is False
+
+        # Check nested scrape options
+        assert "scrapeOptions" in data
+        scrape_data = data["scrapeOptions"]
+        assert "onlyMainContent" in scrape_data
+        assert scrape_data["onlyMainContent"] is False
+        assert "mobile" in scrape_data
+        assert scrape_data["mobile"] is True
+
+    def test_none_values_handling(self):
+        """Test that None values are handled correctly."""
+        request = CrawlRequest(
+            url="https://example.com",
+            prompt=None,
+            limit=None,
+            scrape_options=None
+        )
+
+        data = _prepare_crawl_request(request)
+
+        # Only the required field should be present
+        assert "url" in data
+        assert len(data) == 1 # Only url should be present
+
+    def test_prompt_parameter(self):
+        """Test that prompt parameter is included when provided."""
+        request = CrawlRequest(
+            url="https://example.com",
+            prompt="Extract all blog posts"
+        )
+
+        data = _prepare_crawl_request(request)
+
+        assert "url" in data
+        assert "prompt" in data
+        assert data["prompt"] == "Extract all blog posts"
+
+    def test_empty_options(self):
+        """Test that empty options are handled correctly."""
+        request = CrawlRequest(
+            url="https://example.com"
+        )
+
+        data = _prepare_crawl_request(request)
+
+        # Should only have the required url field
+        assert "url" in data
+        assert len(data) == 1 # Only url should be present
+
+    def test_validation_integration(self):
+        """Test that validation is called during preparation."""
+        # This should raise an error due to validation
+        with pytest.raises(ValueError, match="URL cannot be empty"):
+            request = CrawlRequest(url="")
+            _prepare_crawl_request(request)
+
+        # This should raise an error due to validation
+        with pytest.raises(ValueError, match="Limit must be positive"):
+            request = CrawlRequest(
+                url="https://example.com",
+                limit=0
+            )
+            _prepare_crawl_request(request)
+
+    def test_scrape_options_shared_function_integration(self):
+        """Test that the shared prepare_scrape_options function is being used."""
+        # Test with all snake_case fields to ensure conversion
+        scrape_opts = ScrapeOptions(
+            include_tags=["h1", "h2"],
+            exclude_tags=["nav"],
+            only_main_content=False,
+            wait_for=2000,
+            skip_tls_verification=True,
+            remove_base64_images=False
+        )
+
+        request = CrawlRequest(
+            url="https://example.com",
+            scrape_options=scrape_opts
+        )
+
+        data = _prepare_crawl_request(request)
+
+        # Check that scrapeOptions is present and converted
+        assert "scrapeOptions" in data
+        scrape_data = data["scrapeOptions"]
+
+        # Check all conversions are working
+        assert "includeTags" in scrape_data
+        assert "excludeTags" in scrape_data
+        assert "onlyMainContent" in scrape_data
+        assert "waitFor" in scrape_data
+        assert "skipTlsVerification" in scrape_data
+        assert "removeBase64Images" in scrape_data
+
+        # Check that snake_case fields are not present
+        assert "include_tags" not in scrape_data
+        assert "exclude_tags" not in scrape_data
+        assert "only_main_content" not in scrape_data
+        assert "wait_for" not in scrape_data
+        assert "skip_tls_verification" not in scrape_data
+        assert "remove_base64_images" not in scrape_data
+        assert "raw_html" not in scrape_data
+        assert "screenshot_full_page" not in scrape_data
@@ -0,0 +1,107 @@
+import pytest
+from firecrawl.v2.types import CrawlRequest, ScrapeOptions
+from firecrawl.v2.methods.crawl import _validate_crawl_request
+
+
+class TestCrawlRequestValidation:
+    """Unit tests for crawl request validation."""
+
+    def test_validate_empty_url(self):
+        """Test validation with empty URL."""
+        with pytest.raises(ValueError, match="URL cannot be empty"):
+            request = CrawlRequest(url="")
+            _validate_crawl_request(request)
+
+    def test_validate_whitespace_url(self):
+        """Test validation with whitespace-only URL."""
+        with pytest.raises(ValueError, match="URL cannot be empty"):
+            request = CrawlRequest(url=" ")
+            _validate_crawl_request(request)
+
+    def test_validate_valid_url(self):
+        """Test validation with valid URL."""
+        request = CrawlRequest(url="https://example.com")
+        _validate_crawl_request(request) # Should not raise
+
+    def test_validate_invalid_limit(self):
+        """Test validation with invalid limit."""
+        with pytest.raises(ValueError, match="Limit must be positive"):
+            request = CrawlRequest(
+                url="https://example.com",
+                limit=0
+            )
+            _validate_crawl_request(request)
+
+    def test_validate_negative_limit(self):
+        """Test validation with negative limit."""
+        with pytest.raises(ValueError, match="Limit must be positive"):
+            request = CrawlRequest(
+                url="https://example.com",
+                limit=-5
+            )
+            _validate_crawl_request(request)
+
+    def test_validate_valid_limit(self):
+        """Test validation with valid limit."""
+        request = CrawlRequest(
+            url="https://example.com",
+            limit=10
+        )
+        _validate_crawl_request(request) # Should not raise
+
+    def test_validate_with_prompt(self):
+        """Test validation with prompt."""
+        request = CrawlRequest(
+            url="https://example.com",
+            prompt="Extract all blog posts"
+        )
+        _validate_crawl_request(request) # Should not raise
+
+    def test_validate_with_prompt_and_options(self):
+        """Test validation with prompt and options."""
+        request = CrawlRequest(
+            url="https://example.com",
+            prompt="Extract all blog posts",
+            limit=10
+        )
+        _validate_crawl_request(request) # Should not raise
+
+    def test_validate_none_options(self):
+        """Test validation with None options."""
+        request = CrawlRequest(url="https://example.com")
+        _validate_crawl_request(request) # Should not raise
+
+    def test_validate_complex_options(self):
+        """Test validation with complex options."""
+        scrape_opts = ScrapeOptions(
+            formats=["markdown"],
+            only_main_content=False,
+            mobile=True
+        )
+
+        request = CrawlRequest(
+            url="https://example.com",
+            limit=50,
+            max_discovery_depth=3,
+            scrape_options=scrape_opts
+        )
+        _validate_crawl_request(request) # Should not raise
+
+    def test_validate_scrape_options_integration(self):
+        """Test that scrape_options validation is integrated."""
+        # Test with valid scrape options
+        scrape_opts = ScrapeOptions(formats=["markdown"], timeout=30000)
+        request = CrawlRequest(
+            url="https://example.com",
+            scrape_options=scrape_opts
+        )
+        _validate_crawl_request(request) # Should not raise
+
+        # Test with invalid scrape options (should raise error)
+        invalid_scrape_opts = ScrapeOptions(timeout=-1000)
+        request = CrawlRequest(
+            url="https://example.com",
+            scrape_options=invalid_scrape_opts
+        )
+        with pytest.raises(ValueError, match="Timeout must be positive"):
+            _validate_crawl_request(request)
@@ -0,0 +1,53 @@
+import pytest
+from firecrawl.v2.types import MapOptions
+from firecrawl.v2.methods.map import _prepare_map_request
+
+
+class TestMapRequestPreparation:
+    """Unit tests for map request preparation."""
+
+    def test_basic_request_preparation(self):
+        data = _prepare_map_request("https://example.com")
+        assert data["url"] == "https://example.com"
+        # Default sitemap handling should be "include" when no flags provided
+        assert "sitemap" not in data # we only send when options provided
+
+    def test_sitemap_transformations(self):
+        # sitemap -> "only"
+        opts = MapOptions(sitemap="only")
+        data = _prepare_map_request("https://example.com", opts)
+        assert data["sitemap"] == "only"
+
+        # sitemap -> "skip"
+        opts = MapOptions(sitemap="skip")
+        data = _prepare_map_request("https://example.com", opts)
+        assert data["sitemap"] == "skip"
+
+        # default when options present but sitemap left as default -> include
+        opts = MapOptions(search="docs")
+        data = _prepare_map_request("https://example.com", opts)
+        assert data["sitemap"] == "include"
+
+    def test_field_conversions(self):
+        opts = MapOptions(
+            search="docs",
+            include_subdomains=True,
+            limit=25,
+            sitemap="only",
+            timeout=15000,
+        )
+        data = _prepare_map_request("https://example.com", opts)
+
+        assert data["url"] == "https://example.com"
+        assert data["search"] == "docs"
+        assert data["includeSubdomains"] is True
+        assert data["limit"] == 25
+        assert data["sitemap"] == "only"
+        assert data["timeout"] == 15000
+
+    def test_invalid_url(self):
+        with pytest.raises(ValueError):
+            _prepare_map_request("")
+        with pytest.raises(ValueError):
+            _prepare_map_request(" ")
+