firecrawl-4.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
@@ -0,0 +1,12 @@
+import pytest
+from firecrawl.v2.types import CrawlParamsRequest
+from firecrawl.v2.methods.aio import crawl as aio_crawl
+
+
+@pytest.mark.asyncio
+async def test_crawl_params_request_validation():
+    with pytest.raises(ValueError):
+        await aio_crawl.crawl_params_preview(None, CrawlParamsRequest(url="", prompt="x"))
+    with pytest.raises(ValueError):
+        await aio_crawl.crawl_params_preview(None, CrawlParamsRequest(url="https://x", prompt=""))
+
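These two assertions pin down that validation happens before any network I/O: `None` is passed where the client would go, so an empty URL or an empty prompt must raise `ValueError` without the client ever being touched. A minimal sketch of the guard this implies (hypothetical; the shipped logic lives in firecrawl/v2/methods/aio/crawl.py):

    # Hypothetical guard implied by the test above; not the package's actual code.
    async def crawl_params_preview(client, request):
        if not request.url or not request.url.strip():
            raise ValueError("URL cannot be empty")
        if not request.prompt or not request.prompt.strip():
            raise ValueError("Prompt cannot be empty")
        # ...only after these checks would `client` be used for the HTTP call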
@@ -0,0 +1,79 @@
+from firecrawl.v2.types import CrawlRequest, ScrapeOptions, WebhookConfig
+from firecrawl.v2.methods.aio.crawl import _prepare_crawl_request
+
+
+class TestAsyncCrawlRequestPreparation:
+    def test_basic_request(self):
+        req = CrawlRequest(url="https://example.com")
+        payload = _prepare_crawl_request(req)
+        assert payload["url"] == "https://example.com"
+
+    def test_field_mappings(self):
+        req = CrawlRequest(
+            url="https://example.com",
+            include_paths=["/docs/*"],
+            exclude_paths=["/admin/*"],
+            max_discovery_depth=2,
+            sitemap="skip",
+            ignore_query_parameters=True,
+            crawl_entire_domain=True,
+            allow_external_links=False,
+            allow_subdomains=True,
+            max_concurrency=5,
+            zero_data_retention=True,
+        )
+        payload = _prepare_crawl_request(req)
+        assert payload["includePaths"] == ["/docs/*"]
+        assert payload["excludePaths"] == ["/admin/*"]
+        assert payload["maxDiscoveryDepth"] == 2
+        assert payload["sitemap"] == "skip"
+        assert payload["ignoreQueryParameters"] is True
+        assert payload["crawlEntireDomain"] is True
+        assert payload["allowExternalLinks"] is False
+        assert payload["allowSubdomains"] is True
+        assert payload["maxConcurrency"] == 5
+        assert payload["zeroDataRetention"] is True
+
+    def test_webhook_preparation(self):
+        # string webhook
+        req = CrawlRequest(url="https://example.com", webhook="https://example.com/hook")
+        payload = _prepare_crawl_request(req)
+        assert payload["webhook"] == "https://example.com/hook"
+
+        # object webhook
+        req2 = CrawlRequest(url="https://example.com", webhook=WebhookConfig(url="https://x/h", headers={"X": "1"}, events=["completed"]))
+        payload2 = _prepare_crawl_request(req2)
+        assert isinstance(payload2["webhook"], dict)
+        assert payload2["webhook"]["url"] == "https://x/h"
+        assert payload2["webhook"]["headers"] == {"X": "1"}
+
+    def test_webhook_none_values_excluded(self):
+        req = CrawlRequest(
+            url="https://example.com",
+            webhook=WebhookConfig(url="https://example.com/webhook", headers=None, metadata=None, events=None),
+        )
+        payload = _prepare_crawl_request(req)
+        webhook = payload["webhook"]
+        assert webhook["url"] == "https://example.com/webhook"
+        assert "headers" not in webhook
+        assert "metadata" not in webhook
+        assert "events" not in webhook
+
+    def test_all_fields_including_integration(self):
+        req = CrawlRequest(
+            url="https://example.com",
+            include_paths=["/docs/*"],
+            exclude_paths=["/admin/*"],
+            max_discovery_depth=2,
+            sitemap="include",
+            ignore_query_parameters=True,
+            crawl_entire_domain=False,
+            allow_external_links=True,
+            allow_subdomains=True,
+            max_concurrency=3,
+            zero_data_retention=False,
+            integration=" _unit-test ",
+        )
+        payload = _prepare_crawl_request(req)
+        assert payload["integration"] == "_unit-test"
+
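Taken together, these tests pin down three behaviors of `_prepare_crawl_request`: snake_case fields are renamed to camelCase, `None`-valued webhook fields are dropped, and the `integration` string is whitespace-trimmed. A self-contained sketch of that shape of conversion (an illustration of what the assertions require, not the shipped implementation):

    # Illustrative only; the real conversion is in firecrawl/v2/methods/aio/crawl.py.
    def to_camel(name: str) -> str:
        head, *rest = name.split("_")
        return head + "".join(part.title() for part in rest)

    def prepare(fields: dict) -> dict:
        # Drop None values (test_webhook_none_values_excluded) and rename
        # keys (test_field_mappings).
        payload = {to_camel(k): v for k, v in fields.items() if v is not None}
        if isinstance(payload.get("integration"), str):
            payload["integration"] = payload["integration"].strip()
        return payload

    assert prepare({"include_paths": ["/docs/*"], "max_discovery_depth": 2,
                    "integration": " _unit-test ", "headers": None}) == {
        "includePaths": ["/docs/*"], "maxDiscoveryDepth": 2, "integration": "_unit-test"}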
@@ -0,0 +1,12 @@
+from firecrawl.v2.types import CrawlRequest, ScrapeOptions
+from firecrawl.v2.methods.aio.crawl import _prepare_crawl_request
+import pytest
+
+
+class TestAsyncCrawlValidation:
+    def test_invalid_url(self):
+        with pytest.raises(ValueError):
+            _prepare_crawl_request(CrawlRequest(url=""))
+        with pytest.raises(ValueError):
+            _prepare_crawl_request(CrawlRequest(url=" "))
+
@@ -0,0 +1,20 @@
+import pytest
+from firecrawl.v2.types import MapOptions
+from firecrawl.v2.methods.aio.map import _prepare_map_request
+
+
+class TestAsyncMapRequestPreparation:
+    def test_basic(self):
+        payload = _prepare_map_request("https://example.com")
+        assert payload["url"] == "https://example.com"
+
+    def test_fields(self):
+        opts = MapOptions(search="docs", include_subdomains=True, limit=10, sitemap="only", timeout=15000, integration=" _unit-test ")
+        payload = _prepare_map_request("https://example.com", opts)
+        assert payload["search"] == "docs"
+        assert payload["includeSubdomains"] is True
+        assert payload["limit"] == 10
+        assert payload["sitemap"] == "only"
+        assert payload["timeout"] == 15000
+        assert payload["integration"] == "_unit-test"
+
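For reference, the map payload these assertions describe, written out in full (every value is taken directly from the asserts above):

    {
        "url": "https://example.com",
        "search": "docs",
        "includeSubdomains": True,
        "limit": 10,
        "sitemap": "only",
        "timeout": 15000,
        "integration": "_unit-test",  # " _unit-test " trimmed on the way in
    }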
@@ -0,0 +1,50 @@
+import pytest
+from firecrawl.v2.types import ScrapeOptions, Location
+from firecrawl.v2.methods.aio.scrape import _prepare_scrape_request
+
+
+class TestAsyncScrapeRequestPreparation:
+    @pytest.mark.asyncio
+    async def test_basic_request_preparation(self):
+        payload = await _prepare_scrape_request("https://example.com", None)
+        assert payload["url"] == "https://example.com"
+
+    @pytest.mark.asyncio
+    async def test_options_conversion(self):
+        opts = ScrapeOptions(
+            formats=["markdown", {"type": "screenshot", "full_page": True, "quality": 80}],
+            include_tags=["main"],
+            exclude_tags=["nav"],
+            only_main_content=True,
+            wait_for=500,
+            timeout=30000,
+            mobile=True,
+            parsers=["pdf"],
+            location=Location(country="us", languages=["en"]),
+            skip_tls_verification=False,
+            remove_base64_images=False,
+            fast_mode=True,
+            use_mock="test",
+            block_ads=False,
+            proxy="basic",
+            max_age=1000,
+            store_in_cache=False,
+        )
+        payload = await _prepare_scrape_request("https://example.com", opts)
+        assert payload["url"] == "https://example.com"
+        assert isinstance(payload.get("formats"), list) and "markdown" in payload["formats"]
+        assert payload["includeTags"] == ["main"]
+        assert payload["excludeTags"] == ["nav"]
+        assert payload["onlyMainContent"] is True
+        assert payload["waitFor"] == 500
+        assert payload["timeout"] == 30000
+        assert payload["mobile"] is True
+        assert payload["skipTlsVerification"] is False
+        assert payload["removeBase64Images"] is False
+        assert payload["fastMode"] is True
+        assert payload["useMock"] == "test"
+        assert payload["blockAds"] is False
+        assert payload["proxy"] == "basic"
+        assert payload["maxAge"] == 1000
+        assert payload["storeInCache"] is False
+
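Note the mixed `formats` list: a plain string ("markdown") and an option object ({"type": "screenshot", "full_page": True, "quality": 80}) sit side by side, and the test only requires that the string entry survive into the payload. One plausible normalization, sketched under that assumption (hypothetical; the shipped code is in the v2 scrape preparation path):

    # Hypothetical normalization of a mixed formats list; how dict entries are
    # transformed internally is an assumption, not taken from the package.
    def normalize_formats(formats: list) -> list:
        normalized = []
        for fmt in formats:
            if isinstance(fmt, str):
                normalized.append(fmt)  # e.g. "markdown" passes through unchanged
            elif isinstance(fmt, dict):
                normalized.append(dict(fmt))  # option objects kept as dicts
            else:
                raise ValueError(f"unsupported format entry: {fmt!r}")
        return normalized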
@@ -0,0 +1,64 @@
+import pytest
+from firecrawl.v2.types import SearchRequest, ScrapeOptions
+from firecrawl.v2.methods.aio.search import _prepare_search_request
+
+
+class TestAsyncSearchRequestPreparation:
+    def test_basic_request_preparation(self):
+        request = SearchRequest(query="test query")
+        data = _prepare_search_request(request)
+        assert data["query"] == "test query"
+        assert "ignore_invalid_urls" not in data
+        assert "scrape_options" not in data
+
+    def test_all_fields_conversion(self):
+        scrape_opts = ScrapeOptions(
+            formats=["markdown"],
+            headers={"User-Agent": "Test"},
+            include_tags=["h1", "h2"],
+            exclude_tags=["nav"],
+            only_main_content=False,
+            timeout=15000,
+            wait_for=2000,
+            mobile=True,
+            skip_tls_verification=True,
+            remove_base64_images=False,
+        )
+        request = SearchRequest(
+            query="test query",
+            sources=["web", "news"],
+            limit=10,
+            tbs="qdr:w",
+            location="US",
+            ignore_invalid_urls=False,
+            timeout=30000,
+            scrape_options=scrape_opts,
+            integration=" _unit-test ",
+        )
+        data = _prepare_search_request(request)
+        assert data["ignoreInvalidURLs"] is False
+        assert "scrapeOptions" in data
+        assert data["integration"] == "_unit-test"
+
+    def test_exclude_none_behavior(self):
+        request = SearchRequest(
+            query="test",
+            sources=None,
+            limit=None,
+            tbs=None,
+            location=None,
+            ignore_invalid_urls=None,
+            timeout=None,
+            scrape_options=None,
+        )
+        data = _prepare_search_request(request)
+        assert "query" in data
+        assert len(data) == 1
+
+    def test_empty_scrape_options(self):
+        request = SearchRequest(query="test", scrape_options=ScrapeOptions())
+        data = _prepare_search_request(request)
+        assert "scrapeOptions" in data
+        scrape_data = data["scrapeOptions"]
+        assert "onlyMainContent" in scrape_data
+        assert "mobile" in scrape_data
@@ -0,0 +1,28 @@
+from firecrawl.v2.types import ScrapeOptions, Location
+from firecrawl.v2.methods.aio.batch import _prepare as _prepare_batch
+
+
+class TestAsyncBatchRequestPreparation:
+    def test_urls_validation_and_conversion(self):
+        payload = _prepare_batch(["https://example.com", "http://foo.bar"], options=None)
+        assert payload["urls"] == ["https://example.com", "http://foo.bar"]
+
+    def test_options_and_batch_fields(self):
+        opts = ScrapeOptions(formats=["markdown"], only_main_content=True)
+        payload = _prepare_batch(
+            ["https://example.com"],
+            options=opts,
+            webhook="https://hook.example",
+            append_to_id="00000000-0000-0000-0000-000000000000",
+            ignore_invalid_urls=True,
+            max_concurrency=3,
+            zero_data_retention=True,
+            integration="zapier",
+        )
+        assert payload["webhook"] == "https://hook.example"
+        assert payload["appendToId"] == "00000000-0000-0000-0000-000000000000"
+        assert payload["ignoreInvalidURLs"] is True
+        assert payload["maxConcurrency"] == 3
+        assert payload["zeroDataRetention"] is True
+        assert payload["integration"] == "zapier"
+
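Written out, the batch payload that `test_options_and_batch_fields` asserts piece by piece looks like this (scrape-option keys elided where the test does not check them):

    {
        "urls": ["https://example.com"],
        "webhook": "https://hook.example",
        "appendToId": "00000000-0000-0000-0000-000000000000",
        "ignoreInvalidURLs": True,
        "maxConcurrency": 3,
        "zeroDataRetention": True,
        "integration": "zapier",
        # ...plus the camelCased scrape options ("formats", "onlyMainContent")
    }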
@@ -0,0 +1,117 @@
+import asyncio
+import time
+import httpx
+import pytest
+
+from firecrawl.v2.client_async import AsyncFirecrawlClient
+from firecrawl.v2.utils.http_client_async import AsyncHttpClient
+from firecrawl.v2.utils.http_client import HttpClient
+from firecrawl.v2.methods.aio import batch as aio_batch
+
+
+@pytest.mark.asyncio
+async def test_scrape_concurrency(monkeypatch):
+    async def fake_post(self, endpoint, data, headers=None, timeout=None):
+        await asyncio.sleep(0.1)
+        return httpx.Response(200, json={"success": True, "data": {}})
+
+    monkeypatch.setattr(AsyncHttpClient, "post", fake_post)
+
+    client = AsyncFirecrawlClient(api_key="test", api_url="http://localhost")
+
+    start = time.perf_counter()
+    await asyncio.gather(
+        client.scrape("https://firecrawl.dev"),
+        client.scrape("https://firecrawl.dev"),
+        client.scrape("https://firecrawl.dev")
+    )
+    elapsed = time.perf_counter() - start
+
+    # If calls run concurrently, total should be close to single 0.1s delay, not 0.3s
+    assert elapsed < 0.25
+
+
+@pytest.mark.asyncio
+async def test_event_loop_not_blocked(monkeypatch):
+    ticks = 0
+
+    async def ticker():
+        nonlocal ticks
+        for _ in range(5):
+            await asyncio.sleep(0.05)
+            ticks += 1
+
+    async def fake_post(self, endpoint, data, headers=None, timeout=None):
+        await asyncio.sleep(0.2)
+        return httpx.Response(200, json={"success": True, "data": {}})
+
+    monkeypatch.setattr(AsyncHttpClient, "post", fake_post)
+
+    client = AsyncFirecrawlClient(api_key="test", api_url="http://localhost")
+
+    await asyncio.gather(ticker(), client.scrape("https://a"))
+    # If scrape awaited properly, ticker should have progressed several steps
+    assert ticks >= 3
+
+
+@pytest.mark.asyncio
+async def test_wait_batch_scrape_polling_interval(monkeypatch):
+    # Simulate one scraping status then completed
+    class S:  # simple status holder
+        def __init__(self, status):
+            self.status = status
+
+    states = ["scraping", "completed"]
+
+    async def fake_status(client, job_id):
+        state = states.pop(0)
+        return S(state)
+
+    monkeypatch.setattr(aio_batch, "get_batch_scrape_status", fake_status)
+
+    client = AsyncFirecrawlClient(api_key="test", api_url="http://localhost")
+
+    start = time.perf_counter()
+    await client.wait_batch_scrape("job-1", poll_interval=0.1, timeout=2)
+    elapsed = time.perf_counter() - start
+
+    # Should take roughly one poll interval to reach completed
+    assert 0.09 <= elapsed <= 0.5
+
+
+@pytest.mark.asyncio
+async def test_async_transport_used_no_threads(monkeypatch):
+    # Make any to_thread usage blow up
+    monkeypatch.setattr(asyncio, "to_thread", lambda *a, **k: (_ for _ in ()).throw(RuntimeError("to_thread not allowed")))
+    # Make sync HttpClient unusable
+    def _boom(*a, **k):
+        raise RuntimeError("sync client should not be used")
+    monkeypatch.setattr(HttpClient, "post", _boom)
+    monkeypatch.setattr(HttpClient, "get", _boom)
+    monkeypatch.setattr(HttpClient, "delete", _boom)
+
+    # Track true async concurrency
+    active = 0
+    max_active = 0
+    async def fake_post(self, endpoint, data, headers=None, timeout=None):
+        nonlocal active, max_active
+        active += 1
+        max_active = max(max_active, active)
+        try:
+            await asyncio.sleep(0.1)
+            return httpx.Response(200, json={"success": True, "data": {}})
+        finally:
+            active -= 1
+
+    monkeypatch.setattr(AsyncHttpClient, "post", fake_post)
+
+    client = AsyncFirecrawlClient(api_key="test", api_url="http://localhost")
+
+    await asyncio.gather(
+        client.scrape("https://firecrawl.dev"),
+        client.scrape("https://firecrawl.dev"),
+        client.search("q"),  # uses async search
+    )
+
+    assert max_active >= 2
+
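A side note on `test_async_transport_used_no_threads`: the `(_ for _ in ()).throw(...)` construct is a standard trick for raising an exception from inside a lambda, where a `raise` statement is not allowed. Calling `.throw()` on the empty generator raises immediately, so the patch is equivalent to this plain function:

    # Equivalent to the lambda used above: any call to asyncio.to_thread
    # during the test fails loudly instead of silently falling back to threads.
    def no_to_thread(*args, **kwargs):
        raise RuntimeError("to_thread not allowed")

    # monkeypatch.setattr(asyncio, "to_thread", no_to_thread)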