firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- firecrawl/__init__.py +27 -19
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- firecrawl/__tests__/e2e/v2/test_search.py +265 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +241 -0
- firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
- firecrawl/types.py +157 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +4653 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +802 -0
- firecrawl/v2/client_async.py +250 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/batch.py +85 -0
- firecrawl/v2/methods/aio/crawl.py +174 -0
- firecrawl/v2/methods/aio/extract.py +126 -0
- firecrawl/v2/methods/aio/map.py +59 -0
- firecrawl/v2/methods/aio/scrape.py +36 -0
- firecrawl/v2/methods/aio/search.py +58 -0
- firecrawl/v2/methods/aio/usage.py +42 -0
- firecrawl/v2/methods/batch.py +420 -0
- firecrawl/v2/methods/crawl.py +468 -0
- firecrawl/v2/methods/extract.py +131 -0
- firecrawl/v2/methods/map.py +77 -0
- firecrawl/v2/methods/scrape.py +68 -0
- firecrawl/v2/methods/search.py +173 -0
- firecrawl/v2/methods/usage.py +41 -0
- firecrawl/v2/types.py +546 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +153 -0
- firecrawl/v2/utils/http_client_async.py +64 -0
- firecrawl/v2/utils/validation.py +324 -0
- firecrawl/v2/watcher.py +312 -0
- firecrawl/v2/watcher_async.py +245 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
- firecrawl-3.0.3.dist-info/RECORD +78 -0
- tests/test_timeout_conversion.py +117 -0
- firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- firecrawl-2.16.5.dist-info/RECORD +0 -12
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
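The file list shows the shape of the 3.x rewrite: the monolithic firecrawl/firecrawl.py is parked as firecrawl.backup.py, the 2.x client survives under firecrawl/v1/, and a new v2 surface (sync client, async client, watchers) is re-exported through firecrawl/client.py. The sketch below is a hypothetical usage example inferred only from the e2e test files reproduced later in this diff; the calls match those tests, but defaults and return shapes beyond what the tests assert are assumptions.

import asyncio
import os

from firecrawl import Firecrawl, AsyncFirecrawl  # new 3.x entry points

def sync_example() -> None:
    # Synchronous client, as exercised in test_batch_scrape.py below
    client = Firecrawl(api_key=os.environ["API_KEY"], api_url=os.environ["API_URL"])
    job = client.batch_scrape(
        ["https://docs.firecrawl.dev", "https://firecrawl.dev"],
        formats=["markdown"],
        poll_interval=1,   # seconds between status polls, per the tests
        wait_timeout=120,  # give up after two minutes
    )
    print(job.status, job.completed, job.total)

async def async_example() -> None:
    # Async client, as exercised in the aio e2e tests below
    client = AsyncFirecrawl(api_key=os.environ["API_KEY"], api_url=os.environ["API_URL"])
    doc = await client.scrape("https://docs.firecrawl.dev")
    data = await client.search("firecrawl", sources=["web", "news"], limit=3)
    print(len(doc.markdown or ""), data.web is not None)

if __name__ == "__main__":
    sync_example()
    asyncio.run(async_example())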
firecrawl/__tests__/e2e/v2/aio/test_aio_search.py
@@ -0,0 +1,183 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawl
+from firecrawl.v2.types import ScrapeOptions, ScrapeFormats, SearchData, SearchResult, Document
+
+
+def _collect_texts(entries):
+    texts = []
+    for r in entries or []:
+        title = getattr(r, 'title', None) if hasattr(r, 'title') else None
+        desc = getattr(r, 'description', None) if hasattr(r, 'description') else None
+        if title:
+            texts.append(str(title).lower())
+        if desc:
+            texts.append(str(desc).lower())
+    return texts
+
+def _is_document(entry) -> bool:
+    try:
+        from firecrawl.v2.types import Document
+        return isinstance(entry, Document) or \
+            hasattr(entry, 'markdown') or \
+            hasattr(entry, 'html') or \
+            hasattr(entry, 'raw_html') or \
+            hasattr(entry, 'json') or \
+            hasattr(entry, 'screenshot') or \
+            hasattr(entry, 'change_tracking') or \
+            hasattr(entry, 'summary')
+    except Exception:
+        return hasattr(entry, 'markdown') or \
+            hasattr(entry, 'html') or \
+            hasattr(entry, 'raw_html') or \
+            hasattr(entry, 'json') or \
+            hasattr(entry, 'screenshot') or \
+            hasattr(entry, 'change_tracking') or \
+            hasattr(entry, 'summary')
+
+
+load_dotenv()
+
+if not os.getenv("API_KEY"):
+    raise ValueError("API_KEY is not set")
+
+if not os.getenv("API_URL"):
+    raise ValueError("API_URL is not set")
+
+
+@pytest.mark.asyncio
+async def test_async_search_minimal():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    data = await client.search("What is the capital of France?")
+    # Assert sections like sync tests
+    assert hasattr(data, "web")
+    assert hasattr(data, "news")
+    assert hasattr(data, "images")
+    assert data.web is not None
+    assert len(data.web) > 0
+    titles = [getattr(r, "title", None) for r in data.web]
+    descs = [getattr(r, "description", None) for r in data.web]
+    all_text = " ".join([t.lower() for t in titles if t] + [d.lower() for d in descs if d])
+    assert "paris" in all_text
+    assert data.news is None
+    assert data.images is None
+
+
+@pytest.mark.asyncio
+async def test_async_search_with_sources_and_limit():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    data = await client.search("firecrawl", sources=["web", "news"], limit=3)
+    # Sections present
+    assert hasattr(data, "web") and hasattr(data, "news") and hasattr(data, "images")
+    # Web present, images absent, news optional but if present respects limit
+    if data.web is not None:
+        assert len(data.web) <= 3
+    if data.news is not None:
+        assert len(data.news) <= 3
+    assert data.images is None
+
+
+@pytest.mark.asyncio
+async def test_async_search_with_all_params():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    data = await client.search(
+        "artificial intelligence",
+        sources=["web", "news"],
+        limit=3,
+        tbs="qdr:w",
+        location="US",
+        ignore_invalid_urls=False,
+        timeout=30000,
+        scrape_options={
+            "formats": ["markdown"],
+            "headers": {"User-Agent": "E2E-AIO"},
+            "include_tags": ["h1"],
+            "exclude_tags": ["nav"],
+            "only_main_content": False,
+            "timeout": 15000,
+            "wait_for": 2000,
+            "mobile": True,
+            "skip_tls_verification": True,
+            "remove_base64_images": False,
+        },
+    )
+    # Structure and type assertions mirroring sync
+    assert isinstance(data, SearchData)
+    assert hasattr(data, "web") and hasattr(data, "news") and hasattr(data, "images")
+    assert data.web is not None
+    assert len(data.web) <= 3
+    non_doc = [r for r in (data.web or []) if not _is_document(r)]
+    if non_doc:
+        combined = " ".join(_collect_texts(non_doc))
+        ai_terms = ["artificial", "intelligence", "ai", "machine", "learning"]
+        assert any(term in combined for term in ai_terms)
+    for r in data.web:
+        assert isinstance(r, (SearchResult, Document))
+        if isinstance(r, Document):
+            assert (r.markdown is not None) or (r.html is not None)
+        else:
+            assert hasattr(r, "url")
+            assert isinstance(r.url, str) and r.url.startswith("http")
+    if data.news is not None:
+        assert len(data.news) <= 10
+        for r in data.news:
+            assert isinstance(r, (SearchResult, Document))
+            if isinstance(r, Document):
+                assert (r.markdown is not None) or (r.html is not None)
+            else:
+                assert isinstance(r.url, str) and r.url.startswith("http")
+    assert data.images is None
+
+
+@pytest.mark.asyncio
+async def test_async_search_minimal_content_check():
+    """Stronger assertion similar to sync: content check on a known query."""
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    data = await client.search("What is the capital of France?")
+    assert hasattr(data, "web") and data.web is not None
+    non_doc = [r for r in (data.web or []) if not _is_document(r)]
+    if non_doc:
+        combined = " ".join(_collect_texts(non_doc))
+        assert "paris" in combined
+
+
+@pytest.mark.asyncio
+async def test_async_search_result_structure():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    data = await client.search("test query", limit=1)
+    if data.web and len(data.web) > 0:
+        result = data.web[0]
+        assert hasattr(result, "url")
+        assert hasattr(result, "title")
+        assert hasattr(result, "description")
+        assert isinstance(result.url, str) and result.url.startswith("http")
+        assert isinstance(getattr(result, "title", None), (str, type(None)))
+        assert isinstance(getattr(result, "description", None), (str, type(None)))
+
+
+@pytest.mark.asyncio
+async def test_async_search_formats_flexibility():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    # list of strings
+    res1 = await client.search("python programming", limit=1, scrape_options=ScrapeOptions(formats=["markdown"]))
+    # list of objects
+    res2 = await client.search("python programming", limit=1, scrape_options=ScrapeOptions(formats=[{"type": "markdown"}]))
+    # ScrapeFormats object
+    res3 = await client.search("python programming", limit=1, scrape_options=ScrapeOptions(formats=ScrapeFormats(markdown=True)))
+    assert isinstance(res1, SearchData) and hasattr(res1, "web")
+    assert isinstance(res2, SearchData) and hasattr(res2, "web")
+    assert isinstance(res3, SearchData) and hasattr(res3, "web")
+
+
+@pytest.mark.asyncio
+async def test_async_search_json_format_object():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}
+    data = await client.search(
+        "site:docs.firecrawl.dev",
+        limit=1,
+        scrape_options={"formats": [{"type": "json", "prompt": "Extract page title", "schema": json_schema}]},
+    )
+    assert hasattr(data, "web")
+
firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py
@@ -0,0 +1,35 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawl
+
+
+load_dotenv()
+
+if not os.getenv("API_KEY"):
+    raise ValueError("API_KEY is not set")
+
+if not os.getenv("API_URL"):
+    raise ValueError("API_URL is not set")
+
+
+@pytest.mark.asyncio
+async def test_async_get_concurrency():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    conc = await client.get_concurrency()
+    assert hasattr(conc, "concurrency") and hasattr(conc, "max_concurrency")
+
+
+@pytest.mark.asyncio
+async def test_async_get_credit_usage():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    credits = await client.get_credit_usage()
+    assert hasattr(credits, "remaining_credits")
+
+
+@pytest.mark.asyncio
+async def test_async_get_token_usage():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    tokens = await client.get_token_usage()
+    assert hasattr(tokens, "remaining_tokens")
+
firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py
@@ -0,0 +1,43 @@
+import os
+import asyncio
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawl
+from firecrawl.v2.watcher_async import AsyncWatcher
+
+
+load_dotenv()
+
+if not os.getenv("API_KEY"):
+    raise ValueError("API_KEY is not set")
+
+if not os.getenv("API_URL"):
+    raise ValueError("API_URL is not set")
+
+
+@pytest.mark.asyncio
+async def test_async_watcher_crawl_progresses():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    start = await client.start_crawl("https://docs.firecrawl.dev", limit=2)
+    statuses = []
+    async for snapshot in AsyncWatcher(client, start.id, kind="crawl", timeout=180):
+        statuses.append(snapshot.status)
+        if snapshot.status in ("completed", "failed"):
+            break
+    assert statuses and statuses[-1] in ("completed", "failed")
+
+
+@pytest.mark.asyncio
+async def test_async_watcher_batch_progresses():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    start = await client.start_batch_scrape([
+        "https://docs.firecrawl.dev",
+        "https://firecrawl.dev",
+    ], formats=["markdown"], max_concurrency=1)
+    statuses = []
+    async for snapshot in AsyncWatcher(client, start.id, kind="batch", timeout=240):
+        statuses.append(snapshot.status)
+        if snapshot.status in ("completed", "failed", "cancelled"):
+            break
+    assert statuses and statuses[-1] in ("completed", "failed", "cancelled")
+
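The two tests above are the only place this diff exercises the new AsyncWatcher async iterator. A minimal standalone sketch of the same polling pattern, assuming only the constructor arguments (kind, timeout) that appear in the test; everything else is illustrative rather than the documented API:

import asyncio
import os

from firecrawl import AsyncFirecrawl
from firecrawl.v2.watcher_async import AsyncWatcher

async def watch_crawl(url: str) -> str:
    client = AsyncFirecrawl(api_key=os.environ["API_KEY"], api_url=os.environ["API_URL"])
    start = await client.start_crawl(url, limit=2)
    # Each snapshot carries the current job status; stop on a terminal state
    async for snapshot in AsyncWatcher(client, start.id, kind="crawl", timeout=180):
        if snapshot.status in ("completed", "failed"):
            return snapshot.status
    return "timed-out"

if __name__ == "__main__":
    print(asyncio.run(watch_crawl("https://docs.firecrawl.dev")))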
firecrawl/__tests__/e2e/v2/conftest.py
@@ -0,0 +1,73 @@
+import os
+import json
+import pytest
+import requests
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def _idmux(identity_request: dict) -> dict:
+    idmux_url = os.getenv("IDMUX_URL")
+    if not idmux_url:
+        raise EnvironmentError("IDMUX_URL is not set. E2E tests must use idmux for credentials.")
+    run_number = int(os.getenv("GITHUB_RUN_NUMBER") or 0)
+    payload = {
+        "refName": os.getenv("GITHUB_REF_NAME") or "local",
+        "runNumber": run_number,
+        "concurrency": identity_request.get("concurrency", 100),
+        **identity_request,
+    }
+    resp = requests.post(idmux_url + "/", json=payload)
+    resp.raise_for_status()
+    return resp.json()
+
+@pytest.fixture(scope="session")
+def api_url():
+    # Prefer TEST_URL, then FIRECRAWL_API_URL (for parity with JS), then legacy API_URL
+    return (
+        os.getenv("TEST_URL")
+        or os.getenv("FIRECRAWL_API_URL")
+        or os.getenv("API_URL")
+        or "https://api.firecrawl.dev"
+    )
+
+# Resolve identity and export environment at import time so tests that read env at module import succeed
+_IDENTITY = None
+_API_URL = (
+    os.getenv("TEST_URL")
+    or os.getenv("FIRECRAWL_API_URL")
+    or os.getenv("API_URL")
+    or "https://api.firecrawl.dev"
+)
+
+_IDMUX_URL = os.getenv("IDMUX_URL")
+if _IDMUX_URL:
+    run_name = os.getenv("PYTEST_RUN_NAME") or "py-e2e"
+    # If IDMUX_URL is set, idmux MUST succeed; do not silently fall back
+    _IDENTITY = _idmux({"name": run_name})
+    os.environ["API_KEY"] = _IDENTITY.get("apiKey", "")
+    os.environ["API_URL"] = _API_URL
+
+@pytest.fixture(scope="session")
+def api_identity():
+    return _IDENTITY or {"apiKey": os.getenv("API_KEY") or "", "teamId": os.getenv("TEST_TEAM_ID") or os.getenv("TEAM_ID") or ""}
+
+@pytest.fixture(autouse=True)
+def _inject_client(request, api_identity, api_url):
+    # For class-based tests that rely on self.client, inject a client if missing
+    inst = getattr(request, "instance", None)
+    if inst is not None and not hasattr(inst, "client"):
+        try:
+            from firecrawl import Firecrawl
+            inst.client = Firecrawl(api_key=api_identity.get("apiKey", ""), api_url=api_url)
+        except Exception:
+            pass
+    # For function-based modules that expect a module-level `firecrawl` symbol
+    mod = getattr(request, "module", None)
+    if mod is not None and not hasattr(mod, "firecrawl"):
+        try:
+            from firecrawl import Firecrawl
+            setattr(mod, "firecrawl", Firecrawl(api_key=api_identity.get("apiKey", ""), api_url=api_url))
+        except Exception:
+            pass
+
firecrawl/__tests__/e2e/v2/test_async.py
@@ -0,0 +1,73 @@
+import os
+import asyncio
+import pytest
+from dotenv import load_dotenv
+
+from firecrawl import AsyncFirecrawl
+from firecrawl.v2.types import Document
+
+
+load_dotenv()
+
+if not os.getenv("API_KEY"):
+    raise ValueError("API_KEY is not set")
+
+if not os.getenv("API_URL"):
+    raise ValueError("API_URL is not set")
+
+
+@pytest.mark.asyncio
+async def test_async_scrape_minimal():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    doc = await client.scrape("https://docs.firecrawl.dev")
+    assert isinstance(doc, Document)
+    # Accept any primary content or alternate outputs
+    assert doc.markdown is not None and doc.markdown and len(doc.markdown) > 0
+
+
+@pytest.mark.asyncio
+async def test_async_crawl_start_and_status():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    start = await client.start_crawl("https://docs.firecrawl.dev", limit=2)
+    job_id = start.id
+
+    # Poll status until terminal or timeout
+    deadline = asyncio.get_event_loop().time() + 180
+    status = await client.get_crawl_status(job_id)
+    while status.status not in ("completed", "failed") and asyncio.get_event_loop().time() < deadline:
+        await asyncio.sleep(2)
+        status = await client.get_crawl_status(job_id)
+
+    assert status.status in ("completed", "failed")
+
+
+@pytest.mark.asyncio
+async def test_async_batch_start_and_status():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    start = await client.start_batch_scrape([
+        "https://docs.firecrawl.dev",
+        "https://firecrawl.dev",
+    ], formats=["markdown"], max_concurrency=1)
+    job_id = start.id
+
+    deadline = asyncio.get_event_loop().time() + 240
+    status = await client.get_batch_scrape_status(job_id)
+    while status.status not in ("completed", "failed", "cancelled") and asyncio.get_event_loop().time() < deadline:
+        await asyncio.sleep(2)
+        status = await client.get_batch_scrape_status(job_id)
+
+    assert status.status in ("completed", "failed", "cancelled")
+
+
+@pytest.mark.asyncio
+async def test_async_usage_minimal():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    conc = await client.get_concurrency()
+    assert hasattr(conc, "concurrency") and hasattr(conc, "max_concurrency")
+
+    credits = await client.get_credit_usage()
+    assert hasattr(credits, "remaining_credits")
+
+    tokens = await client.get_token_usage()
+    assert hasattr(tokens, "remaining_tokens")
+
firecrawl/__tests__/e2e/v2/test_batch_scrape.py
@@ -0,0 +1,105 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import Firecrawl
+from firecrawl.v2.types import ScrapeOptions
+
+load_dotenv()
+
+if not os.getenv("API_KEY"):
+    raise ValueError("API_KEY is not set")
+
+if not os.getenv("API_URL"):
+    raise ValueError("API_URL is not set")
+
+
+class TestBatchScrapeE2E:
+    """End-to-end tests for batch scrape (v2)."""
+
+    def setup_method(self):
+        self.client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+    def test_batch_scrape_minimal(self):
+        """Start a small batch and wait for completion."""
+        urls = [
+            "https://docs.firecrawl.dev",
+            "https://firecrawl.dev",
+        ]
+
+        job = self.client.batch_scrape(urls, formats=["markdown"], poll_interval=1, wait_timeout=120)
+
+        assert job.status in ["completed", "failed"]
+        assert job.completed >= 0
+        assert job.total >= 0
+        assert isinstance(job.data, list)
+
+    def test_start_batch_minimal_and_status(self):
+        """Start via start_batch_scrape (minimal), then fetch status once."""
+        urls = [
+            "https://docs.firecrawl.dev",
+            "https://firecrawl.dev",
+        ]
+
+        start_resp = self.client.start_batch_scrape(urls, formats=["markdown"], ignore_invalid_urls=True)
+        assert start_resp.id is not None
+        assert start_resp.url is not None
+
+        job = self.client.get_batch_scrape_status(start_resp.id)
+        assert job.status in ["scraping", "completed", "failed"]
+        assert job.total >= 0
+
+    def test_wait_batch_with_all_params(self):
+        """Blocking waiter with JSON and changeTracking formats plus many options."""
+        urls = [
+            "https://docs.firecrawl.dev",
+            "https://firecrawl.dev",
+        ]
+
+        json_schema = {
+            "type": "object",
+            "properties": {
+                "title": {"type": "string"}
+            },
+            "required": ["title"],
+        }
+
+        opts = ScrapeOptions(
+            formats=[
+                "markdown",
+                {"type": "json", "prompt": "Extract page title", "schema": json_schema},
+                {"type": "changeTracking", "prompt": "Track changes", "modes": ["json"]},
+            ],
+            only_main_content=True,
+            mobile=False,
+        )
+
+        job = self.client.batch_scrape(
+            urls,
+            formats=opts.formats,
+            only_main_content=opts.only_main_content,
+            mobile=opts.mobile,
+            ignore_invalid_urls=True,
+            max_concurrency=2,
+            zero_data_retention=False,
+            poll_interval=1,
+            wait_timeout=180,
+        )
+
+        assert job.status in ["completed", "failed"]
+        assert job.completed >= 0
+        assert job.total >= 0
+        assert isinstance(job.data, list)
+
+    def test_cancel_batch(self):
+        """Start a batch and cancel it."""
+        urls = [
+            "https://docs.firecrawl.dev",
+            "https://firecrawl.dev",
+        ]
+
+        start_resp = self.client.start_batch_scrape(urls, formats=["markdown"], max_concurrency=1)
+        assert start_resp.id is not None
+
+        cancelled = self.client.cancel_batch_scrape(start_resp.id)
+        assert cancelled is True