firecrawl 4.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
firecrawl/__init__.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Firecrawl Python SDK
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
from .client import Firecrawl, AsyncFirecrawl, FirecrawlApp, AsyncFirecrawlApp
|
|
10
|
+
from .v2.watcher import Watcher
|
|
11
|
+
from .v2.watcher_async import AsyncWatcher
|
|
12
|
+
from .v1 import (
|
|
13
|
+
V1FirecrawlApp,
|
|
14
|
+
AsyncV1FirecrawlApp,
|
|
15
|
+
V1JsonConfig,
|
|
16
|
+
V1ScrapeOptions,
|
|
17
|
+
V1ChangeTrackingOptions,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Package version; keep in sync with the distribution metadata.
__version__ = "4.12.0"

# Define the logger for the Firecrawl project
# Shared by the whole package; handlers/levels are attached by setup_logging().
logger: logging.Logger = logging.getLogger("firecrawl")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _configure_logger() -> None:
    """Attach a console handler with the SDK's log format to ``logger``.

    The handler writes to the default stream with a timestamped format.
    Failures are reported through the logger itself rather than raised,
    so logging setup can never break an import of the package.
    """
    try:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter(
                "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
                datefmt="%Y-%m-%d %H:%M:%S",
            )
        )
        logger.addHandler(handler)
    except Exception as e:
        logger.error("Failed to configure logging: %s", e)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Maps FIRECRAWL_LOGGING_LEVEL values onto stdlib logging levels.
_LEVELS = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
    "CRITICAL": logging.CRITICAL,
}


def setup_logging() -> None:
    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable.

    If the host application already attached handlers, nothing is changed.
    With no environment variable set, a NullHandler is installed so the
    library stays silent (standard library-logging practice). Otherwise a
    console handler is attached and the level is taken from the variable,
    falling back to INFO (with a warning) for unknown values.
    """
    if logger.hasHandlers():
        # Respect handlers installed by the embedding application.
        return

    if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
        logger.addHandler(logging.NullHandler())
        return

    _configure_logger()

    # Table lookup instead of an if/elif chain; unknown names default to INFO.
    level = _LEVELS.get(env)
    if level is None:
        logger.setLevel(logging.INFO)
        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
    else:
        logger.setLevel(level)
|
|
71
|
+
|
|
72
|
+
# Configure logging at import time so FIRECRAWL_LOGGING_LEVEL takes effect
# before any client is created.
setup_logging()
logger.debug("Debugging logger setup")

# Public API of the package.
__all__ = [
    'Firecrawl',
    'AsyncFirecrawl',
    'FirecrawlApp',
    'AsyncFirecrawlApp',
    'Watcher',
    'AsyncWatcher',
    'V1FirecrawlApp',
    'AsyncV1FirecrawlApp',
    'V1JsonConfig',
    'V1ScrapeOptions',
    'V1ChangeTrackingOptions',
]
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pytest configuration for async E2E tests.
|
|
3
|
+
Ensures environment variables are loaded before any test runs.
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
import pytest
|
|
7
|
+
from dotenv import load_dotenv, find_dotenv
|
|
8
|
+
|
|
9
|
+
# Load environment IMMEDIATELY at module import time (before pytest collects tests)
|
|
10
|
+
# This ensures env vars are loaded before the first test runs
|
|
11
|
+
# Tracks whether the .env file has already been processed for this process.
_env_loaded = False


def _ensure_env_loaded():
    """Load environment variables from a .env file, at most once."""
    global _env_loaded
    if _env_loaded:
        return
    # Prefer a .env discovered from the current working directory; fall back
    # to python-dotenv's default search when none is found.
    dotenv_path = find_dotenv(usecwd=True)
    if dotenv_path:
        load_dotenv(dotenv_path, override=True)
    else:
        load_dotenv(override=True)
    _env_loaded = True


# Load env immediately when this module is imported, before pytest collects tests.
_ensure_env_loaded()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@pytest.fixture(scope="session", autouse=True)
def load_environment():
    """Ensure environment variables are loaded before running any tests.

    Skips the whole E2E session when API_KEY or API_URL is missing, instead
    of letting every test fail individually.
    """
    # Double-check env is loaded (should already be done at import time)
    _ensure_env_loaded()

    # Validate required environment variables
    required_vars = ["API_KEY", "API_URL"]
    missing_vars = [var for var in required_vars if not os.getenv(var)]

    if missing_vars:
        pytest.skip(f"Skipping E2E tests: Missing required environment variables: {', '.join(missing_vars)}")

    yield
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@pytest.fixture(scope="function")
def api_key():
    """Yield the API key from the environment, skipping the test if unset."""
    value = os.getenv("API_KEY")
    if not value:
        pytest.skip("API_KEY not set")
    return value


@pytest.fixture(scope="function")
def api_url():
    """Yield the API base URL from the environment, skipping the test if unset."""
    value = os.getenv("API_URL")
    if not value:
        pytest.skip("API_URL not set")
    return value
|
|
62
|
+
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import pytest
|
|
3
|
+
from firecrawl import AsyncFirecrawl
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@pytest.mark.asyncio
async def test_async_batch_start_and_status(api_key, api_url):
    """Start a batch scrape and poll its status until a terminal state.

    The polling loop is bounded by a wall-clock deadline so a stuck job
    cannot hang the suite.
    """
    client = AsyncFirecrawl(api_key=api_key, api_url=api_url)
    start = await client.start_batch_scrape([
        "https://docs.firecrawl.dev",
        "https://firecrawl.dev",
    ], formats=["markdown"], max_concurrency=1)
    job_id = start.id

    # asyncio.get_running_loop() is the supported accessor inside a coroutine;
    # get_event_loop() is deprecated in that context. Hoisted out of the loop.
    loop = asyncio.get_running_loop()
    deadline = loop.time() + 240
    status = await client.get_batch_scrape_status(job_id)
    while status.status not in ("completed", "failed", "cancelled") and loop.time() < deadline:
        await asyncio.sleep(2)
        status = await client.get_batch_scrape_status(job_id)

    assert status.status in ("completed", "failed", "cancelled")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@pytest.mark.asyncio
async def test_async_batch_wait_minimal(api_key, api_url):
    """Waiter variant of batch scraping with only the required arguments."""
    client = AsyncFirecrawl(api_key=api_key, api_url=api_url)
    job = await client.batch_scrape([
        "https://docs.firecrawl.dev",
        "https://firecrawl.dev",
    ], formats=["markdown"], poll_interval=1, timeout=120)
    assert job.status in ("completed", "failed")


@pytest.mark.asyncio
async def test_async_batch_wait_with_all_params(api_key, api_url):
    """Exercise the full keyword surface of batch_scrape in a single call."""
    client = AsyncFirecrawl(api_key=api_key, api_url=api_url)
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}
    job = await client.batch_scrape(
        [
            "https://docs.firecrawl.dev",
            "https://firecrawl.dev",
        ],
        formats=[
            "markdown",
            {"type": "json", "prompt": "Extract page title", "schema": json_schema},
            {"type": "changeTracking", "prompt": "Track changes", "modes": ["json"]},
        ],
        only_main_content=True,
        mobile=False,
        ignore_invalid_urls=True,
        max_concurrency=2,
        zero_data_retention=False,
        poll_interval=1,
        timeout=180,
        integration="_e2e-test",
    )
    assert job.status in ("completed", "failed")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@pytest.mark.asyncio
async def test_async_cancel_batch(api_key, api_url):
    """A freshly started batch scrape job can be cancelled."""
    firecrawl = AsyncFirecrawl(api_key=api_key, api_url=api_url)
    urls = [
        "https://docs.firecrawl.dev",
        "https://firecrawl.dev",
    ]
    started = await firecrawl.start_batch_scrape(urls, formats=["markdown"], max_concurrency=1)
    cancelled = await firecrawl.cancel_batch_scrape(started.id)
    assert cancelled is True
|
|
69
|
+
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
import os
import asyncio
import pytest
from dotenv import load_dotenv
from firecrawl import AsyncFirecrawl
from firecrawl.v2.types import ScrapeOptions


# Load credentials from a local .env file before any test in this module runs.
load_dotenv()

# These E2E tests need live credentials; fail fast at import time rather than
# producing a confusing error per test.
if not os.getenv("API_KEY"):
    raise ValueError("API_KEY is not set")

if not os.getenv("API_URL"):
    raise ValueError("API_URL is not set")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@pytest.mark.asyncio
async def test_async_crawl_start_and_status():
    """Start a crawl and poll until it reaches a terminal state.

    Polling is bounded by a deadline so a stalled crawl cannot hang the suite.
    """
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    start = await client.start_crawl("https://docs.firecrawl.dev", limit=2)
    job_id = start.id

    # asyncio.get_running_loop() is the supported accessor inside a coroutine;
    # get_event_loop() is deprecated in that context. Hoisted out of the loop.
    loop = asyncio.get_running_loop()
    deadline = loop.time() + 180
    status = await client.get_crawl_status(job_id)
    while status.status not in ("completed", "failed") and loop.time() < deadline:
        await asyncio.sleep(2)
        status = await client.get_crawl_status(job_id)

    assert status.status in ("completed", "failed")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@pytest.mark.asyncio
async def test_async_crawl_with_all_params():
    """Smoke-test crawl() with its full keyword surface, incl. nested scrape options."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    # rich scrape options including json format
    json_schema = {
        "type": "object",
        "properties": {"title": {"type": "string"}},
        "required": ["title"],
    }
    status = await client.crawl(
        url="https://docs.firecrawl.dev",
        prompt="Extract docs and blog",
        include_paths=["/docs/*", "/blog/*"],
        exclude_paths=["/admin/*"],
        max_discovery_depth=2,
        ignore_sitemap=False,
        ignore_query_parameters=True,
        limit=5,
        crawl_entire_domain=False,
        allow_external_links=True,
        allow_subdomains=True,
        delay=1,
        max_concurrency=2,
        integration="_e2e-test",
        webhook="https://example.com/hook",
        scrape_options=ScrapeOptions(
            formats=[
                "markdown",
                "rawHtml",
                {"type": "json", "prompt": "Extract title", "schema": json_schema},
            ],
            only_main_content=True,
            mobile=False,
            timeout=20000,
            wait_for=500,
            skip_tls_verification=False,
            remove_base64_images=False,
        ),
        zero_data_retention=False,
        poll_interval=2,
        timeout=180,
    )
    assert status.status in ("completed", "failed")
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@pytest.mark.asyncio
async def test_async_start_crawl_with_options():
    """start_crawl returns immediately with a job handle (id + url)."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    start = await client.start_crawl("https://docs.firecrawl.dev", limit=5, max_discovery_depth=2)
    assert start.id is not None and start.url is not None


@pytest.mark.asyncio
async def test_async_start_crawl_with_prompt():
    """start_crawl also accepts a natural-language prompt."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    start = await client.start_crawl("https://firecrawl.dev", prompt="Extract all blog posts", limit=3)
    assert start.id is not None and start.url is not None


@pytest.mark.asyncio
async def test_async_get_crawl_status_shape():
    """Verifies the shape of the status object, not the crawl outcome."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    start = await client.start_crawl("https://docs.firecrawl.dev", limit=3)
    status = await client.get_crawl_status(start.id)
    assert status.status in ("scraping", "completed", "failed")
    assert status.completed >= 0
    assert status.expires_at is not None
    assert isinstance(status.data, list)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@pytest.mark.asyncio
async def test_async_crawl_with_wait():
    """crawl() waits for the job and returns a terminal status with data."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    job = await client.crawl(url="https://docs.firecrawl.dev", limit=3, max_discovery_depth=2, poll_interval=1, timeout=120)
    assert job.status in ("completed", "failed")
    assert job.completed >= 0 and job.total >= 0 and isinstance(job.data, list)


@pytest.mark.asyncio
async def test_async_crawl_with_prompt_and_wait():
    """Same as above, but the crawl is driven by a prompt."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    job = await client.crawl(url="https://docs.firecrawl.dev", prompt="Extract all blog posts", limit=3, poll_interval=1, timeout=120)
    assert job.status in ("completed", "failed")
    assert job.completed >= 0 and job.total >= 0 and isinstance(job.data, list)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@pytest.mark.asyncio
async def test_async_crawl_with_scrape_options():
    """start_crawl accepts a ScrapeOptions object for per-page behavior."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    scrape_opts = ScrapeOptions(formats=["markdown", "links"], only_main_content=False, mobile=True)
    start = await client.start_crawl("https://docs.firecrawl.dev", limit=2, scrape_options=scrape_opts)
    assert start.id is not None


@pytest.mark.asyncio
async def test_async_crawl_with_json_format_object():
    """start_crawl accepts a dict-style json format with a schema."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}
    scrape_opts = ScrapeOptions(formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}])
    start = await client.start_crawl("https://docs.firecrawl.dev", limit=2, scrape_options=scrape_opts)
    assert start.id is not None
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@pytest.mark.asyncio
async def test_async_cancel_crawl():
    """A freshly started crawl can be cancelled."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    start = await client.start_crawl("https://docs.firecrawl.dev", limit=3)
    cancelled = await client.cancel_crawl(start.id)
    assert cancelled is True


@pytest.mark.asyncio
async def test_async_get_crawl_errors_and_invalid_job():
    """get_crawl_errors returns an error report; unknown job ids raise."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    start = await client.start_crawl("https://docs.firecrawl.dev", limit=2)
    errs = await client.get_crawl_errors(start.id)
    assert hasattr(errs, "errors") and hasattr(errs, "robots_blocked")
    with pytest.raises(Exception):
        await client.get_crawl_errors("invalid-job-id-12345")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@pytest.mark.asyncio
async def test_async_active_crawls():
    """active_crawls() responds with the expected top-level shape."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    resp = await client.active_crawls()
    assert hasattr(resp, "success") and hasattr(resp, "crawls")


@pytest.mark.asyncio
async def test_async_active_crawls_with_running_crawl():
    """A newly started crawl shows up in the active-crawls listing."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    start = await client.start_crawl("https://docs.firecrawl.dev", limit=3)
    # fetch active crawls and assert our ID is listed
    # NOTE(review): assumes the job is still active when listed; a crawl that
    # finishes very quickly could make this flaky — confirm against API timing.
    active = await client.active_crawls()
    ids = [c.id for c in active.crawls]
    assert start.id in ids
    # cleanup
    await client.cancel_crawl(start.id)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@pytest.mark.asyncio
async def test_async_crawl_params_preview():
    """crawl_params_preview suggests crawl settings from a prompt."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    params = await client.crawl_params_preview(
        url="https://docs.firecrawl.dev",
        prompt="Extract all blog posts and documentation",
    )
    assert params is not None
    # basic sanity: at least one field should be suggested
    has_any = any([
        getattr(params, "limit", None) is not None,
        getattr(params, "include_paths", None) is not None,
        getattr(params, "max_discovery_depth", None) is not None,
    ])
    assert has_any
|
|
188
|
+
|
|
189
|
+
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import os
import pytest
from dotenv import load_dotenv
from firecrawl import AsyncFirecrawl


# Load credentials from a local .env file before any test in this module runs.
load_dotenv()

# Fail fast at import time when live-API credentials are missing.
if not os.getenv("API_KEY"):
    raise ValueError("API_KEY is not set")

if not os.getenv("API_URL"):
    raise ValueError("API_URL is not set")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@pytest.mark.asyncio
async def test_async_extract_minimal():
    """extract() works with just URLs and a prompt."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    res = await client.extract(urls=["https://docs.firecrawl.dev"], prompt="Extract title")
    assert res is not None


@pytest.mark.asyncio
async def test_async_extract_with_schema_and_options():
    """extract() accepts a JSON schema plus the optional tuning flags."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    schema = {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}
    res = await client.extract(
        urls=["https://docs.firecrawl.dev"],
        prompt="Extract title",
        schema=schema,
        system_prompt="You are a helpful extractor",
        allow_external_links=False,
        enable_web_search=False,
        show_sources=False,
        integration="_e2e-test",
        # agent={"model": "FIRE-1", "prompt": "Extract title"}, # Skipping agent test in CI
    )
    assert res is not None
|
|
39
|
+
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import os
import pytest
from dotenv import load_dotenv
from firecrawl import AsyncFirecrawl


# Load credentials from a local .env file before any test in this module runs.
load_dotenv()

# Fail fast at import time when live-API credentials are missing.
if not os.getenv("API_KEY"):
    raise ValueError("API_KEY is not set")

if not os.getenv("API_URL"):
    raise ValueError("API_URL is not set")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@pytest.mark.asyncio
async def test_async_map_minimal():
    """map() returns a links list whose entries carry absolute URLs."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    resp = await client.map("https://docs.firecrawl.dev")
    assert hasattr(resp, "links") and isinstance(resp.links, list)
    if resp.links:
        first = resp.links[0]
        assert hasattr(first, "url") and isinstance(first.url, str) and first.url.startswith("http")


@pytest.mark.asyncio
@pytest.mark.parametrize("sitemap", ["only", "include", "skip"])
async def test_async_map_with_all_params(sitemap):
    """map() accepts its optional parameters for every sitemap mode."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    resp = await client.map(
        "https://docs.firecrawl.dev",
        search="docs",
        include_subdomains=True,
        limit=10,
        sitemap=sitemap,
        timeout=15000,
        integration="_e2e-test",
    )
    assert hasattr(resp, "links") and isinstance(resp.links, list)
    # limit=10 must cap the number of returned links
    assert len(resp.links) <= 10
|
|
41
|
+
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import os
import pytest
from dotenv import load_dotenv
from firecrawl import AsyncFirecrawl
from firecrawl.v2.types import Document


# Load credentials from a local .env file before any test in this module runs.
load_dotenv()

# Fail fast at import time when live-API credentials are missing.
if not os.getenv("API_KEY"):
    raise ValueError("API_KEY is not set")

if not os.getenv("API_URL"):
    raise ValueError("API_URL is not set")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@pytest.mark.asyncio
async def test_async_scrape_minimal():
    """A default scrape returns a Document with at least one populated payload field."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    doc = await client.scrape("https://docs.firecrawl.dev")
    assert isinstance(doc, Document)
    # The default format set is server-defined, so accept any populated field.
    has_payload = (
        (doc.markdown and len(doc.markdown) > 0)
        or (doc.html and len(doc.html) > 0)
        or (doc.raw_html and len(doc.raw_html) > 0)
        or (doc.links is not None)
        or (doc.screenshot is not None)
        or (doc.json is not None)
        or (doc.summary is not None)
    )
    assert has_payload
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@pytest.mark.asyncio
async def test_async_scrape_with_all_params():
    """Smoke-test scrape() with its full keyword surface."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    # Include multiple formats with configuration
    json_schema = {
        "type": "object",
        "properties": {"title": {"type": "string"}},
        "required": ["title"],
    }
    doc = await client.scrape(
        "https://docs.firecrawl.dev",
        formats=[
            "markdown",
            "rawHtml",
            {"type": "screenshot", "full_page": False, "quality": 70},
            {"type": "json", "prompt": "Extract title", "schema": json_schema},
        ],
        headers={"User-Agent": "E2E-AIO"},
        include_tags=["main"],
        exclude_tags=["nav"],
        only_main_content=True,
        timeout=20000,
        wait_for=500,
        mobile=False,
        parsers=["pdf"],
        actions=[],
        skip_tls_verification=False,
        remove_base64_images=False,
        fast_mode=False,
        use_mock=None,
        block_ads=False,
        proxy="basic",
        max_age=0,
        store_in_cache=False,
        integration="_e2e-test",
    )
    assert isinstance(doc, Document)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@pytest.mark.asyncio
async def test_async_scrape_with_options_markdown():
    """Markdown format together with page-level options."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    doc = await client.scrape(
        "https://docs.firecrawl.dev",
        formats=["markdown"],
        only_main_content=False,
        mobile=False,
    )
    assert isinstance(doc, Document)


@pytest.mark.asyncio
async def test_async_scrape_with_screenshot_action_viewport():
    """Screenshot format with an explicit viewport configuration."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    doc = await client.scrape(
        "https://docs.firecrawl.dev",
        formats=[{"type": "screenshot", "full_page": False, "quality": 80, "viewport": {"width": 800, "height": 600}}],
    )
    assert isinstance(doc, Document)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@pytest.mark.asyncio
@pytest.mark.parametrize("fmt,expect_field", [
    ("markdown", "markdown"),
    ("html", "html"),
    ("raw_html", "raw_html"),
    ("links", "links"),
    ("screenshot", "screenshot"),
    ("summary", "summary"),
])
async def test_async_scrape_basic_formats(fmt, expect_field):
    """Each basic format populates its matching Document attribute."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    doc = await client.scrape("https://docs.firecrawl.dev", formats=[fmt])
    assert isinstance(doc, Document)
    # The expect_field name matches the Document attribute, so dispatch via
    # getattr instead of an if/elif chain; links additionally must be a list.
    if expect_field == "links":
        assert isinstance(doc.links, list)
    else:
        assert getattr(doc, expect_field) is not None
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@pytest.mark.asyncio
async def test_async_scrape_with_json_format_object():
    """Dict-style json format with a schema produces a Document."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}
    doc = await client.scrape(
        "https://docs.firecrawl.dev",
        formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}],
        only_main_content=True,
    )
    assert isinstance(doc, Document)


@pytest.mark.asyncio
async def test_async_scrape_invalid_url():
    """An empty URL is rejected with ValueError."""
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    with pytest.raises(ValueError):
        await client.scrape("")
|
|
138
|
+
|