firecrawl 2.5.4__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic. Click here for more details.
- firecrawl/__init__.py +1 -1
- firecrawl/firecrawl.py +12 -0
- {firecrawl-2.5.4.dist-info → firecrawl-2.6.0.dist-info}/LICENSE +0 -0
- {firecrawl-2.5.4.dist-info → firecrawl-2.6.0.dist-info}/METADATA +1 -1
- firecrawl-2.6.0.dist-info/RECORD +12 -0
- {firecrawl-2.5.4.dist-info → firecrawl-2.6.0.dist-info}/top_level.txt +0 -2
- build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4454
- build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4454
- build/lib/build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4454
- build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4454
- build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/firecrawl/firecrawl.py +0 -4454
- build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/firecrawl/__init__.py +0 -79
- build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/firecrawl/firecrawl.py +0 -4454
- build/lib/tests/test_change_tracking.py +0 -98
- firecrawl-2.5.4.dist-info/RECORD +0 -54
- {firecrawl-2.5.4.dist-info → firecrawl-2.6.0.dist-info}/WHEEL +0 -0
firecrawl/__init__.py
CHANGED
|
@@ -13,7 +13,7 @@ import os
|
|
|
13
13
|
|
|
14
14
|
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
|
|
15
15
|
|
|
16
|
-
__version__ = "2.5.4"
|
|
16
|
+
__version__ = "2.6.0"
|
|
17
17
|
|
|
18
18
|
# Define the logger for the Firecrawl project
|
|
19
19
|
logger: logging.Logger = logging.getLogger("firecrawl")
|
firecrawl/firecrawl.py
CHANGED
|
@@ -347,6 +347,7 @@ class GenerateLLMsTextParams(pydantic.BaseModel):
|
|
|
347
347
|
"""
|
|
348
348
|
maxUrls: Optional[int] = 10
|
|
349
349
|
showFullText: Optional[bool] = False
|
|
350
|
+
cache: Optional[bool] = True
|
|
350
351
|
__experimental_stream: Optional[bool] = None
|
|
351
352
|
|
|
352
353
|
class DeepResearchParams(pydantic.BaseModel):
|
|
@@ -1870,6 +1871,7 @@ class FirecrawlApp:
|
|
|
1870
1871
|
*,
|
|
1871
1872
|
max_urls: Optional[int] = None,
|
|
1872
1873
|
show_full_text: Optional[bool] = None,
|
|
1874
|
+
cache: Optional[bool] = None,
|
|
1873
1875
|
experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
|
|
1874
1876
|
"""
|
|
1875
1877
|
Generate LLMs.txt for a given URL and poll until completion.
|
|
@@ -1878,6 +1880,7 @@ class FirecrawlApp:
|
|
|
1878
1880
|
url (str): Target URL to generate LLMs.txt from
|
|
1879
1881
|
max_urls (Optional[int]): Maximum URLs to process (default: 10)
|
|
1880
1882
|
show_full_text (Optional[bool]): Include full text in output (default: False)
|
|
1883
|
+
cache (Optional[bool]): Whether to use cached content if available (default: True)
|
|
1881
1884
|
experimental_stream (Optional[bool]): Enable experimental streaming
|
|
1882
1885
|
|
|
1883
1886
|
Returns:
|
|
@@ -1893,6 +1896,7 @@ class FirecrawlApp:
|
|
|
1893
1896
|
params = GenerateLLMsTextParams(
|
|
1894
1897
|
maxUrls=max_urls,
|
|
1895
1898
|
showFullText=show_full_text,
|
|
1899
|
+
cache=cache,
|
|
1896
1900
|
__experimental_stream=experimental_stream
|
|
1897
1901
|
)
|
|
1898
1902
|
|
|
@@ -1900,6 +1904,7 @@ class FirecrawlApp:
|
|
|
1900
1904
|
url,
|
|
1901
1905
|
max_urls=max_urls,
|
|
1902
1906
|
show_full_text=show_full_text,
|
|
1907
|
+
cache=cache,
|
|
1903
1908
|
experimental_stream=experimental_stream
|
|
1904
1909
|
)
|
|
1905
1910
|
|
|
@@ -1935,6 +1940,7 @@ class FirecrawlApp:
|
|
|
1935
1940
|
*,
|
|
1936
1941
|
max_urls: Optional[int] = None,
|
|
1937
1942
|
show_full_text: Optional[bool] = None,
|
|
1943
|
+
cache: Optional[bool] = None,
|
|
1938
1944
|
experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
|
|
1939
1945
|
"""
|
|
1940
1946
|
Initiate an asynchronous LLMs.txt generation operation.
|
|
@@ -1943,6 +1949,7 @@ class FirecrawlApp:
|
|
|
1943
1949
|
url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
|
|
1944
1950
|
max_urls (Optional[int]): Maximum URLs to process (default: 10)
|
|
1945
1951
|
show_full_text (Optional[bool]): Include full text in output (default: False)
|
|
1952
|
+
cache (Optional[bool]): Whether to use cached content if available (default: True)
|
|
1946
1953
|
experimental_stream (Optional[bool]): Enable experimental streaming
|
|
1947
1954
|
|
|
1948
1955
|
Returns:
|
|
@@ -1957,6 +1964,7 @@ class FirecrawlApp:
|
|
|
1957
1964
|
params = GenerateLLMsTextParams(
|
|
1958
1965
|
maxUrls=max_urls,
|
|
1959
1966
|
showFullText=show_full_text,
|
|
1967
|
+
cache=cache,
|
|
1960
1968
|
__experimental_stream=experimental_stream
|
|
1961
1969
|
)
|
|
1962
1970
|
|
|
@@ -4001,6 +4009,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
4001
4009
|
url,
|
|
4002
4010
|
max_urls=max_urls,
|
|
4003
4011
|
show_full_text=show_full_text,
|
|
4012
|
+
cache=cache,
|
|
4004
4013
|
experimental_stream=experimental_stream
|
|
4005
4014
|
)
|
|
4006
4015
|
if not response.get('success') or 'id' not in response:
|
|
@@ -4027,6 +4036,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
4027
4036
|
*,
|
|
4028
4037
|
max_urls: Optional[int] = None,
|
|
4029
4038
|
show_full_text: Optional[bool] = None,
|
|
4039
|
+
cache: Optional[bool] = None,
|
|
4030
4040
|
experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
|
|
4031
4041
|
"""
|
|
4032
4042
|
Initiate an asynchronous LLMs.txt generation job without waiting for completion.
|
|
@@ -4035,6 +4045,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
4035
4045
|
url (str): Target URL to generate LLMs.txt from
|
|
4036
4046
|
max_urls (Optional[int]): Maximum URLs to process (default: 10)
|
|
4037
4047
|
show_full_text (Optional[bool]): Include full text in output (default: False)
|
|
4048
|
+
cache (Optional[bool]): Whether to use cached content if available (default: True)
|
|
4038
4049
|
experimental_stream (Optional[bool]): Enable experimental streaming
|
|
4039
4050
|
|
|
4040
4051
|
Returns:
|
|
@@ -4057,6 +4068,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
4057
4068
|
params = GenerateLLMsTextParams(
|
|
4058
4069
|
maxUrls=max_urls,
|
|
4059
4070
|
showFullText=show_full_text,
|
|
4071
|
+
cache=cache,
|
|
4060
4072
|
__experimental_stream=experimental_stream
|
|
4061
4073
|
)
|
|
4062
4074
|
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
firecrawl/__init__.py,sha256=bds9ny9yl_8sXYAmfjaVd_32uf8Qm0ZPbZ2V-LGTgGQ,2612
|
|
2
|
+
firecrawl/firecrawl.py,sha256=Y_8gS4vBPQZ_LP8UXdZAOEMEmK0kRv88Jst2dZ3VW8c,190089
|
|
3
|
+
firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
|
|
5
|
+
firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
|
|
7
|
+
tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
|
|
8
|
+
firecrawl-2.6.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
|
|
9
|
+
firecrawl-2.6.0.dist-info/METADATA,sha256=gOm5xIZTdtTxAIVZbjZFf5ALWIrIOGpYCFqQ9zW4PAU,7165
|
|
10
|
+
firecrawl-2.6.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
|
|
11
|
+
firecrawl-2.6.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
|
|
12
|
+
firecrawl-2.6.0.dist-info/RECORD,,
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
This is the Firecrawl package.
|
|
3
|
-
|
|
4
|
-
This package provides a Python SDK for interacting with the Firecrawl API.
|
|
5
|
-
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
|
|
6
|
-
and check the status of these jobs.
|
|
7
|
-
|
|
8
|
-
For more information visit https://github.com/firecrawl/
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import logging
|
|
12
|
-
import os
|
|
13
|
-
|
|
14
|
-
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
|
|
15
|
-
|
|
16
|
-
__version__ = "2.5.4"
|
|
17
|
-
|
|
18
|
-
# Define the logger for the Firecrawl project
|
|
19
|
-
logger: logging.Logger = logging.getLogger("firecrawl")
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def _configure_logger() -> None:
|
|
23
|
-
"""
|
|
24
|
-
Configure the firecrawl logger for console output.
|
|
25
|
-
|
|
26
|
-
The function attaches a handler for console output with a specific format and date
|
|
27
|
-
format to the firecrawl logger.
|
|
28
|
-
"""
|
|
29
|
-
try:
|
|
30
|
-
# Create the formatter
|
|
31
|
-
formatter = logging.Formatter(
|
|
32
|
-
"[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
|
|
33
|
-
datefmt="%Y-%m-%d %H:%M:%S",
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
# Create the console handler and set the formatter
|
|
37
|
-
console_handler = logging.StreamHandler()
|
|
38
|
-
console_handler.setFormatter(formatter)
|
|
39
|
-
|
|
40
|
-
# Add the console handler to the firecrawl logger
|
|
41
|
-
logger.addHandler(console_handler)
|
|
42
|
-
except Exception as e:
|
|
43
|
-
logger.error("Failed to configure logging: %s", e)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def setup_logging() -> None:
|
|
47
|
-
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
|
|
48
|
-
# Check if the firecrawl logger already has a handler
|
|
49
|
-
if logger.hasHandlers():
|
|
50
|
-
return # To prevent duplicate logging
|
|
51
|
-
|
|
52
|
-
# Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
|
|
53
|
-
if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
|
|
54
|
-
# Attach a no-op handler to prevent warnings about no handlers
|
|
55
|
-
logger.addHandler(logging.NullHandler())
|
|
56
|
-
return
|
|
57
|
-
|
|
58
|
-
# Attach the console handler to the firecrawl logger
|
|
59
|
-
_configure_logger()
|
|
60
|
-
|
|
61
|
-
# Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
|
|
62
|
-
if env == "DEBUG":
|
|
63
|
-
logger.setLevel(logging.DEBUG)
|
|
64
|
-
elif env == "INFO":
|
|
65
|
-
logger.setLevel(logging.INFO)
|
|
66
|
-
elif env == "WARNING":
|
|
67
|
-
logger.setLevel(logging.WARNING)
|
|
68
|
-
elif env == "ERROR":
|
|
69
|
-
logger.setLevel(logging.ERROR)
|
|
70
|
-
elif env == "CRITICAL":
|
|
71
|
-
logger.setLevel(logging.CRITICAL)
|
|
72
|
-
else:
|
|
73
|
-
logger.setLevel(logging.INFO)
|
|
74
|
-
logger.warning("Unknown logging level: %s, defaulting to INFO", env)
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
# Initialize logging configuration when the module is imported
|
|
78
|
-
setup_logging()
|
|
79
|
-
logger.debug("Debugging logger setup")
|
|
File without changes
|
build/lib/build/lib/build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py
DELETED
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
import importlib.util
|
|
2
|
-
import pytest
|
|
3
|
-
import time
|
|
4
|
-
import os
|
|
5
|
-
from uuid import uuid4
|
|
6
|
-
from dotenv import load_dotenv
|
|
7
|
-
|
|
8
|
-
load_dotenv()
|
|
9
|
-
|
|
10
|
-
API_URL = "http://127.0.0.1:3002"
|
|
11
|
-
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
|
|
12
|
-
TEST_API_KEY = os.getenv('TEST_API_KEY')
|
|
13
|
-
|
|
14
|
-
print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
|
|
15
|
-
|
|
16
|
-
spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
|
|
17
|
-
firecrawl = importlib.util.module_from_spec(spec)
|
|
18
|
-
spec.loader.exec_module(firecrawl)
|
|
19
|
-
FirecrawlApp = firecrawl.FirecrawlApp
|
|
20
|
-
|
|
21
|
-
def test_no_api_key():
|
|
22
|
-
with pytest.raises(Exception) as excinfo:
|
|
23
|
-
invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
|
|
24
|
-
assert "No API key provided" in str(excinfo.value)
|
|
25
|
-
|
|
26
|
-
def test_scrape_url_invalid_api_key():
|
|
27
|
-
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
|
28
|
-
with pytest.raises(Exception) as excinfo:
|
|
29
|
-
invalid_app.scrape_url('https://firecrawl.dev')
|
|
30
|
-
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
|
31
|
-
|
|
32
|
-
# def test_blocklisted_url():
|
|
33
|
-
# blocklisted_url = "https://facebook.com/fake-test"
|
|
34
|
-
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
35
|
-
# with pytest.raises(Exception) as excinfo:
|
|
36
|
-
# app.scrape_url(blocklisted_url)
|
|
37
|
-
# assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
|
38
|
-
|
|
39
|
-
def test_successful_response_with_valid_preview_token():
|
|
40
|
-
app = FirecrawlApp(api_url=API_URL, api_key=os.getenv('PREVIEW_TOKEN'), version='v0')
|
|
41
|
-
response = app.scrape_url('https://roastmywebsite.ai')
|
|
42
|
-
assert response is not None
|
|
43
|
-
assert 'content' in response
|
|
44
|
-
assert "_Roast_" in response['content']
|
|
45
|
-
|
|
46
|
-
def test_scrape_url_e2e():
|
|
47
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
48
|
-
response = app.scrape_url('https://roastmywebsite.ai')
|
|
49
|
-
print(response)
|
|
50
|
-
|
|
51
|
-
assert response is not None
|
|
52
|
-
assert 'content' in response
|
|
53
|
-
assert 'markdown' in response
|
|
54
|
-
assert 'metadata' in response
|
|
55
|
-
assert 'html' not in response
|
|
56
|
-
assert "_Roast_" in response['content']
|
|
57
|
-
|
|
58
|
-
def test_successful_response_with_valid_api_key_and_include_html():
|
|
59
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
60
|
-
response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
|
|
61
|
-
assert response is not None
|
|
62
|
-
assert 'content' in response
|
|
63
|
-
assert 'markdown' in response
|
|
64
|
-
assert 'html' in response
|
|
65
|
-
assert 'metadata' in response
|
|
66
|
-
assert "_Roast_" in response['content']
|
|
67
|
-
assert "_Roast_" in response['markdown']
|
|
68
|
-
assert "<h1" in response['html']
|
|
69
|
-
|
|
70
|
-
def test_successful_response_for_valid_scrape_with_pdf_file():
|
|
71
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
72
|
-
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
|
|
73
|
-
assert response is not None
|
|
74
|
-
assert 'content' in response
|
|
75
|
-
assert 'metadata' in response
|
|
76
|
-
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
|
77
|
-
|
|
78
|
-
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
|
|
79
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
80
|
-
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
|
|
81
|
-
time.sleep(6) # wait for 6 seconds
|
|
82
|
-
assert response is not None
|
|
83
|
-
assert 'content' in response
|
|
84
|
-
assert 'metadata' in response
|
|
85
|
-
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
|
86
|
-
|
|
87
|
-
def test_crawl_url_invalid_api_key():
|
|
88
|
-
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
|
89
|
-
with pytest.raises(Exception) as excinfo:
|
|
90
|
-
invalid_app.crawl_url('https://firecrawl.dev')
|
|
91
|
-
assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
|
92
|
-
|
|
93
|
-
# def test_should_return_error_for_blocklisted_url():
|
|
94
|
-
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
95
|
-
# blocklisted_url = "https://twitter.com/fake-test"
|
|
96
|
-
# with pytest.raises(Exception) as excinfo:
|
|
97
|
-
# app.crawl_url(blocklisted_url)
|
|
98
|
-
# assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
|
99
|
-
|
|
100
|
-
def test_crawl_url_wait_for_completion_e2e():
|
|
101
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
102
|
-
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
|
|
103
|
-
assert response is not None
|
|
104
|
-
assert len(response) > 0
|
|
105
|
-
assert 'content' in response[0]
|
|
106
|
-
assert "_Roast_" in response[0]['content']
|
|
107
|
-
|
|
108
|
-
def test_crawl_url_with_idempotency_key_e2e():
|
|
109
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
110
|
-
uniqueIdempotencyKey = str(uuid4())
|
|
111
|
-
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
|
|
112
|
-
assert response is not None
|
|
113
|
-
assert len(response) > 0
|
|
114
|
-
assert 'content' in response[0]
|
|
115
|
-
assert "_Roast_" in response[0]['content']
|
|
116
|
-
|
|
117
|
-
with pytest.raises(Exception) as excinfo:
|
|
118
|
-
app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
|
|
119
|
-
assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
|
|
120
|
-
|
|
121
|
-
def test_check_crawl_status_e2e():
|
|
122
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
123
|
-
response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
|
|
124
|
-
assert response is not None
|
|
125
|
-
assert 'jobId' in response
|
|
126
|
-
|
|
127
|
-
time.sleep(30) # wait for 30 seconds
|
|
128
|
-
status_response = app.check_crawl_status(response['jobId'])
|
|
129
|
-
assert status_response is not None
|
|
130
|
-
assert 'status' in status_response
|
|
131
|
-
assert status_response['status'] == 'completed'
|
|
132
|
-
assert 'data' in status_response
|
|
133
|
-
assert len(status_response['data']) > 0
|
|
134
|
-
|
|
135
|
-
def test_search_e2e():
|
|
136
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
137
|
-
response = app.search("test query")
|
|
138
|
-
assert response is not None
|
|
139
|
-
assert 'content' in response[0]
|
|
140
|
-
assert len(response) > 2
|
|
141
|
-
|
|
142
|
-
def test_search_invalid_api_key():
|
|
143
|
-
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
|
144
|
-
with pytest.raises(Exception) as excinfo:
|
|
145
|
-
invalid_app.search("test query")
|
|
146
|
-
assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
|
147
|
-
|
|
148
|
-
def test_llm_extraction():
|
|
149
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
150
|
-
response = app.scrape_url("https://firecrawl.dev", {
|
|
151
|
-
'extractorOptions': {
|
|
152
|
-
'mode': 'llm-extraction',
|
|
153
|
-
'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
|
154
|
-
'extractionSchema': {
|
|
155
|
-
'type': 'object',
|
|
156
|
-
'properties': {
|
|
157
|
-
'company_mission': {'type': 'string'},
|
|
158
|
-
'supports_sso': {'type': 'boolean'},
|
|
159
|
-
'is_open_source': {'type': 'boolean'}
|
|
160
|
-
},
|
|
161
|
-
'required': ['company_mission', 'supports_sso', 'is_open_source']
|
|
162
|
-
}
|
|
163
|
-
}
|
|
164
|
-
})
|
|
165
|
-
assert response is not None
|
|
166
|
-
assert 'llm_extraction' in response
|
|
167
|
-
llm_extraction = response['llm_extraction']
|
|
168
|
-
assert 'company_mission' in llm_extraction
|
|
169
|
-
assert isinstance(llm_extraction['supports_sso'], bool)
|
|
170
|
-
assert isinstance(llm_extraction['is_open_source'], bool)
|
|
File without changes
|