firecrawl 2.5.3__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +1 -1
- firecrawl/firecrawl.py +28 -1
- {firecrawl-2.5.3.dist-info → firecrawl-2.6.0.dist-info}/LICENSE +0 -0
- {firecrawl-2.5.3.dist-info → firecrawl-2.6.0.dist-info}/METADATA +1 -1
- firecrawl-2.6.0.dist-info/RECORD +12 -0
- {firecrawl-2.5.3.dist-info → firecrawl-2.6.0.dist-info}/top_level.txt +0 -2
- build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4439
- build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/firecrawl/firecrawl.py +0 -4439
- build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/firecrawl/__init__.py +0 -79
- build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/firecrawl/firecrawl.py +0 -4439
- build/lib/tests/test_change_tracking.py +0 -98
- firecrawl-2.5.3.dist-info/RECORD +0 -33
- {firecrawl-2.5.3.dist-info → firecrawl-2.6.0.dist-info}/WHEEL +0 -0
firecrawl/__init__.py
CHANGED
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
 
-__version__ = "2.5.3"
+__version__ = "2.6.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/firecrawl.py
CHANGED
@@ -161,7 +161,7 @@ class ScrapeOptions(pydantic.BaseModel):
 class WaitAction(pydantic.BaseModel):
     """Wait action to perform during scraping."""
     type: Literal["wait"]
-    milliseconds: int
+    milliseconds: Optional[int] = None
     selector: Optional[str] = None
 
 class ScreenshotAction(pydantic.BaseModel):
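WaitAction.milliseconds is now optional, so a wait step can be driven by a CSS selector alone instead of a fixed pause. A minimal sketch of both forms; the import path and selector are assumptions for illustration, not taken from this diff:

from firecrawl.firecrawl import WaitAction

# Fixed pause: wait 2000 ms during the scrape.
wait_fixed = WaitAction(type="wait", milliseconds=2000)

# Selector-based wait: valid in 2.6.0, but a pydantic
# validation error in 2.5.3, where milliseconds was required.
wait_for_element = WaitAction(type="wait", selector="#content")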
@@ -259,6 +259,7 @@ class CrawlParams(pydantic.BaseModel):
     deduplicateSimilarURLs: Optional[bool] = None
     ignoreQueryParameters: Optional[bool] = None
     regexOnFullURL: Optional[bool] = None
+    delay: Optional[int] = None # Delay in seconds between scrapes
 
 class CrawlResponse(pydantic.BaseModel):
     """Response from crawling operations."""

@@ -346,6 +347,7 @@ class GenerateLLMsTextParams(pydantic.BaseModel):
     """
     maxUrls: Optional[int] = 10
     showFullText: Optional[bool] = False
+    cache: Optional[bool] = True
     __experimental_stream: Optional[bool] = None
 
 class DeepResearchParams(pydantic.BaseModel):

@@ -681,6 +683,7 @@ class FirecrawlApp:
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs

@@ -703,6 +706,7 @@ class FirecrawlApp:
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API

@@ -748,6 +752,8 @@ class FirecrawlApp:
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
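The new delay keyword is copied into the request body as crawl_params['delay'] only when it is set, so existing calls are unaffected. A minimal usage sketch, assuming a placeholder API key and URL:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# Pause roughly 2 seconds between page scrapes to throttle the
# crawl; poll_interval still controls how often status is checked.
result = app.crawl_url(
    "https://example.com",
    delay=2,
    poll_interval=5,
)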
@@ -788,6 +794,7 @@ class FirecrawlApp:
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> CrawlResponse:

@@ -854,6 +861,8 @@ class FirecrawlApp:
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
@@ -1862,6 +1871,7 @@ class FirecrawlApp:
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
         """
         Generate LLMs.txt for a given URL and poll until completion.

@@ -1870,6 +1880,7 @@ class FirecrawlApp:
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:

@@ -1885,6 +1896,7 @@ class FirecrawlApp:
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
 
@@ -1892,6 +1904,7 @@ class FirecrawlApp:
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
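cache defaults to True in GenerateLLMsTextParams, so generation reuses previously processed content unless explicitly disabled. A sketch of forcing a fresh run; the key and URL are placeholders:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# cache=False bypasses cached results and re-processes the site;
# omit it to keep the default of True.
status = app.generate_llms_text("https://example.com", max_urls=5, cache=False)
print(status)  # GenerateLLMsTextStatusResponse; inspect for the generated llms.txt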
@@ -1927,6 +1940,7 @@ class FirecrawlApp:
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation operation.

@@ -1935,6 +1949,7 @@ class FirecrawlApp:
             url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:

@@ -1949,6 +1964,7 @@ class FirecrawlApp:
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
@@ -3240,6 +3256,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs

@@ -3262,6 +3279,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API

@@ -3307,6 +3325,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)

@@ -3348,6 +3368,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs

@@ -3412,6 +3433,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
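AsyncFirecrawlApp receives the same delay plumbing. A sketch assuming the async client mirrors the sync signature, as the hunks above suggest; the key and URL are placeholders:

import asyncio
from firecrawl import AsyncFirecrawlApp

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    # Same semantics as the sync client: roughly one scrape
    # every 3 seconds while the crawl job runs.
    result = await app.crawl_url("https://example.com", delay=3)
    print(result)

asyncio.run(main())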
@@ -3986,6 +4009,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
         if not response.get('success') or 'id' not in response:

@@ -4012,6 +4036,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation job without waiting for completion.

@@ -4020,6 +4045,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:

@@ -4042,6 +4068,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
File without changes
firecrawl-2.6.0.dist-info/RECORD
ADDED
@@ -0,0 +1,12 @@
+firecrawl/__init__.py,sha256=bds9ny9yl_8sXYAmfjaVd_32uf8Qm0ZPbZ2V-LGTgGQ,2612
+firecrawl/firecrawl.py,sha256=Y_8gS4vBPQZ_LP8UXdZAOEMEmK0kRv88Jst2dZ3VW8c,190089
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
+tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+firecrawl-2.6.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-2.6.0.dist-info/METADATA,sha256=gOm5xIZTdtTxAIVZbjZFf5ALWIrIOGpYCFqQ9zW4PAU,7165
+firecrawl-2.6.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-2.6.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl-2.6.0.dist-info/RECORD,,
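Each RECORD row is path,sha256=<digest>,<size>, where the digest is the urlsafe base64 of the file's SHA-256 with trailing '=' padding stripped (the standard wheel RECORD encoding). A small sketch for reproducing a row from an installed file; the helper name is ours:

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Render a file as a wheel RECORD row: path,sha256=<digest>,<size>."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
    return f"{path},sha256={digest.rstrip(b'=').decode()},{len(data)}"

# record_entry("firecrawl/__init__.py") should reproduce the first
# row above for an unmodified 2.6.0 install.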
build/lib/build/lib/build/lib/firecrawl/__init__.py
DELETED
@@ -1,79 +0,0 @@
-"""
-This is the Firecrawl package.
-
-This package provides a Python SDK for interacting with the Firecrawl API.
-It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
-and check the status of these jobs.
-
-For more information visit https://github.com/firecrawl/
-"""
-
-import logging
-import os
-
-from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
-
-__version__ = "2.5.3"
-
-# Define the logger for the Firecrawl project
-logger: logging.Logger = logging.getLogger("firecrawl")
-
-
-def _configure_logger() -> None:
-    """
-    Configure the firecrawl logger for console output.
-
-    The function attaches a handler for console output with a specific format and date
-    format to the firecrawl logger.
-    """
-    try:
-        # Create the formatter
-        formatter = logging.Formatter(
-            "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
-            datefmt="%Y-%m-%d %H:%M:%S",
-        )
-
-        # Create the console handler and set the formatter
-        console_handler = logging.StreamHandler()
-        console_handler.setFormatter(formatter)
-
-        # Add the console handler to the firecrawl logger
-        logger.addHandler(console_handler)
-    except Exception as e:
-        logger.error("Failed to configure logging: %s", e)
-
-
-def setup_logging() -> None:
-    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
-    # Check if the firecrawl logger already has a handler
-    if logger.hasHandlers():
-        return  # To prevent duplicate logging
-
-    # Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
-    if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
-        # Attach a no-op handler to prevent warnings about no handlers
-        logger.addHandler(logging.NullHandler())
-        return
-
-    # Attach the console handler to the firecrawl logger
-    _configure_logger()
-
-    # Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
-    if env == "DEBUG":
-        logger.setLevel(logging.DEBUG)
-    elif env == "INFO":
-        logger.setLevel(logging.INFO)
-    elif env == "WARNING":
-        logger.setLevel(logging.WARNING)
-    elif env == "ERROR":
-        logger.setLevel(logging.ERROR)
-    elif env == "CRITICAL":
-        logger.setLevel(logging.CRITICAL)
-    else:
-        logger.setLevel(logging.INFO)
-        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
-
-
-# Initialize logging configuration when the module is imported
-setup_logging()
-logger.debug("Debugging logger setup")
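The removed build/lib copy above is a stale duplicate of the shipped firecrawl/__init__.py, which still wires logging the same way: setup_logging() runs at import time and reads FIRECRAWL_LOGGING_LEVEL. A usage sketch:

import os

# Must be set before the first `import firecrawl`, because
# setup_logging() runs when the module is imported.
os.environ["FIRECRAWL_LOGGING_LEVEL"] = "DEBUG"

import firecrawl  # noqa: E402  (deliberately imported after env setup)

firecrawl.logger.debug("firecrawl logger is now at DEBUG level")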
File without changes
build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py
DELETED
@@ -1,170 +0,0 @@
-import importlib.util
-import pytest
-import time
-import os
-from uuid import uuid4
-from dotenv import load_dotenv
-
-load_dotenv()
-
-API_URL = "http://127.0.0.1:3002"
-ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
-TEST_API_KEY = os.getenv('TEST_API_KEY')
-
-print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
-
-spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
-firecrawl = importlib.util.module_from_spec(spec)
-spec.loader.exec_module(firecrawl)
-FirecrawlApp = firecrawl.FirecrawlApp
-
-def test_no_api_key():
-    with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
-    assert "No API key provided" in str(excinfo.value)
-
-def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_blocklisted_url():
-#     blocklisted_url = "https://facebook.com/fake-test"
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     with pytest.raises(Exception) as excinfo:
-#         app.scrape_url(blocklisted_url)
-#     assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_successful_response_with_valid_preview_token():
-    app = FirecrawlApp(api_url=API_URL, api_key=os.getenv('PREVIEW_TOKEN'), version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    assert response is not None
-    assert 'content' in response
-    assert "_Roast_" in response['content']
-
-def test_scrape_url_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    print(response)
-
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'metadata' in response
-    assert 'html' not in response
-    assert "_Roast_" in response['content']
-
-def test_successful_response_with_valid_api_key_and_include_html():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'html' in response
-    assert 'metadata' in response
-    assert "_Roast_" in response['content']
-    assert "_Roast_" in response['markdown']
-    assert "<h1" in response['html']
-
-def test_successful_response_for_valid_scrape_with_pdf_file():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
-    time.sleep(6) # wait for 6 seconds
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_crawl_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_should_return_error_for_blocklisted_url():
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     blocklisted_url = "https://twitter.com/fake-test"
-#     with pytest.raises(Exception) as excinfo:
-#         app.crawl_url(blocklisted_url)
-#     assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_crawl_url_wait_for_completion_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-def test_crawl_url_with_idempotency_key_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    uniqueIdempotencyKey = str(uuid4())
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-    with pytest.raises(Exception) as excinfo:
-        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
-
-def test_check_crawl_status_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
-    assert response is not None
-    assert 'jobId' in response
-
-    time.sleep(30) # wait for 30 seconds
-    status_response = app.check_crawl_status(response['jobId'])
-    assert status_response is not None
-    assert 'status' in status_response
-    assert status_response['status'] == 'completed'
-    assert 'data' in status_response
-    assert len(status_response['data']) > 0
-
-def test_search_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.search("test query")
-    assert response is not None
-    assert 'content' in response[0]
-    assert len(response) > 2
-
-def test_search_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.search("test query")
-    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-def test_llm_extraction():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url("https://firecrawl.dev", {
-        'extractorOptions': {
-            'mode': 'llm-extraction',
-            'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
-            'extractionSchema': {
-                'type': 'object',
-                'properties': {
-                    'company_mission': {'type': 'string'},
-                    'supports_sso': {'type': 'boolean'},
-                    'is_open_source': {'type': 'boolean'}
-                },
-                'required': ['company_mission', 'supports_sso', 'is_open_source']
-            }
-        }
-    })
-    assert response is not None
-    assert 'llm_extraction' in response
-    llm_extraction = response['llm_extraction']
-    assert 'company_mission' in llm_extraction
-    assert isinstance(llm_extraction['supports_sso'], bool)
-    assert isinstance(llm_extraction['is_open_source'], bool)
File without changes