firecrawl 2.5.3__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl might be problematic.

Files changed (29)
  1. firecrawl/__init__.py +1 -1
  2. firecrawl/firecrawl.py +28 -1
  3. {firecrawl-2.5.3.dist-info → firecrawl-2.6.0.dist-info}/LICENSE +0 -0
  4. {firecrawl-2.5.3.dist-info → firecrawl-2.6.0.dist-info}/METADATA +1 -1
  5. firecrawl-2.6.0.dist-info/RECORD +12 -0
  6. {firecrawl-2.5.3.dist-info → firecrawl-2.6.0.dist-info}/top_level.txt +0 -2
  7. build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  8. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  9. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  10. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  11. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  12. build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4439
  13. build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  14. build/lib/build/lib/firecrawl/__init__.py +0 -79
  15. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  16. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  17. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  18. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  19. build/lib/build/lib/firecrawl/firecrawl.py +0 -4439
  20. build/lib/build/lib/tests/test_change_tracking.py +0 -98
  21. build/lib/firecrawl/__init__.py +0 -79
  22. build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  23. build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  24. build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  25. build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  26. build/lib/firecrawl/firecrawl.py +0 -4439
  27. build/lib/tests/test_change_tracking.py +0 -98
  28. firecrawl-2.5.3.dist-info/RECORD +0 -33
  29. {firecrawl-2.5.3.dist-info → firecrawl-2.6.0.dist-info}/WHEEL +0 -0
firecrawl/__init__.py CHANGED
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
 
-__version__ = "2.5.3"
+__version__ = "2.6.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
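The only substantive change above is the version constant. A quick post-upgrade sanity check (illustrative, not part of the diff):

import firecrawl

print(firecrawl.__version__)  # "2.6.0" once the new wheel is installed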
firecrawl/firecrawl.py CHANGED
@@ -161,7 +161,7 @@ class ScrapeOptions(pydantic.BaseModel):
 class WaitAction(pydantic.BaseModel):
     """Wait action to perform during scraping."""
     type: Literal["wait"]
-    milliseconds: int
+    milliseconds: Optional[int] = None
     selector: Optional[str] = None
 
 class ScreenshotAction(pydantic.BaseModel):
@@ -259,6 +259,7 @@ class CrawlParams(pydantic.BaseModel):
     deduplicateSimilarURLs: Optional[bool] = None
     ignoreQueryParameters: Optional[bool] = None
     regexOnFullURL: Optional[bool] = None
+    delay: Optional[int] = None # Delay in seconds between scrapes
 
 class CrawlResponse(pydantic.BaseModel):
     """Response from crawling operations."""
@@ -346,6 +347,7 @@ class GenerateLLMsTextParams(pydantic.BaseModel):
     """
     maxUrls: Optional[int] = 10
     showFullText: Optional[bool] = False
+    cache: Optional[bool] = True
     __experimental_stream: Optional[bool] = None
 
 class DeepResearchParams(pydantic.BaseModel):
@@ -681,6 +683,7 @@ class FirecrawlApp:
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -703,6 +706,7 @@ class FirecrawlApp:
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -748,6 +752,8 @@ class FirecrawlApp:
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
@@ -788,6 +794,7 @@ class FirecrawlApp:
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> CrawlResponse:
@@ -854,6 +861,8 @@ class FirecrawlApp:
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
@@ -1862,6 +1871,7 @@ class FirecrawlApp:
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
         """
         Generate LLMs.txt for a given URL and poll until completion.
@@ -1870,6 +1880,7 @@
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:
@@ -1885,6 +1896,7 @@
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
 
@@ -1892,6 +1904,7 @@
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
 
@@ -1927,6 +1940,7 @@
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation operation.
@@ -1935,6 +1949,7 @@
             url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:
@@ -1949,6 +1964,7 @@
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
 
@@ -3240,6 +3256,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -3262,6 +3279,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -3307,6 +3325,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
@@ -3348,6 +3368,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -3412,6 +3433,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
@@ -3986,6 +4009,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url,
             max_urls=max_urls,
             show_full_text=show_full_text,
+            cache=cache,
             experimental_stream=experimental_stream
         )
         if not response.get('success') or 'id' not in response:
@@ -4012,6 +4036,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         *,
         max_urls: Optional[int] = None,
         show_full_text: Optional[bool] = None,
+        cache: Optional[bool] = None,
         experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation job without waiting for completion.
@@ -4020,6 +4045,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             url (str): Target URL to generate LLMs.txt from
             max_urls (Optional[int]): Maximum URLs to process (default: 10)
             show_full_text (Optional[bool]): Include full text in output (default: False)
+            cache (Optional[bool]): Whether to use cached content if available (default: True)
             experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:
@@ -4042,6 +4068,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         params = GenerateLLMsTextParams(
             maxUrls=max_urls,
             showFullText=show_full_text,
+            cache=cache,
             __experimental_stream=experimental_stream
         )
 
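Taken together, these hunks make WaitAction.milliseconds optional, add a delay option (seconds between scrapes) to the sync and async crawl entry points, and add a cache toggle (default True) to the LLMs.txt generators. A minimal sketch of the new surface, assuming a valid key in the FIRECRAWL_API_KEY environment variable; the method names crawl_url and generate_llms_text are assumptions, since the def lines fall outside the hunks shown above:

from firecrawl import FirecrawlApp
from firecrawl.firecrawl import WaitAction

# FirecrawlApp() falls back to the FIRECRAWL_API_KEY environment variable
# when no api_key argument is given.
app = FirecrawlApp()

# New in 2.6.0: delay in seconds between scrapes, forwarded verbatim as
# crawl_params['delay'].
crawl_status = app.crawl_url("https://example.com", delay=2)

# New in 2.6.0: cache defaults to True in GenerateLLMsTextParams; pass
# False to bypass cached content.
llms_text = app.generate_llms_text("https://example.com", max_urls=5, cache=False)

# WaitAction.milliseconds is now Optional, so a wait can be expressed by a
# selector alone instead of a fixed duration.
wait = WaitAction(type="wait", selector="#content")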
{firecrawl-2.5.3.dist-info → firecrawl-2.6.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: firecrawl
-Version: 2.5.3
+Version: 2.6.0
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai
firecrawl-2.6.0.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+firecrawl/__init__.py,sha256=bds9ny9yl_8sXYAmfjaVd_32uf8Qm0ZPbZ2V-LGTgGQ,2612
+firecrawl/firecrawl.py,sha256=Y_8gS4vBPQZ_LP8UXdZAOEMEmK0kRv88Jst2dZ3VW8c,190089
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
+tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+firecrawl-2.6.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-2.6.0.dist-info/METADATA,sha256=gOm5xIZTdtTxAIVZbjZFf5ALWIrIOGpYCFqQ9zW4PAU,7165
+firecrawl-2.6.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-2.6.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl-2.6.0.dist-info/RECORD,,
{firecrawl-2.5.3.dist-info → firecrawl-2.6.0.dist-info}/top_level.txt CHANGED
@@ -1,4 +1,2 @@
-build
-dist
 firecrawl
 tests
build/lib/build/lib/build/lib/firecrawl/__init__.py DELETED
@@ -1,79 +0,0 @@
-"""
-This is the Firecrawl package.
-
-This package provides a Python SDK for interacting with the Firecrawl API.
-It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
-and check the status of these jobs.
-
-For more information visit https://github.com/firecrawl/
-"""
-
-import logging
-import os
-
-from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
-
-__version__ = "2.5.3"
-
-# Define the logger for the Firecrawl project
-logger: logging.Logger = logging.getLogger("firecrawl")
-
-
-def _configure_logger() -> None:
-    """
-    Configure the firecrawl logger for console output.
-
-    The function attaches a handler for console output with a specific format and date
-    format to the firecrawl logger.
-    """
-    try:
-        # Create the formatter
-        formatter = logging.Formatter(
-            "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
-            datefmt="%Y-%m-%d %H:%M:%S",
-        )
-
-        # Create the console handler and set the formatter
-        console_handler = logging.StreamHandler()
-        console_handler.setFormatter(formatter)
-
-        # Add the console handler to the firecrawl logger
-        logger.addHandler(console_handler)
-    except Exception as e:
-        logger.error("Failed to configure logging: %s", e)
-
-
-def setup_logging() -> None:
-    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
-    # Check if the firecrawl logger already has a handler
-    if logger.hasHandlers():
-        return # To prevent duplicate logging
-
-    # Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
-    if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
-        # Attach a no-op handler to prevent warnings about no handlers
-        logger.addHandler(logging.NullHandler())
-        return
-
-    # Attach the console handler to the firecrawl logger
-    _configure_logger()
-
-    # Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
-    if env == "DEBUG":
-        logger.setLevel(logging.DEBUG)
-    elif env == "INFO":
-        logger.setLevel(logging.INFO)
-    elif env == "WARNING":
-        logger.setLevel(logging.WARNING)
-    elif env == "ERROR":
-        logger.setLevel(logging.ERROR)
-    elif env == "CRITICAL":
-        logger.setLevel(logging.CRITICAL)
-    else:
-        logger.setLevel(logging.INFO)
-        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
-
-
-# Initialize logging configuration when the module is imported
-setup_logging()
-logger.debug("Debugging logger setup")
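This deleted file is a stale build copy of the package __init__.py shown in the first diff; the logging behaviour it implements still ships in the wheel, driven by the FIRECRAWL_LOGGING_LEVEL environment variable read once at import time. A small sketch of enabling debug output (illustrative; the variable must be set before the first import, since setup_logging() runs when the module loads):

import os

# Must happen before the first import of firecrawl.
os.environ["FIRECRAWL_LOGGING_LEVEL"] = "DEBUG"  # DEBUG/INFO/WARNING/ERROR/CRITICAL

import firecrawl  # setup_logging() attaches the console handler here

firecrawl.logger.debug("firecrawl debug logging is active")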
build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py DELETED
@@ -1,170 +0,0 @@
-import importlib.util
-import pytest
-import time
-import os
-from uuid import uuid4
-from dotenv import load_dotenv
-
-load_dotenv()
-
-API_URL = "http://127.0.0.1:3002"
-ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
-TEST_API_KEY = os.getenv('TEST_API_KEY')
-
-print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
-
-spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
-firecrawl = importlib.util.module_from_spec(spec)
-spec.loader.exec_module(firecrawl)
-FirecrawlApp = firecrawl.FirecrawlApp
-
-def test_no_api_key():
-    with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
-    assert "No API key provided" in str(excinfo.value)
-
-def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_blocklisted_url():
-#     blocklisted_url = "https://facebook.com/fake-test"
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     with pytest.raises(Exception) as excinfo:
-#         app.scrape_url(blocklisted_url)
-#     assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_successful_response_with_valid_preview_token():
-    app = FirecrawlApp(api_url=API_URL, api_key=os.getenv('PREVIEW_TOKEN'), version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    assert response is not None
-    assert 'content' in response
-    assert "_Roast_" in response['content']
-
-def test_scrape_url_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    print(response)
-
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'metadata' in response
-    assert 'html' not in response
-    assert "_Roast_" in response['content']
-
-def test_successful_response_with_valid_api_key_and_include_html():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'html' in response
-    assert 'metadata' in response
-    assert "_Roast_" in response['content']
-    assert "_Roast_" in response['markdown']
-    assert "<h1" in response['html']
-
-def test_successful_response_for_valid_scrape_with_pdf_file():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
-    time.sleep(6) # wait for 6 seconds
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_crawl_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_should_return_error_for_blocklisted_url():
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     blocklisted_url = "https://twitter.com/fake-test"
-#     with pytest.raises(Exception) as excinfo:
-#         app.crawl_url(blocklisted_url)
-#     assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_crawl_url_wait_for_completion_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-def test_crawl_url_with_idempotency_key_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    uniqueIdempotencyKey = str(uuid4())
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-    with pytest.raises(Exception) as excinfo:
-        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
-
-def test_check_crawl_status_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
-    assert response is not None
-    assert 'jobId' in response
-
-    time.sleep(30) # wait for 30 seconds
-    status_response = app.check_crawl_status(response['jobId'])
-    assert status_response is not None
-    assert 'status' in status_response
-    assert status_response['status'] == 'completed'
-    assert 'data' in status_response
-    assert len(status_response['data']) > 0
-
-def test_search_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.search("test query")
-    assert response is not None
-    assert 'content' in response[0]
-    assert len(response) > 2
-
-def test_search_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.search("test query")
-    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-def test_llm_extraction():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url("https://firecrawl.dev", {
-        'extractorOptions': {
-            'mode': 'llm-extraction',
-            'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
-            'extractionSchema': {
-                'type': 'object',
-                'properties': {
-                    'company_mission': {'type': 'string'},
-                    'supports_sso': {'type': 'boolean'},
-                    'is_open_source': {'type': 'boolean'}
-                },
-                'required': ['company_mission', 'supports_sso', 'is_open_source']
-            }
-        }
-    })
-    assert response is not None
-    assert 'llm_extraction' in response
-    llm_extraction = response['llm_extraction']
-    assert 'company_mission' in llm_extraction
-    assert isinstance(llm_extraction['supports_sso'], bool)
-    assert isinstance(llm_extraction['is_open_source'], bool)