firecrawl 0.0.19__tar.gz → 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl might be problematic.

@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Sideguide Technologies Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -1,12 +1,12 @@
  Metadata-Version: 2.1
  Name: firecrawl
- Version: 0.0.19
+ Version: 1.1.1
  Summary: Python SDK for Firecrawl API
  Home-page: https://github.com/mendableai/firecrawl
  Author: Mendable.ai
  Author-email: "Mendable.ai" <nick@mendable.ai>
  Maintainer-email: "Mendable.ai" <nick@mendable.ai>
- License: GNU General Public License v3 (GPLv3)
+ License: GNU Affero General Public License v3 (AGPLv3)
  Project-URL: Documentation, https://docs.firecrawl.dev
  Project-URL: Source, https://github.com/mendableai/firecrawl
  Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
@@ -30,9 +30,14 @@ Classifier: Topic :: Software Development :: Libraries
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Topic :: Text Processing
  Classifier: Topic :: Text Processing :: Indexing
- Requires-Python: >=3.7
+ Requires-Python: >=3.8
  Description-Content-Type: text/markdown
+ License-File: LICENSE
  Requires-Dist: requests
+ Requires-Dist: python-dotenv
+ Requires-Dist: websockets
+ Requires-Dist: asyncio
+ Requires-Dist: nest-asyncio
 
  # Firecrawl Python SDK
 
@@ -51,27 +56,31 @@ pip install firecrawl-py
  1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
  2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
 
-
  Here's an example of how to use the SDK:
 
  ```python
- from firecrawl import FirecrawlApp
-
- # Initialize the FirecrawlApp with your API key
- app = FirecrawlApp(api_key='your_api_key')
-
- # Scrape a single URL
- url = 'https://mendable.ai'
- scraped_data = app.scrape_url(url)
-
- # Crawl a website
- crawl_url = 'https://mendable.ai'
- params = {
-     'pageOptions': {
-         'onlyMainContent': True
-     }
- }
- crawl_result = app.crawl_url(crawl_url, params=params)
+ from firecrawl.firecrawl import FirecrawlApp
+
+ app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+
+ # Scrape a website:
+ scrape_status = app.scrape_url(
+     'https://firecrawl.dev',
+     params={'formats': ['markdown', 'html']}
+ )
+ print(scrape_status)
+
+ # Crawl a website:
+ crawl_status = app.crawl_url(
+     'https://firecrawl.dev',
+     params={
+         'limit': 100,
+         'scrapeOptions': {'formats': ['markdown', 'html']}
+     },
+     wait_until_done=True,
+     poll_interval=30
+ )
+ print(crawl_status)
  ```
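Since `python-dotenv` is now a declared dependency and the README says the key can be supplied via the `FIRECRAWL_API_KEY` environment variable, initialization can also be sketched as follows (loading from a local `.env` file is an assumption for illustration, not something shown in this diff):

```python
import os

from dotenv import load_dotenv  # python-dotenv, newly listed in Requires-Dist
from firecrawl.firecrawl import FirecrawlApp

load_dotenv()  # pulls FIRECRAWL_API_KEY from a local .env file, if present

# Passing the key explicitly; per the README, FIRECRAWL_API_KEY can also be
# set in the environment instead of being passed as a parameter.
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
```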
 
  ### Scraping a URL
@@ -82,6 +91,7 @@ To scrape a single URL, use the `scrape_url` method. It takes the URL as a param
  url = 'https://example.com'
  scraped_data = app.scrape_url(url)
  ```
+
  ### Extracting structured data from a URL
 
  With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how to use it:
@@ -89,7 +99,7 @@ With LLM extraction, you can easily extract structured data from any URL. We sup
  ```python
  class ArticleSchema(BaseModel):
      title: str
-     points: int
+     points: int
      by: str
      commentsURL: str
 
@@ -108,45 +118,77 @@ data = app.scrape_url('https://news.ycombinator.com', {
  print(data["llm_extraction"])
  ```
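The schema example above assumes the usual pydantic imports, which the snippet itself omits; a minimal preamble would look like:

```python
from typing import List

# BaseModel and Field are the pydantic names the example relies on
from pydantic import BaseModel, Field
```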
 
- ### Search for a query
+ ### Crawling a Website
 
- Used to search the web, get the most relevant results, scrap each page and return the markdown.
+ To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
 
  ```python
- query = 'what is mendable?'
- search_result = app.search(query)
+ idempotency_key = str(uuid.uuid4()) # optional idempotency key
+ crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
+ print(crawl_result)
  ```
 
- ### Crawling a Website
-
- To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+ ### Crawling a Website Asynchronously
 
- The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
+ To crawl a website asynchronously, use the `async_crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
 
  ```python
- crawl_url = 'https://example.com'
- params = {
-     'crawlerOptions': {
-         'excludes': ['blog/*'],
-         'includes': [], # leave empty for all pages
-         'limit': 1000,
-     },
-     'pageOptions': {
-         'onlyMainContent': True
-     }
- }
- crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
+ crawl_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
+ print(crawl_result)
  ```
 
- If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
-
  ### Checking Crawl Status
 
  To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
 
  ```python
- job_id = crawl_result['jobId']
- status = app.check_crawl_status(job_id)
+ id = crawl_result['id']
+ status = app.check_crawl_status(id)
+ ```
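For jobs started with `async_crawl_url`, the status check above can be wrapped in a simple polling loop. A minimal sketch, assuming the returned dict exposes `status` and `data` fields (as the `done` WebSocket handler and the e2e tests in this diff suggest):

```python
import time

# Start a crawl without waiting, then poll until it finishes.
crawl_job = app.async_crawl_url('https://firecrawl.dev', {'limit': 10})

while True:
    status = app.check_crawl_status(crawl_job['id'])
    # 'status' values 'completed'/'failed' are assumptions based on the
    # WebSocket 'done' handler shown later in this README.
    if status.get('status') in ('completed', 'failed'):
        break
    time.sleep(5)  # poll interval, chosen arbitrarily

print(len(status.get('data', [])))  # scraped documents, per the tests' 'data' key
```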
+
+ ### Map a Website
+
+ Use `map_url` to generate a list of URLs from a website. The `params` argument lets you customize the mapping process, including options to exclude subdomains or to utilize the sitemap.
+
+ ```python
+ # Map a website:
+ map_result = app.map_url('https://example.com')
+ print(map_result)
+ ```
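To illustrate the customization the paragraph above mentions, here is a hedged sketch; the option names are assumptions for illustration and are not confirmed by this diff (check the Firecrawl docs for the actual keys):

```python
# Option names here ('includeSubdomains', 'ignoreSitemap') are assumed for
# illustration only -- consult https://docs.firecrawl.dev for the real keys.
map_result = app.map_url(
    'https://example.com',
    params={
        'includeSubdomains': False,  # assumed key: exclude subdomains
        'ignoreSitemap': False,      # assumed key: allow sitemap discovery
    },
)
print(map_result)
```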
+
+ ### Crawl a website with WebSockets
+
+ To crawl a website with WebSockets, use the `crawl_url_and_watch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+
+ ```python
+ # inside an async function...
+ nest_asyncio.apply()
+
+ # Define event handlers
+ def on_document(detail):
+     print("DOC", detail)
+
+ def on_error(detail):
+     print("ERR", detail['error'])
+
+ def on_done(detail):
+     print("DONE", detail['status'])
+
+ # Function to start the crawl and watch process
+ async def start_crawl_and_watch():
+     # Initiate the crawl job and get the watcher
+     watcher = app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 })
+
+     # Add event listeners
+     watcher.add_event_listener("document", on_document)
+     watcher.add_event_listener("error", on_error)
+     watcher.add_event_listener("done", on_done)
+
+     # Start the watcher
+     await watcher.connect()
+
+ # Run the event loop
+ await start_crawl_and_watch()
  ```
 
  ## Error Handling
@@ -162,20 +204,27 @@ To ensure the functionality of the Firecrawl Python SDK, we have included end-to
  To run the tests, execute the following commands:
 
  Install pytest:
+
  ```bash
  pip install pytest
  ```
 
  Run:
+
  ```bash
  pytest firecrawl/__tests__/e2e_withAuth/test.py
  ```
 
-
  ## Contributing
 
  Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
 
  ## License
 
- The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
+ The Firecrawl Python SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions:
+
+ - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details.
@@ -0,0 +1,189 @@
+ # Firecrawl Python SDK
+
+ The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
+
+ ## Installation
+
+ To install the Firecrawl Python SDK, you can use pip:
+
+ ```bash
+ pip install firecrawl-py
+ ```
+
+ ## Usage
+
+ 1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
+ 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
+
+ Here's an example of how to use the SDK:
+
+ ```python
+ from firecrawl.firecrawl import FirecrawlApp
+
+ app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+
+ # Scrape a website:
+ scrape_status = app.scrape_url(
+     'https://firecrawl.dev',
+     params={'formats': ['markdown', 'html']}
+ )
+ print(scrape_status)
+
+ # Crawl a website:
+ crawl_status = app.crawl_url(
+     'https://firecrawl.dev',
+     params={
+         'limit': 100,
+         'scrapeOptions': {'formats': ['markdown', 'html']}
+     },
+     wait_until_done=True,
+     poll_interval=30
+ )
+ print(crawl_status)
+ ```
+
+ ### Scraping a URL
+
+ To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
+
+ ```python
+ url = 'https://example.com'
+ scraped_data = app.scrape_url(url)
+ ```
+
+ ### Extracting structured data from a URL
+
+ With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how to use it:
+
+ ```python
+ class ArticleSchema(BaseModel):
+     title: str
+     points: int
+     by: str
+     commentsURL: str
+
+ class TopArticlesSchema(BaseModel):
+     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
+
+ data = app.scrape_url('https://news.ycombinator.com', {
+     'extractorOptions': {
+         'extractionSchema': TopArticlesSchema.model_json_schema(),
+         'mode': 'llm-extraction'
+     },
+     'pageOptions': {
+         'onlyMainContent': True
+     }
+ })
+ print(data["llm_extraction"])
+ ```
+
+ ### Crawling a Website
+
+ To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+
+ ```python
+ idempotency_key = str(uuid.uuid4()) # optional idempotency key
+ crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
+ print(crawl_result)
+ ```
+
+ ### Crawling a Website Asynchronously
+
+ To crawl a website asynchronously, use the `async_crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+
+ ```python
+ crawl_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
+ print(crawl_result)
+ ```
+
+ ### Checking Crawl Status
+
+ To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
+
+ ```python
+ id = crawl_result['id']
+ status = app.check_crawl_status(id)
+ ```
+
+ ### Map a Website
+
+ Use `map_url` to generate a list of URLs from a website. The `params` argument lets you customize the mapping process, including options to exclude subdomains or to utilize the sitemap.
+
+ ```python
+ # Map a website:
+ map_result = app.map_url('https://example.com')
+ print(map_result)
+ ```
+
+ ### Crawl a website with WebSockets
+
+ To crawl a website with WebSockets, use the `crawl_url_and_watch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+
+ ```python
+ # inside an async function...
+ nest_asyncio.apply()
+
+ # Define event handlers
+ def on_document(detail):
+     print("DOC", detail)
+
+ def on_error(detail):
+     print("ERR", detail['error'])
+
+ def on_done(detail):
+     print("DONE", detail['status'])
+
+ # Function to start the crawl and watch process
+ async def start_crawl_and_watch():
+     # Initiate the crawl job and get the watcher
+     watcher = app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 })
+
+     # Add event listeners
+     watcher.add_event_listener("document", on_document)
+     watcher.add_event_listener("error", on_error)
+     watcher.add_event_listener("done", on_done)
+
+     # Start the watcher
+     await watcher.connect()
+
+ # Run the event loop
+ await start_crawl_and_watch()
+ ```
+
+ ## Error Handling
+
+ The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
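A minimal sketch of catching those exceptions, assuming only what the e2e tests in this diff show (errors surface as plain `Exception`s with descriptive messages):

```python
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

try:
    result = app.scrape_url('https://example.com')
except Exception as exc:
    # Message format mirrors the tests, e.g.
    # "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token"
    print(f"Scrape failed: {exc}")
```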
+
+ ## Running the Tests with Pytest
+
+ To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling.
+
+ ### Running the Tests
+
+ To run the tests, execute the following commands:
+
+ Install pytest:
+
+ ```bash
+ pip install pytest
+ ```
+
+ Run:
+
+ ```bash
+ pytest firecrawl/__tests__/e2e_withAuth/test.py
+ ```
+
+ ## Contributing
+
+ Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
+
+ ## License
+
+ The Firecrawl Python SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions:
+
+ - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details.
@@ -13,7 +13,7 @@ import os
 
  from .firecrawl import FirecrawlApp
 
- __version__ = "0.0.19"
+ __version__ = "1.1.1"
 
  # Define the logger for the Firecrawl project
  logger: logging.Logger = logging.getLogger("firecrawl")
@@ -7,7 +7,7 @@ from dotenv import load_dotenv
 
  load_dotenv()
 
- API_URL = "http://127.0.0.1:3002";
+ API_URL = "http://127.0.0.1:3002"
  ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
  TEST_API_KEY = os.getenv('TEST_API_KEY')
 
@@ -20,32 +20,34 @@ FirecrawlApp = firecrawl.FirecrawlApp
 
  def test_no_api_key():
      with pytest.raises(Exception) as excinfo:
-         invalid_app = FirecrawlApp(api_url=API_URL)
+         invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
      assert "No API key provided" in str(excinfo.value)
 
  def test_scrape_url_invalid_api_key():
-     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
      with pytest.raises(Exception) as excinfo:
          invalid_app.scrape_url('https://firecrawl.dev')
      assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
  def test_blocklisted_url():
      blocklisted_url = "https://facebook.com/fake-test"
-     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
      with pytest.raises(Exception) as excinfo:
          app.scrape_url(blocklisted_url)
      assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
  def test_successful_response_with_valid_preview_token():
-     app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
+     app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
      response = app.scrape_url('https://roastmywebsite.ai')
      assert response is not None
      assert 'content' in response
      assert "_Roast_" in response['content']
 
  def test_scrape_url_e2e():
-     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
      response = app.scrape_url('https://roastmywebsite.ai')
+     print(response)
+
      assert response is not None
      assert 'content' in response
      assert 'markdown' in response
@@ -54,7 +56,7 @@ def test_scrape_url_e2e():
      assert "_Roast_" in response['content']
 
  def test_successful_response_with_valid_api_key_and_include_html():
-     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
      response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
      assert response is not None
      assert 'content' in response
@@ -66,7 +68,7 @@ def test_successful_response_with_valid_api_key_and_include_html():
      assert "<h1" in response['html']
 
  def test_successful_response_for_valid_scrape_with_pdf_file():
-     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
      response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
      assert response is not None
      assert 'content' in response
@@ -74,7 +76,7 @@ def test_successful_response_for_valid_scrape_with_pdf_file():
      assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
 
  def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
-     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
      response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
      time.sleep(6) # wait for 6 seconds
      assert response is not None
@@ -83,20 +85,20 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
      assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
 
  def test_crawl_url_invalid_api_key():
-     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
      with pytest.raises(Exception) as excinfo:
          invalid_app.crawl_url('https://firecrawl.dev')
      assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
  def test_should_return_error_for_blocklisted_url():
-     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
      blocklisted_url = "https://twitter.com/fake-test"
      with pytest.raises(Exception) as excinfo:
          app.crawl_url(blocklisted_url)
      assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
  def test_crawl_url_wait_for_completion_e2e():
-     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
      response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
      assert response is not None
      assert len(response) > 0
@@ -104,7 +106,7 @@ def test_crawl_url_wait_for_completion_e2e():
      assert "_Roast_" in response[0]['content']
 
  def test_crawl_url_with_idempotency_key_e2e():
-     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
      uniqueIdempotencyKey = str(uuid4())
      response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
      assert response is not None
@@ -117,7 +119,7 @@ def test_crawl_url_with_idempotency_key_e2e():
      assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
 
  def test_check_crawl_status_e2e():
-     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
      response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
      assert response is not None
      assert 'jobId' in response
@@ -131,21 +133,21 @@ def test_check_crawl_status_e2e():
      assert len(status_response['data']) > 0
 
  def test_search_e2e():
-     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
      response = app.search("test query")
      assert response is not None
      assert 'content' in response[0]
      assert len(response) > 2
 
  def test_search_invalid_api_key():
-     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+     invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
      with pytest.raises(Exception) as excinfo:
          invalid_app.search("test query")
      assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
  def test_llm_extraction():
-     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
-     response = app.scrape_url("https://mendable.ai", {
+     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
+     response = app.scrape_url("https://firecrawl.dev", {
          'extractorOptions': {
              'mode': 'llm-extraction',
              'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",