firecrawl-0.0.16.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl might be problematic.
- firecrawl-0.0.16/PKG-INFO +181 -0
- firecrawl-0.0.16/README.md +145 -0
- firecrawl-0.0.16/firecrawl/__init__.py +57 -0
- firecrawl-0.0.16/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- firecrawl-0.0.16/firecrawl/__tests__/e2e_withAuth/test.py +168 -0
- firecrawl-0.0.16/firecrawl/firecrawl.py +318 -0
- firecrawl-0.0.16/firecrawl.egg-info/PKG-INFO +181 -0
- firecrawl-0.0.16/firecrawl.egg-info/SOURCES.txt +12 -0
- firecrawl-0.0.16/firecrawl.egg-info/dependency_links.txt +1 -0
- firecrawl-0.0.16/firecrawl.egg-info/requires.txt +1 -0
- firecrawl-0.0.16/firecrawl.egg-info/top_level.txt +3 -0
- firecrawl-0.0.16/pyproject.toml +48 -0
- firecrawl-0.0.16/setup.cfg +4 -0
- firecrawl-0.0.16/setup.py +63 -0
@@ -0,0 +1,181 @@ firecrawl-0.0.16/PKG-INFO
Metadata-Version: 2.1
Name: firecrawl
Version: 0.0.16
Summary: Python SDK for Firecrawl API
Home-page: https://github.com/mendableai/firecrawl
Author: Mendable.ai
Author-email: "Mendable.ai" <nick@mendable.ai>
Maintainer-email: "Mendable.ai" <nick@mendable.ai>
License: GNU General Public License v3 (GPLv3)
Project-URL: Documentation, https://docs.firecrawl.dev
Project-URL: Source, https://github.com/mendableai/firecrawl
Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
Keywords: SDK,API,firecrawl
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Web Environment
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Natural Language :: English
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Topic :: Internet
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
Classifier: Topic :: Software Development
Classifier: Topic :: Software Development :: Libraries
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing
Classifier: Topic :: Text Processing :: Indexing
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: requests

# Firecrawl Python SDK

The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.

## Installation

To install the Firecrawl Python SDK, you can use pip:

```bash
pip install firecrawl-py
```

## Usage

1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.

Here's an example of how to use the SDK:

```python
from firecrawl import FirecrawlApp

# Initialize the FirecrawlApp with your API key
app = FirecrawlApp(api_key='your_api_key')

# Scrape a single URL
url = 'https://mendable.ai'
scraped_data = app.scrape_url(url)

# Crawl a website
crawl_url = 'https://mendable.ai'
params = {
    'pageOptions': {
        'onlyMainContent': True
    }
}
crawl_result = app.crawl_url(crawl_url, params=params)
```

### Scraping a URL

To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.

```python
url = 'https://example.com'
scraped_data = app.scrape_url(url)
```

### Extracting structured data from a URL

With LLM extraction, you can easily extract structured data from any URL. We support Pydantic schemas to make it easier for you, too. Here is how to use it:

```python
from typing import List

from pydantic import BaseModel, Field

class ArticleSchema(BaseModel):
    title: str
    points: int
    by: str
    commentsURL: str

class TopArticlesSchema(BaseModel):
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

data = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction'
    },
    'pageOptions': {
        'onlyMainContent': True
    }
})
print(data["llm_extraction"])
```

### Search for a query

Use the `search` method to search the web, get the most relevant results, scrape each page, and return the markdown.

```python
query = 'what is mendable?'
search_result = app.search(query)
```

### Crawling a Website

To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.

The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job, every `poll_interval` seconds, until it is completed or fails. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.

```python
crawl_url = 'https://example.com'
params = {
    'crawlerOptions': {
        'excludes': ['blog/*'],
        'includes': [],  # leave empty for all pages
        'limit': 1000,
    },
    'pageOptions': {
        'onlyMainContent': True
    }
}
crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, poll_interval=2)
```

If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.

### Checking Crawl Status

To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.

```python
job_id = crawl_result['jobId']
status = app.check_crawl_status(job_id)
```

## Error Handling

The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
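
For example, a failed request can be handled with an ordinary `try`/`except` block. This is a minimal sketch (assuming `app` was initialized as above); the exact message depends on the status code returned by the API:

```python
try:
    scraped_data = app.scrape_url('https://example.com')
except Exception as e:
    # The exception message carries the status code and error details, e.g.
    # "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token"
    print(f"Scrape failed: {e}")
```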

## Running the Tests with Pytest

To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling.

### Running the Tests

To run the tests, execute the following commands:

Install pytest:

```bash
pip install pytest
```

Run:

```bash
pytest firecrawl/__tests__/e2e_withAuth/test.py
```

## Contributing

Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.

## License

The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,145 @@ firecrawl-0.0.16/README.md
# Firecrawl Python SDK

The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.

## Installation

To install the Firecrawl Python SDK, you can use pip:

```bash
pip install firecrawl-py
```

## Usage

1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.

Here's an example of how to use the SDK:

```python
from firecrawl import FirecrawlApp

# Initialize the FirecrawlApp with your API key
app = FirecrawlApp(api_key='your_api_key')

# Scrape a single URL
url = 'https://mendable.ai'
scraped_data = app.scrape_url(url)

# Crawl a website
crawl_url = 'https://mendable.ai'
params = {
    'pageOptions': {
        'onlyMainContent': True
    }
}
crawl_result = app.crawl_url(crawl_url, params=params)
```

### Scraping a URL

To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.

```python
url = 'https://example.com'
scraped_data = app.scrape_url(url)
```

### Extracting structured data from a URL

With LLM extraction, you can easily extract structured data from any URL. We support Pydantic schemas to make it easier for you, too. Here is how to use it:

```python
from typing import List

from pydantic import BaseModel, Field

class ArticleSchema(BaseModel):
    title: str
    points: int
    by: str
    commentsURL: str

class TopArticlesSchema(BaseModel):
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

data = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction'
    },
    'pageOptions': {
        'onlyMainContent': True
    }
})
print(data["llm_extraction"])
```

### Search for a query

Use the `search` method to search the web, get the most relevant results, scrape each page, and return the markdown.

```python
query = 'what is mendable?'
search_result = app.search(query)
```

### Crawling a Website

To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.

The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job, every `poll_interval` seconds, until it is completed or fails. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.

```python
crawl_url = 'https://example.com'
params = {
    'crawlerOptions': {
        'excludes': ['blog/*'],
        'includes': [],  # leave empty for all pages
        'limit': 1000,
    },
    'pageOptions': {
        'onlyMainContent': True
    }
}
crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, poll_interval=2)
```

If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.

### Checking Crawl Status

To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.

```python
job_id = crawl_result['jobId']
status = app.check_crawl_status(job_id)
```

## Error Handling

The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.

## Running the Tests with Pytest

To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling.

### Running the Tests

To run the tests, execute the following commands:

Install pytest:

```bash
pip install pytest
```

Run:

```bash
pytest firecrawl/__tests__/e2e_withAuth/test.py
```

## Contributing

Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.

## License

The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,57 @@ firecrawl-0.0.16/firecrawl/__init__.py
"""
This is the Firecrawl package.

This package provides a Python SDK for interacting with the Firecrawl API.
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
and check the status of these jobs.

For more information visit https://github.com/firecrawl/
"""

import logging
import os

from .firecrawl import FirecrawlApp

__version__ = "0.0.16"

# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")


def _basic_config() -> None:
    """Set up basic configuration for logging with a specific format and date format."""
    try:
        logging.basicConfig(
            format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    except Exception as e:
        logger.error("Failed to configure logging: %s", e)


def setup_logging() -> None:
    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
    env = os.environ.get(
        "FIRECRAWL_LOGGING_LEVEL", "INFO"
    ).upper()  # Default to 'INFO' level
    _basic_config()

    if env == "DEBUG":
        logger.setLevel(logging.DEBUG)
    elif env == "INFO":
        logger.setLevel(logging.INFO)
    elif env == "WARNING":
        logger.setLevel(logging.WARNING)
    elif env == "ERROR":
        logger.setLevel(logging.ERROR)
    elif env == "CRITICAL":
        logger.setLevel(logging.CRITICAL)
    else:
        logger.setLevel(logging.INFO)
        logger.warning("Unknown logging level: %s, defaulting to INFO", env)


# Initialize logging configuration when the module is imported
setup_logging()
logger.debug("Debugging logger setup")
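
A note on the logging setup above: `setup_logging()` runs at import time and reads `FIRECRAWL_LOGGING_LEVEL`, so the level has to be chosen before the package is first imported. A minimal sketch (the variable can equally be exported in the shell beforehand):

```python
import os

# Must be set before the first `import firecrawl`, because setup_logging()
# reads FIRECRAWL_LOGGING_LEVEL when the package is imported.
os.environ["FIRECRAWL_LOGGING_LEVEL"] = "DEBUG"

from firecrawl import FirecrawlApp  # the "firecrawl" logger is now at DEBUG level
```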
firecrawl-0.0.16/firecrawl/__tests__/e2e_withAuth/__init__.py: File without changes
@@ -0,0 +1,168 @@ firecrawl-0.0.16/firecrawl/__tests__/e2e_withAuth/test.py
import importlib.util
import pytest
import time
import os
from uuid import uuid4
from dotenv import load_dotenv

load_dotenv()

API_URL = "http://127.0.0.1:3002"
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
TEST_API_KEY = os.getenv('TEST_API_KEY')

print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")

spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
firecrawl = importlib.util.module_from_spec(spec)
spec.loader.exec_module(firecrawl)
FirecrawlApp = firecrawl.FirecrawlApp

def test_no_api_key():
    with pytest.raises(Exception) as excinfo:
        invalid_app = FirecrawlApp(api_url=API_URL)
    assert "No API key provided" in str(excinfo.value)

def test_scrape_url_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.scrape_url('https://firecrawl.dev')
    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

def test_blocklisted_url():
    blocklisted_url = "https://facebook.com/fake-test"
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    with pytest.raises(Exception) as excinfo:
        app.scrape_url(blocklisted_url)
    assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

def test_successful_response_with_valid_preview_token():
    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
    response = app.scrape_url('https://roastmywebsite.ai')
    assert response is not None
    assert 'content' in response
    assert "_Roast_" in response['content']

def test_scrape_url_e2e():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.scrape_url('https://roastmywebsite.ai')
    assert response is not None
    assert 'content' in response
    assert 'markdown' in response
    assert 'metadata' in response
    assert 'html' not in response
    assert "_Roast_" in response['content']

def test_successful_response_with_valid_api_key_and_include_html():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
    assert response is not None
    assert 'content' in response
    assert 'markdown' in response
    assert 'html' in response
    assert 'metadata' in response
    assert "_Roast_" in response['content']
    assert "_Roast_" in response['markdown']
    assert "<h1" in response['html']

def test_successful_response_for_valid_scrape_with_pdf_file():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
    assert response is not None
    assert 'content' in response
    assert 'metadata' in response
    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']

def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
    time.sleep(6)  # wait for 6 seconds
    assert response is not None
    assert 'content' in response
    assert 'metadata' in response
    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']

def test_crawl_url_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.crawl_url('https://firecrawl.dev')
    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

def test_should_return_error_for_blocklisted_url():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    blocklisted_url = "https://twitter.com/fake-test"
    with pytest.raises(Exception) as excinfo:
        app.crawl_url(blocklisted_url)
    assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)

def test_crawl_url_wait_for_completion_e2e():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
    assert response is not None
    assert len(response) > 0
    assert 'content' in response[0]
    assert "_Roast_" in response[0]['content']

def test_crawl_url_with_idempotency_key_e2e():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    uniqueIdempotencyKey = str(uuid4())
    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
    assert response is not None
    assert len(response) > 0
    assert 'content' in response[0]
    assert "_Roast_" in response[0]['content']

    with pytest.raises(Exception) as excinfo:
        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)

def test_check_crawl_status_e2e():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
    assert response is not None
    assert 'jobId' in response

    time.sleep(30)  # wait for 30 seconds
    status_response = app.check_crawl_status(response['jobId'])
    assert status_response is not None
    assert 'status' in status_response
    assert status_response['status'] == 'completed'
    assert 'data' in status_response
    assert len(status_response['data']) > 0

def test_search_e2e():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.search("test query")
    assert response is not None
    assert 'content' in response[0]
    assert len(response) > 2

def test_search_invalid_api_key():
    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as excinfo:
        invalid_app.search("test query")
    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)

def test_llm_extraction():
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.scrape_url("https://mendable.ai", {
        'extractorOptions': {
            'mode': 'llm-extraction',
            'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
            'extractionSchema': {
                'type': 'object',
                'properties': {
                    'company_mission': {'type': 'string'},
                    'supports_sso': {'type': 'boolean'},
                    'is_open_source': {'type': 'boolean'}
                },
                'required': ['company_mission', 'supports_sso', 'is_open_source']
            }
        }
    })
    assert response is not None
    assert 'llm_extraction' in response
    llm_extraction = response['llm_extraction']
    assert 'company_mission' in llm_extraction
    assert isinstance(llm_extraction['supports_sso'], bool)
    assert isinstance(llm_extraction['is_open_source'], bool)
@@ -0,0 +1,318 @@ firecrawl-0.0.16/firecrawl/firecrawl.py
"""
FirecrawlApp Module

This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
and check the status of these jobs. The module uses requests for HTTP communication
and handles retries for certain HTTP status codes.

Classes:
    - FirecrawlApp: Main class for interacting with the Firecrawl API.
"""
import logging
import os
import time
from typing import Any, Dict, Optional

import requests

logger: logging.Logger = logging.getLogger("firecrawl")

class FirecrawlApp:
    """
    Initialize the FirecrawlApp instance.

    Args:
        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
        api_url (Optional[str]): Base URL for the Firecrawl API.
    """
    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        if self.api_key is None:
            logger.warning("No API key provided")
            raise ValueError('No API key provided')
        else:
            logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)

        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
        if self.api_url != 'https://api.firecrawl.dev':
            logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)

    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
        Scrape the specified URL using the Firecrawl API.

        Args:
            url (str): The URL to scrape.
            params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.

        Returns:
            Any: The scraped data if the request is successful.

        Raises:
            Exception: If the scrape request fails.
        """

        headers = self._prepare_headers()

        # Prepare the base scrape parameters with the URL
        scrape_params = {'url': url}

        # If there are additional params, process them
        if params:
            # Initialize extractorOptions if present
            extractor_options = params.get('extractorOptions', {})
            # Check and convert the extractionSchema if it's a Pydantic model
            if 'extractionSchema' in extractor_options:
                if hasattr(extractor_options['extractionSchema'], 'schema'):
                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
                # Update the scrape_params with the processed extractorOptions
                scrape_params['extractorOptions'] = extractor_options

            # Include any other params directly at the top level of scrape_params
            for key, value in params.items():
                if key != 'extractorOptions':
                    scrape_params[key] = value
        # Make the POST request with the prepared headers and JSON data
        response = requests.post(
            f'{self.api_url}/v0/scrape',
            headers=headers,
            json=scrape_params,
        )
        if response.status_code == 200:
            response = response.json()
            if response['success'] and 'data' in response:
                return response['data']
            else:
                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
        else:
            self._handle_error(response, 'scrape URL')

    def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
        Perform a search using the Firecrawl API.

        Args:
            query (str): The search query.
            params (Optional[Dict[str, Any]]): Additional parameters for the search request.

        Returns:
            Any: The search results if the request is successful.

        Raises:
            Exception: If the search request fails.
        """
        headers = self._prepare_headers()
        json_data = {'query': query}
        if params:
            json_data.update(params)
        response = requests.post(
            f'{self.api_url}/v0/search',
            headers=headers,
            json=json_data
        )
        if response.status_code == 200:
            response = response.json()

            if response['success'] and 'data' in response:
                return response['data']
            else:
                raise Exception(f'Failed to search. Error: {response["error"]}')

        else:
            self._handle_error(response, 'search')

    def crawl_url(self, url: str,
                  params: Optional[Dict[str, Any]] = None,
                  wait_until_done: bool = True,
                  poll_interval: int = 2,
                  idempotency_key: Optional[str] = None) -> Any:
        """
        Initiate a crawl job for the specified URL using the Firecrawl API.

        Args:
            url (str): The URL to crawl.
            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
            wait_until_done (bool): Whether to wait until the crawl job is completed.
            poll_interval (int): Time in seconds between status checks when waiting for job completion.
            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.

        Returns:
            Any: The crawl job ID or the crawl results if waiting until completion.

        Raises:
            Exception: If the crawl job initiation or monitoring fails.
        """
        headers = self._prepare_headers(idempotency_key)
        json_data = {'url': url}
        if params:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
        if response.status_code == 200:
            job_id = response.json().get('jobId')
            if wait_until_done:
                return self._monitor_job_status(job_id, headers, poll_interval)
            else:
                return {'jobId': job_id}
        else:
            self._handle_error(response, 'start crawl job')

    def check_crawl_status(self, job_id: str) -> Any:
        """
        Check the status of a crawl job using the Firecrawl API.

        Args:
            job_id (str): The ID of the crawl job.

        Returns:
            Any: The status of the crawl job.

        Raises:
            Exception: If the status check request fails.
        """
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
        if response.status_code == 200:
            return response.json()
        else:
            self._handle_error(response, 'check crawl status')

    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """
        Prepare the headers for API requests.

        Args:
            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.

        Returns:
            Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
        """
        if idempotency_key:
            return {
                'Content-Type': 'application/json',
                'Authorization': f'Bearer {self.api_key}',
                'x-idempotency-key': idempotency_key
            }

        return {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}',
        }

    def _post_request(self, url: str,
                      data: Dict[str, Any],
                      headers: Dict[str, str],
                      retries: int = 3,
                      backoff_factor: float = 0.5) -> requests.Response:
        """
        Make a POST request with retries.

        Args:
            url (str): The URL to send the POST request to.
            data (Dict[str, Any]): The JSON data to include in the POST request.
            headers (Dict[str, str]): The headers to include in the POST request.
            retries (int): Number of retries for the request.
            backoff_factor (float): Backoff factor for retries.

        Returns:
            requests.Response: The response from the POST request.

        Raises:
            requests.RequestException: If the request fails after the specified retries.
        """
        for attempt in range(retries):
            response = requests.post(url, headers=headers, json=data)
            if response.status_code == 502:
                time.sleep(backoff_factor * (2 ** attempt))
            else:
                return response
        return response

    def _get_request(self, url: str,
                     headers: Dict[str, str],
                     retries: int = 3,
                     backoff_factor: float = 0.5) -> requests.Response:
        """
        Make a GET request with retries.

        Args:
            url (str): The URL to send the GET request to.
            headers (Dict[str, str]): The headers to include in the GET request.
            retries (int): Number of retries for the request.
            backoff_factor (float): Backoff factor for retries.

        Returns:
            requests.Response: The response from the GET request.

        Raises:
            requests.RequestException: If the request fails after the specified retries.
        """
        for attempt in range(retries):
            response = requests.get(url, headers=headers)
            if response.status_code == 502:
                time.sleep(backoff_factor * (2 ** attempt))
            else:
                return response
        return response

    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
        """
        Monitor the status of a crawl job until completion.

        Args:
            job_id (str): The ID of the crawl job.
            headers (Dict[str, str]): The headers to include in the status check requests.
            poll_interval (int): Seconds between status checks.

        Returns:
            Any: The crawl results if the job is completed successfully.

        Raises:
            Exception: If the job fails or an error occurs during status checks.
        """
        while True:
            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
            if status_response.status_code == 200:
                status_data = status_response.json()
                if status_data['status'] == 'completed':
                    if 'data' in status_data:
                        return status_data['data']
                    else:
                        raise Exception('Crawl job completed but no data was returned')
                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
                    poll_interval = max(poll_interval, 2)
                    time.sleep(poll_interval)  # Wait for the specified interval before checking again
                else:
                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
            else:
                self._handle_error(status_response, 'check crawl status')

    def _handle_error(self, response: requests.Response, action: str) -> None:
        """
        Handle errors from API responses.

        Args:
            response (requests.Response): The response object from the API request.
            action (str): Description of the action that was being performed.

        Raises:
            Exception: An exception with a message containing the status code and error details from the response.
        """
        error_message = response.json().get('error', 'No additional error details provided.')

        if response.status_code == 402:
            message = f"Payment Required: Failed to {action}. {error_message}"
        elif response.status_code == 408:
            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
        elif response.status_code == 409:
            message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
        elif response.status_code == 500:
            message = f"Internal Server Error: Failed to {action}. {error_message}"
        else:
            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"

        # Raise an HTTPError with the custom message and attach the response
        raise requests.exceptions.HTTPError(message, response=response)
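
A side note on the retry behaviour above: `_post_request` and `_get_request` only retry on HTTP 502 responses, sleeping `backoff_factor * 2**attempt` seconds after each one, so with the defaults (`retries=3`, `backoff_factor=0.5`) the delays are 0.5 s, 1 s and 2 s, after which the last response is returned as-is. A quick sketch of that schedule:

```python
retries, backoff_factor = 3, 0.5

# Delay applied after each 502 response, indexed by attempt number
delays = [backoff_factor * (2 ** attempt) for attempt in range(retries)]
print(delays)  # [0.5, 1.0, 2.0]
```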
@@ -0,0 +1,181 @@ firecrawl-0.0.16/firecrawl.egg-info/PKG-INFO
Metadata-Version: 2.1
Name: firecrawl
Version: 0.0.16
Summary: Python SDK for Firecrawl API
Home-page: https://github.com/mendableai/firecrawl
Author: Mendable.ai
Author-email: "Mendable.ai" <nick@mendable.ai>
Maintainer-email: "Mendable.ai" <nick@mendable.ai>
License: GNU General Public License v3 (GPLv3)
Project-URL: Documentation, https://docs.firecrawl.dev
Project-URL: Source, https://github.com/mendableai/firecrawl
Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
Keywords: SDK,API,firecrawl
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Web Environment
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Natural Language :: English
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Topic :: Internet
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
Classifier: Topic :: Software Development
Classifier: Topic :: Software Development :: Libraries
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing
Classifier: Topic :: Text Processing :: Indexing
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: requests

# Firecrawl Python SDK

The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.

## Installation

To install the Firecrawl Python SDK, you can use pip:

```bash
pip install firecrawl-py
```

## Usage

1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.

Here's an example of how to use the SDK:

```python
from firecrawl import FirecrawlApp

# Initialize the FirecrawlApp with your API key
app = FirecrawlApp(api_key='your_api_key')

# Scrape a single URL
url = 'https://mendable.ai'
scraped_data = app.scrape_url(url)

# Crawl a website
crawl_url = 'https://mendable.ai'
params = {
    'pageOptions': {
        'onlyMainContent': True
    }
}
crawl_result = app.crawl_url(crawl_url, params=params)
```

### Scraping a URL

To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.

```python
url = 'https://example.com'
scraped_data = app.scrape_url(url)
```

### Extracting structured data from a URL

With LLM extraction, you can easily extract structured data from any URL. We support Pydantic schemas to make it easier for you, too. Here is how to use it:

```python
from typing import List

from pydantic import BaseModel, Field

class ArticleSchema(BaseModel):
    title: str
    points: int
    by: str
    commentsURL: str

class TopArticlesSchema(BaseModel):
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

data = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction'
    },
    'pageOptions': {
        'onlyMainContent': True
    }
})
print(data["llm_extraction"])
```

### Search for a query

Use the `search` method to search the web, get the most relevant results, scrape each page, and return the markdown.

```python
query = 'what is mendable?'
search_result = app.search(query)
```

### Crawling a Website

To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.

The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job, every `poll_interval` seconds, until it is completed or fails. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.

```python
crawl_url = 'https://example.com'
params = {
    'crawlerOptions': {
        'excludes': ['blog/*'],
        'includes': [],  # leave empty for all pages
        'limit': 1000,
    },
    'pageOptions': {
        'onlyMainContent': True
    }
}
crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, poll_interval=2)
```

If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.

### Checking Crawl Status

To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.

```python
job_id = crawl_result['jobId']
status = app.check_crawl_status(job_id)
```

## Error Handling

The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.

## Running the Tests with Pytest

To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling.

### Running the Tests

To run the tests, execute the following commands:

Install pytest:

```bash
pip install pytest
```

Run:

```bash
pytest firecrawl/__tests__/e2e_withAuth/test.py
```

## Contributing

Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.

## License

The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,12 @@ firecrawl-0.0.16/firecrawl.egg-info/SOURCES.txt
README.md
pyproject.toml
setup.py
firecrawl/__init__.py
firecrawl/firecrawl.py
firecrawl.egg-info/PKG-INFO
firecrawl.egg-info/SOURCES.txt
firecrawl.egg-info/dependency_links.txt
firecrawl.egg-info/requires.txt
firecrawl.egg-info/top_level.txt
firecrawl/__tests__/e2e_withAuth/__init__.py
firecrawl/__tests__/e2e_withAuth/test.py
@@ -0,0 +1 @@ firecrawl-0.0.16/firecrawl.egg-info/dependency_links.txt

@@ -0,0 +1 @@ firecrawl-0.0.16/firecrawl.egg-info/requires.txt
requests
@@ -0,0 +1,48 @@ firecrawl-0.0.16/pyproject.toml
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
dynamic = ["version"]
name = "firecrawl"
description = "Python SDK for Firecrawl API"
readme = {file = "README.md", content-type = "text/markdown"}
requires-python = ">=3.8"
dependencies = [
    "requests",
]
authors = [{name = "Mendable.ai", email = "nick@mendable.ai"}]
maintainers = [{name = "Mendable.ai", email = "nick@mendable.ai"}]
license = {text = "GNU General Public License v3 (GPLv3)"}

classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Environment :: Web Environment",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
    "Natural Language :: English",
    "Operating System :: OS Independent",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Topic :: Internet",
    "Topic :: Internet :: WWW/HTTP",
    "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
    "Topic :: Software Development",
    "Topic :: Software Development :: Libraries",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Text Processing",
    "Topic :: Text Processing :: Indexing",
]

keywords = ["SDK", "API", "firecrawl"]

[project.urls]
"Documentation" = "https://docs.firecrawl.dev"
"Source" = "https://github.com/mendableai/firecrawl"
"Tracker" = "https://github.com/mendableai/firecrawl/issues"

[tool.setuptools.packages.find]
where = ["."]
@@ -0,0 +1,63 @@ firecrawl-0.0.16/setup.py
import re
from pathlib import Path

from setuptools import find_packages, setup

this_directory = Path(__file__).parent
long_description_content = (this_directory / "README.md").read_text()


def get_version():
    """Dynamically set version"""
    version_file = (this_directory / "firecrawl" / "__init__.py").read_text()
    version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
    if version_match:
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")


setup(
    name="firecrawl",
    version=get_version(),
    url="https://github.com/mendableai/firecrawl",
    author="Mendable.ai",
    author_email="nick@mendable.ai",
    description="Python SDK for Firecrawl API",
    long_description=long_description_content,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    install_requires=[
        'requests',
        'pytest',
        'python-dotenv',
    ],
    python_requires=">=3.8",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Environment :: Web Environment",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
        "Natural Language :: English",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Topic :: Internet",
        "Topic :: Internet :: WWW/HTTP",
        "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
        "Topic :: Software Development",
        "Topic :: Software Development :: Libraries",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Topic :: Text Processing",
        "Topic :: Text Processing :: Indexing",
    ],
    keywords="SDK API firecrawl",
    project_urls={
        "Documentation": "https://docs.firecrawl.dev",
        "Source": "https://github.com/mendableai/firecrawl",
        "Tracker": "https://github.com/mendableai/firecrawl/issues",
    },
    license="GNU General Public License v3 (GPLv3)",
)