firecrawl 2.16.5__tar.gz → 3.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of firecrawl might be problematic.
- {firecrawl-2.16.5 → firecrawl-3.0.3}/LICENSE +0 -0
- {firecrawl-2.16.5 → firecrawl-3.0.3}/PKG-INFO +49 -32
- {firecrawl-2.16.5 → firecrawl-3.0.3}/README.md +44 -28
- {firecrawl-2.16.5 → firecrawl-3.0.3}/firecrawl/__init__.py +27 -19
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/test_search.py +265 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- firecrawl-3.0.3/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl-3.0.3/firecrawl/client.py +241 -0
- firecrawl-2.16.5/firecrawl/firecrawl.py → firecrawl-3.0.3/firecrawl/firecrawl.backup.py +17 -15
- firecrawl-3.0.3/firecrawl/types.py +157 -0
- firecrawl-3.0.3/firecrawl/v1/__init__.py +14 -0
- firecrawl-3.0.3/firecrawl/v1/client.py +4653 -0
- firecrawl-3.0.3/firecrawl/v2/__init__.py +4 -0
- firecrawl-3.0.3/firecrawl/v2/client.py +802 -0
- firecrawl-3.0.3/firecrawl/v2/client_async.py +250 -0
- firecrawl-3.0.3/firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl-3.0.3/firecrawl/v2/methods/aio/batch.py +85 -0
- firecrawl-3.0.3/firecrawl/v2/methods/aio/crawl.py +174 -0
- firecrawl-3.0.3/firecrawl/v2/methods/aio/extract.py +126 -0
- firecrawl-3.0.3/firecrawl/v2/methods/aio/map.py +59 -0
- firecrawl-3.0.3/firecrawl/v2/methods/aio/scrape.py +36 -0
- firecrawl-3.0.3/firecrawl/v2/methods/aio/search.py +58 -0
- firecrawl-3.0.3/firecrawl/v2/methods/aio/usage.py +42 -0
- firecrawl-3.0.3/firecrawl/v2/methods/batch.py +420 -0
- firecrawl-3.0.3/firecrawl/v2/methods/crawl.py +468 -0
- firecrawl-3.0.3/firecrawl/v2/methods/extract.py +131 -0
- firecrawl-3.0.3/firecrawl/v2/methods/map.py +77 -0
- firecrawl-3.0.3/firecrawl/v2/methods/scrape.py +68 -0
- firecrawl-3.0.3/firecrawl/v2/methods/search.py +173 -0
- firecrawl-3.0.3/firecrawl/v2/methods/usage.py +41 -0
- firecrawl-3.0.3/firecrawl/v2/types.py +546 -0
- firecrawl-3.0.3/firecrawl/v2/utils/__init__.py +9 -0
- firecrawl-3.0.3/firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl-3.0.3/firecrawl/v2/utils/get_version.py +15 -0
- firecrawl-3.0.3/firecrawl/v2/utils/http_client.py +153 -0
- firecrawl-3.0.3/firecrawl/v2/utils/http_client_async.py +64 -0
- firecrawl-3.0.3/firecrawl/v2/utils/validation.py +324 -0
- firecrawl-3.0.3/firecrawl/v2/watcher.py +312 -0
- firecrawl-3.0.3/firecrawl/v2/watcher_async.py +245 -0
- {firecrawl-2.16.5 → firecrawl-3.0.3}/firecrawl.egg-info/PKG-INFO +49 -32
- firecrawl-3.0.3/firecrawl.egg-info/SOURCES.txt +82 -0
- {firecrawl-2.16.5 → firecrawl-3.0.3}/firecrawl.egg-info/requires.txt +1 -0
- {firecrawl-2.16.5 → firecrawl-3.0.3}/pyproject.toml +3 -2
- {firecrawl-2.16.5 → firecrawl-3.0.3}/setup.py +3 -3
- {firecrawl-2.16.5 → firecrawl-3.0.3}/tests/test_change_tracking.py +0 -0
- firecrawl-3.0.3/tests/test_timeout_conversion.py +117 -0
- firecrawl-2.16.5/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- firecrawl-2.16.5/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- firecrawl-2.16.5/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- firecrawl-2.16.5/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- firecrawl-2.16.5/firecrawl.egg-info/SOURCES.txt +0 -16
- {firecrawl-2.16.5 → firecrawl-3.0.3}/firecrawl.egg-info/dependency_links.txt +0 -0
- {firecrawl-2.16.5 → firecrawl-3.0.3}/firecrawl.egg-info/top_level.txt +0 -0
- {firecrawl-2.16.5 → firecrawl-3.0.3}/setup.cfg +0 -0
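Taken together, the file list shows the shape of the 3.x rewrite: the old monolithic client is parked as `firecrawl.backup.py`, the 2.x surface moves under `firecrawl/v1/`, and a new v2 client (sync and async, with per-method modules and job watchers) becomes the default. A minimal sketch of how callers reach the new layout, using only names that appear in this diff (`firecrawl/client.py`, `firecrawl/v1/`, `firecrawl/types.py`); exact signatures follow the README hunks below and should be treated as illustrative:

```python
# Sketch only: module and class names come from the file list and the
# __init__.py exports shown later in this diff.
from firecrawl import Firecrawl, AsyncFirecrawl  # v2-first entry points
from firecrawl.v1 import V1FirecrawlApp          # feature-frozen legacy client
from firecrawl.types import ScrapeOptions        # re-exported option types

firecrawl = Firecrawl(api_key="fc-YOUR_API_KEY")
doc = firecrawl.scrape('https://firecrawl.dev', formats=['markdown'])
```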
{firecrawl-2.16.5 → firecrawl-3.0.3}/PKG-INFO

@@ -1,15 +1,15 @@
 Metadata-Version: 2.1
 Name: firecrawl
-Version: 2.16.5
+Version: 3.0.3
 Summary: Python SDK for Firecrawl API
-Home-page: https://github.com/
+Home-page: https://github.com/firecrawl/firecrawl
 Author: Mendable.ai
 Author-email: "Mendable.ai" <nick@mendable.ai>
 Maintainer-email: "Mendable.ai" <nick@mendable.ai>
 License: MIT License
 Project-URL: Documentation, https://docs.firecrawl.dev
-Project-URL: Source, https://github.com/
-Project-URL: Tracker, https://github.com/
+Project-URL: Source, https://github.com/firecrawl/firecrawl
+Project-URL: Tracker, https://github.com/firecrawl/firecrawl/issues
 Keywords: SDK,API,firecrawl
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Environment :: Web Environment

@@ -34,6 +34,7 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
+Requires-Dist: httpx
 Requires-Dist: python-dotenv
 Requires-Dist: websockets
 Requires-Dist: nest-asyncio

@@ -55,24 +56,25 @@ pip install firecrawl-py
 ## Usage
 
 1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
-2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `
+2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `Firecrawl` class.
 
 Here's an example of how to use the SDK:
 
 ```python
-from firecrawl import
+from firecrawl import Firecrawl
+from firecrawl.types import ScrapeOptions
 
-
+firecrawl = Firecrawl(api_key="fc-YOUR_API_KEY")
 
-# Scrape a website:
-data =
+# Scrape a website (v2):
+data = firecrawl.scrape(
     'https://firecrawl.dev',
     formats=['markdown', 'html']
 )
 print(data)
 
-# Crawl a website:
-crawl_status =
+# Crawl a website (v2 waiter):
+crawl_status = firecrawl.crawl(
     'https://firecrawl.dev',
     limit=100,
     scrape_options=ScrapeOptions(formats=['markdown', 'html'])

@@ -82,20 +84,20 @@ print(crawl_status)
 
 ### Scraping a URL
 
-To scrape a single URL, use the `
+To scrape a single URL, use the `scrape` method. It takes the URL as a parameter and returns a document with the requested formats.
 
 ```python
-# Scrape a website:
-scrape_result =
+# Scrape a website (v2):
+scrape_result = firecrawl.scrape('https://firecrawl.dev', formats=['markdown', 'html'])
 print(scrape_result)
 ```
 
 ### Crawling a Website
 
-To crawl a website, use the `
+To crawl a website, use the `crawl` method. It takes the starting URL and optional parameters as arguments. You can control depth, limits, formats, and more.
 
 ```python
-crawl_status =
+crawl_status = firecrawl.crawl(
     'https://firecrawl.dev',
     limit=100,
     scrape_options=ScrapeOptions(formats=['markdown', 'html']),

@@ -108,23 +110,23 @@ print(crawl_status)
 
 <Tip>Looking for async operations? Check out the [Async Class](#async-class) section below.</Tip>
 
-To
+To enqueue a crawl asynchronously, use `start_crawl`. It returns the crawl `ID` which you can use to check the status of the crawl job.
 
 ```python
-
+crawl_job = firecrawl.start_crawl(
     'https://firecrawl.dev',
     limit=100,
     scrape_options=ScrapeOptions(formats=['markdown', 'html']),
 )
-print(
+print(crawl_job)
 ```
 
 ### Checking Crawl Status
 
-To check the status of a crawl job, use the `
+To check the status of a crawl job, use the `get_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
 
 ```python
-crawl_status =
+crawl_status = firecrawl.get_crawl_status("<crawl_id>")
 print(crawl_status)
 ```
 

@@ -133,17 +135,17 @@ print(crawl_status)
 To cancel an asynchronous crawl job, use the `cancel_crawl` method. It takes the job ID of the asynchronous crawl as a parameter and returns the cancellation status.
 
 ```python
-cancel_crawl =
+cancel_crawl = firecrawl.cancel_crawl(id)
 print(cancel_crawl)
 ```
 
 ### Map a Website
 
-Use `
+Use `map` to generate a list of URLs from a website. Options let you customize the mapping process, including whether to use the sitemap or include subdomains.
 
 ```python
-# Map a website:
-map_result =
+# Map a website (v2):
+map_result = firecrawl.map('https://firecrawl.dev')
 print(map_result)
 ```
 

@@ -194,20 +196,35 @@ The SDK handles errors returned by the Firecrawl API and raises appropriate exce
 
 ## Async Class
 
-For async operations, you can use the `
+For async operations, you can use the `AsyncFirecrawl` class. Its methods mirror the `Firecrawl` class, but you `await` them.
 
 ```python
-from firecrawl import
+from firecrawl import AsyncFirecrawl
 
-
+firecrawl = AsyncFirecrawl(api_key="YOUR_API_KEY")
 
-# Async Scrape
+# Async Scrape (v2)
 async def example_scrape():
-    scrape_result = await
+    scrape_result = await firecrawl.scrape(url="https://example.com")
     print(scrape_result)
 
-# Async Crawl
+# Async Crawl (v2)
 async def example_crawl():
-    crawl_result = await
+    crawl_result = await firecrawl.crawl(url="https://example.com")
     print(crawl_result)
 ```
+
+## v1 compatibility
+
+For legacy code paths, v1 remains available under `firecrawl.v1` with the original method names.
+
+```python
+from firecrawl import Firecrawl
+
+firecrawl = Firecrawl(api_key="YOUR_API_KEY")
+
+# v1 methods (feature‑frozen)
+doc_v1 = firecrawl.v1.scrape_url('https://firecrawl.dev', formats=['markdown', 'html'])
+crawl_v1 = firecrawl.v1.crawl_url('https://firecrawl.dev', limit=100)
+map_v1 = firecrawl.v1.map_url('https://firecrawl.dev')
+```
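The hunks above document `start_crawl` (which returns a crawl ID) and `get_crawl_status` separately but never combine them. A minimal polling sketch, assuming the started job exposes an `.id` attribute (mirroring `start_batch_scrape` in the e2e test at the end of this diff) and the terminal statuses asserted there:

```python
import time

from firecrawl import Firecrawl

firecrawl = Firecrawl(api_key="fc-YOUR_API_KEY")

# Enqueue the crawl, then poll until it reaches a terminal state.
job = firecrawl.start_crawl('https://firecrawl.dev', limit=100)
status = firecrawl.get_crawl_status(job.id)
while status.status not in ("completed", "failed", "cancelled"):
    time.sleep(2)
    status = firecrawl.get_crawl_status(job.id)
print(status.status)
```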
{firecrawl-2.16.5 → firecrawl-3.0.3}/README.md

@@ -13,24 +13,25 @@ pip install firecrawl-py
 ## Usage
 
 1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
-2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `
+2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `Firecrawl` class.
 
 Here's an example of how to use the SDK:
 
 ```python
-from firecrawl import
+from firecrawl import Firecrawl
+from firecrawl.types import ScrapeOptions
 
-
+firecrawl = Firecrawl(api_key="fc-YOUR_API_KEY")
 
-# Scrape a website:
-data =
+# Scrape a website (v2):
+data = firecrawl.scrape(
     'https://firecrawl.dev',
     formats=['markdown', 'html']
 )
 print(data)
 
-# Crawl a website:
-crawl_status =
+# Crawl a website (v2 waiter):
+crawl_status = firecrawl.crawl(
     'https://firecrawl.dev',
     limit=100,
     scrape_options=ScrapeOptions(formats=['markdown', 'html'])

@@ -40,20 +41,20 @@ print(crawl_status)
 
 ### Scraping a URL
 
-To scrape a single URL, use the `
+To scrape a single URL, use the `scrape` method. It takes the URL as a parameter and returns a document with the requested formats.
 
 ```python
-# Scrape a website:
-scrape_result =
+# Scrape a website (v2):
+scrape_result = firecrawl.scrape('https://firecrawl.dev', formats=['markdown', 'html'])
 print(scrape_result)
 ```
 
 ### Crawling a Website
 
-To crawl a website, use the `
+To crawl a website, use the `crawl` method. It takes the starting URL and optional parameters as arguments. You can control depth, limits, formats, and more.
 
 ```python
-crawl_status =
+crawl_status = firecrawl.crawl(
     'https://firecrawl.dev',
     limit=100,
     scrape_options=ScrapeOptions(formats=['markdown', 'html']),

@@ -66,23 +67,23 @@ print(crawl_status)
 
 <Tip>Looking for async operations? Check out the [Async Class](#async-class) section below.</Tip>
 
-To
+To enqueue a crawl asynchronously, use `start_crawl`. It returns the crawl `ID` which you can use to check the status of the crawl job.
 
 ```python
-
+crawl_job = firecrawl.start_crawl(
     'https://firecrawl.dev',
     limit=100,
     scrape_options=ScrapeOptions(formats=['markdown', 'html']),
 )
-print(
+print(crawl_job)
 ```
 
 ### Checking Crawl Status
 
-To check the status of a crawl job, use the `
+To check the status of a crawl job, use the `get_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
 
 ```python
-crawl_status =
+crawl_status = firecrawl.get_crawl_status("<crawl_id>")
 print(crawl_status)
 ```
 

@@ -91,17 +92,17 @@ print(crawl_status)
 To cancel an asynchronous crawl job, use the `cancel_crawl` method. It takes the job ID of the asynchronous crawl as a parameter and returns the cancellation status.
 
 ```python
-cancel_crawl =
+cancel_crawl = firecrawl.cancel_crawl(id)
 print(cancel_crawl)
 ```
 
 ### Map a Website
 
-Use `
+Use `map` to generate a list of URLs from a website. Options let you customize the mapping process, including whether to use the sitemap or include subdomains.
 
 ```python
-# Map a website:
-map_result =
+# Map a website (v2):
+map_result = firecrawl.map('https://firecrawl.dev')
 print(map_result)
 ```
 

@@ -152,20 +153,35 @@ The SDK handles errors returned by the Firecrawl API and raises appropriate exce
 
 ## Async Class
 
-For async operations, you can use the `
+For async operations, you can use the `AsyncFirecrawl` class. Its methods mirror the `Firecrawl` class, but you `await` them.
 
 ```python
-from firecrawl import
+from firecrawl import AsyncFirecrawl
 
-
+firecrawl = AsyncFirecrawl(api_key="YOUR_API_KEY")
 
-# Async Scrape
+# Async Scrape (v2)
 async def example_scrape():
-    scrape_result = await
+    scrape_result = await firecrawl.scrape(url="https://example.com")
     print(scrape_result)
 
-# Async Crawl
+# Async Crawl (v2)
 async def example_crawl():
-    crawl_result = await
+    crawl_result = await firecrawl.crawl(url="https://example.com")
     print(crawl_result)
+```
+
+## v1 compatibility
+
+For legacy code paths, v1 remains available under `firecrawl.v1` with the original method names.
+
+```python
+from firecrawl import Firecrawl
+
+firecrawl = Firecrawl(api_key="YOUR_API_KEY")
+
+# v1 methods (feature‑frozen)
+doc_v1 = firecrawl.v1.scrape_url('https://firecrawl.dev', formats=['markdown', 'html'])
+crawl_v1 = firecrawl.v1.crawl_url('https://firecrawl.dev', limit=100)
+map_v1 = firecrawl.v1.map_url('https://firecrawl.dev')
 ```
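Both copies of the async example define `example_scrape` and `example_crawl` without ever running them. A runnable wrapper, assuming nothing beyond the documented `AsyncFirecrawl.scrape` call:

```python
import asyncio

from firecrawl import AsyncFirecrawl

firecrawl = AsyncFirecrawl(api_key="YOUR_API_KEY")

async def example_scrape():
    # Same call as the README's async example.
    scrape_result = await firecrawl.scrape(url="https://example.com")
    print(scrape_result)

# Coroutines need an event loop; asyncio.run supplies one.
asyncio.run(example_scrape())
```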
{firecrawl-2.16.5 → firecrawl-3.0.3}/firecrawl/__init__.py

@@ -1,19 +1,23 @@
 """
-
+Firecrawl Python SDK
 
-This package provides a Python SDK for interacting with the Firecrawl API.
-It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
-and check the status of these jobs.
-
-For more information visit https://github.com/firecrawl/
 """
 
 import logging
 import os
 
-from .
+from .client import Firecrawl, AsyncFirecrawl, FirecrawlApp, AsyncFirecrawlApp
+from .v2.watcher import Watcher
+from .v2.watcher_async import AsyncWatcher
+from .v1 import (
+    V1FirecrawlApp,
+    AsyncV1FirecrawlApp,
+    V1JsonConfig,
+    V1ScrapeOptions,
+    V1ChangeTrackingOptions,
+)
 
-__version__ = "
+__version__ = "3.0.3"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

@@ -27,17 +31,14 @@ def _configure_logger() -> None:
     format to the firecrawl logger.
     """
     try:
-        # Create the formatter
         formatter = logging.Formatter(
             "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
             datefmt="%Y-%m-%d %H:%M:%S",
         )
 
-        # Create the console handler and set the formatter
         console_handler = logging.StreamHandler()
         console_handler.setFormatter(formatter)
 
-        # Add the console handler to the firecrawl logger
         logger.addHandler(console_handler)
     except Exception as e:
         logger.error("Failed to configure logging: %s", e)

@@ -45,20 +46,15 @@
 
 def setup_logging() -> None:
     """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
-    # Check if the firecrawl logger already has a handler
     if logger.hasHandlers():
-        return
+        return
 
-    # Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
     if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
-        # Attach a no-op handler to prevent warnings about no handlers
         logger.addHandler(logging.NullHandler())
         return
 
-    # Attach the console handler to the firecrawl logger
     _configure_logger()
 
-    # Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
     if env == "DEBUG":
         logger.setLevel(logging.DEBUG)
     elif env == "INFO":

@@ -73,7 +69,19 @@ def setup_logging() -> None:
         logger.setLevel(logging.INFO)
         logger.warning("Unknown logging level: %s, defaulting to INFO", env)
 
-
-# Initialize logging configuration when the module is imported
 setup_logging()
 logger.debug("Debugging logger setup")
+
+__all__ = [
+    'Firecrawl',
+    'AsyncFirecrawl',
+    'FirecrawlApp',
+    'AsyncFirecrawlApp',
+    'Watcher',
+    'AsyncWatcher',
+    'V1FirecrawlApp',
+    'AsyncV1FirecrawlApp',
+    'V1JsonConfig',
+    'V1ScrapeOptions',
+    'V1ChangeTrackingOptions',
+]
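Note that `setup_logging()` runs at import time and reads `FIRECRAWL_LOGGING_LEVEL` once, so the variable must be set before the first import. A short sketch of the behavior implemented above:

```python
import os

# Must be set before firecrawl is first imported: setup_logging() runs at
# module import and attaches a NullHandler when the variable is unset.
os.environ["FIRECRAWL_LOGGING_LEVEL"] = "DEBUG"

import firecrawl  # now emits: [... - firecrawl:NN - DEBUG] Debugging logger setup
```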
firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py (new file)

@@ -0,0 +1,79 @@
+import os
+import asyncio
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawl
+
+
+load_dotenv()
+
+if not os.getenv("API_KEY"):
+    raise ValueError("API_KEY is not set")
+
+if not os.getenv("API_URL"):
+    raise ValueError("API_URL is not set")
+
+
+@pytest.mark.asyncio
+async def test_async_batch_start_and_status():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    start = await client.start_batch_scrape([
+        "https://docs.firecrawl.dev",
+        "https://firecrawl.dev",
+    ], formats=["markdown"], max_concurrency=1)
+    job_id = start.id
+
+    deadline = asyncio.get_event_loop().time() + 240
+    status = await client.get_batch_scrape_status(job_id)
+    while status.status not in ("completed", "failed", "cancelled") and asyncio.get_event_loop().time() < deadline:
+        await asyncio.sleep(2)
+        status = await client.get_batch_scrape_status(job_id)
+
+    assert status.status in ("completed", "failed", "cancelled")
+
+
+@pytest.mark.asyncio
+async def test_async_batch_wait_minimal():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    job = await client.batch_scrape([
+        "https://docs.firecrawl.dev",
+        "https://firecrawl.dev",
+    ], formats=["markdown"], poll_interval=1, timeout=120)
+    assert job.status in ("completed", "failed")
+
+
+@pytest.mark.asyncio
+async def test_async_batch_wait_with_all_params():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}
+    job = await client.batch_scrape(
+        [
+            "https://docs.firecrawl.dev",
+            "https://firecrawl.dev",
+        ],
+        formats=[
+            "markdown",
+            {"type": "json", "prompt": "Extract page title", "schema": json_schema},
+            {"type": "changeTracking", "prompt": "Track changes", "modes": ["json"]},
+        ],
+        only_main_content=True,
+        mobile=False,
+        ignore_invalid_urls=True,
+        max_concurrency=2,
+        zero_data_retention=False,
+        poll_interval=1,
+        timeout=180,
+    )
+    assert job.status in ("completed", "failed")
+
+
+@pytest.mark.asyncio
+async def test_async_cancel_batch():
+    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+    start = await client.start_batch_scrape([
+        "https://docs.firecrawl.dev",
+        "https://firecrawl.dev",
+    ], formats=["markdown"], max_concurrency=1)
+    ok = await client.cancel_batch_scrape(start.id)
+    assert ok is True
+
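Outside pytest, the same start/poll/cancel flow can be driven directly; a standalone sketch that uses only the calls exercised above (`start_batch_scrape`, `get_batch_scrape_status`, `cancel_batch_scrape`):

```python
import asyncio
import os

from firecrawl import AsyncFirecrawl

async def main():
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
    start = await client.start_batch_scrape(
        ["https://docs.firecrawl.dev"], formats=["markdown"], max_concurrency=1
    )
    # Poll until the job reaches a terminal state, as the first test does.
    status = await client.get_batch_scrape_status(start.id)
    while status.status not in ("completed", "failed", "cancelled"):
        await asyncio.sleep(2)
        status = await client.get_batch_scrape_status(start.id)
    print(status.status)

asyncio.run(main())
```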