firecrawl 0.0.19__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl has been flagged as potentially problematic.

firecrawl/__init__.py CHANGED
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp
 
-__version__ = "0.0.19"
+__version__ = "1.1.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/__tests__/e2e_withAuth/test.py CHANGED
@@ -7,7 +7,7 @@ from dotenv import load_dotenv
 
 load_dotenv()
 
-API_URL = "http://127.0.0.1:3002";
+API_URL = "http://127.0.0.1:3002"
 ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
 TEST_API_KEY = os.getenv('TEST_API_KEY')
 
@@ -20,32 +20,34 @@ FirecrawlApp = firecrawl.FirecrawlApp
 
 def test_no_api_key():
     with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL)
+        invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
     assert "No API key provided" in str(excinfo.value)
 
 def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
     with pytest.raises(Exception) as excinfo:
         invalid_app.scrape_url('https://firecrawl.dev')
     assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_blocklisted_url():
     blocklisted_url = "https://facebook.com/fake-test"
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     with pytest.raises(Exception) as excinfo:
         app.scrape_url(blocklisted_url)
     assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
 def test_successful_response_with_valid_preview_token():
-    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
+    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
     response = app.scrape_url('https://roastmywebsite.ai')
     assert response is not None
     assert 'content' in response
     assert "_Roast_" in response['content']
 
 def test_scrape_url_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.scrape_url('https://roastmywebsite.ai')
+    print(response)
+
     assert response is not None
     assert 'content' in response
     assert 'markdown' in response
@@ -54,7 +56,7 @@ def test_scrape_url_e2e():
     assert "_Roast_" in response['content']
 
 def test_successful_response_with_valid_api_key_and_include_html():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
     assert response is not None
     assert 'content' in response
@@ -66,7 +68,7 @@ def test_successful_response_with_valid_api_key_and_include_html():
     assert "<h1" in response['html']
 
 def test_successful_response_for_valid_scrape_with_pdf_file():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
     assert response is not None
     assert 'content' in response
@@ -74,7 +76,7 @@ def test_successful_response_for_valid_scrape_with_pdf_file():
     assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
 
 def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
     time.sleep(6) # wait for 6 seconds
     assert response is not None
@@ -83,20 +85,20 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
     assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
 
 def test_crawl_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
     with pytest.raises(Exception) as excinfo:
         invalid_app.crawl_url('https://firecrawl.dev')
     assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_should_return_error_for_blocklisted_url():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     blocklisted_url = "https://twitter.com/fake-test"
     with pytest.raises(Exception) as excinfo:
         app.crawl_url(blocklisted_url)
     assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
 def test_crawl_url_wait_for_completion_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
     assert response is not None
     assert len(response) > 0
@@ -104,7 +106,7 @@ def test_crawl_url_wait_for_completion_e2e():
     assert "_Roast_" in response[0]['content']
 
 def test_crawl_url_with_idempotency_key_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     uniqueIdempotencyKey = str(uuid4())
     response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
     assert response is not None
@@ -117,7 +119,7 @@ def test_crawl_url_with_idempotency_key_e2e():
     assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
 
 def test_check_crawl_status_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
     assert response is not None
     assert 'jobId' in response
@@ -131,21 +133,21 @@ def test_check_crawl_status_e2e():
     assert len(status_response['data']) > 0
 
 def test_search_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.search("test query")
     assert response is not None
     assert 'content' in response[0]
     assert len(response) > 2
 
 def test_search_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
     with pytest.raises(Exception) as excinfo:
         invalid_app.search("test query")
     assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
 def test_llm_extraction():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
-    response = app.scrape_url("https://mendable.ai", {
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
+    response = app.scrape_url("https://firecrawl.dev", {
         'extractorOptions': {
             'mode': 'llm-extraction',
             'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
firecrawl/__tests__/v1/e2e_withAuth/test.py ADDED
@@ -0,0 +1,352 @@
+import importlib.util
+import pytest
+import time
+import os
+from uuid import uuid4
+from dotenv import load_dotenv
+from datetime import datetime
+
+load_dotenv()
+
+API_URL = "http://127.0.0.1:3002";
+ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
+TEST_API_KEY = os.getenv('TEST_API_KEY')
+
+print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
+
+spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
+firecrawl = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(firecrawl)
+FirecrawlApp = firecrawl.FirecrawlApp
+
+def test_no_api_key():
+    with pytest.raises(Exception) as excinfo:
+        invalid_app = FirecrawlApp(api_url=API_URL)
+    assert "No API key provided" in str(excinfo.value)
+
+def test_scrape_url_invalid_api_key():
+    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    with pytest.raises(Exception) as excinfo:
+        invalid_app.scrape_url('https://firecrawl.dev')
+    assert "Unauthorized: Invalid token" in str(excinfo.value)
+
+def test_blocklisted_url():
+    blocklisted_url = "https://facebook.com/fake-test"
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    with pytest.raises(Exception) as excinfo:
+        app.scrape_url(blocklisted_url)
+    assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
+
+def test_successful_response_with_valid_preview_token():
+    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
+    response = app.scrape_url('https://roastmywebsite.ai')
+    assert response is not None
+    assert "_Roast_" in response['markdown']
+    assert "content" not in response
+    assert "html" not in response
+    assert "metadata" in response
+    assert "links" not in response
+    assert "rawHtml" not in response
+
+def test_successful_response_for_valid_scrape():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.scrape_url('https://roastmywebsite.ai')
+    assert response is not None
+    assert 'markdown' in response
+    assert "_Roast_" in response['markdown']
+    assert 'metadata' in response
+    assert 'content' not in response
+    assert 'html' not in response
+    assert 'rawHtml' not in response
+    assert 'screenshot' not in response
+    assert 'links' not in response
+
+def test_successful_response_with_valid_api_key_and_options():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    params = {
+        'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
+        'headers': {'x-key': 'test'},
+        'includeTags': ['h1'],
+        'excludeTags': ['h2'],
+        'onlyMainContent': True,
+        'timeout': 30000,
+        'waitFor': 1000
+    }
+    response = app.scrape_url('https://roastmywebsite.ai', params)
+    assert response is not None
+    assert 'content' not in response
+    assert 'markdown' in response
+    assert 'html' in response
+    assert 'rawHtml' in response
+    assert 'screenshot' in response
+    assert 'links' in response
+    assert "_Roast_" in response['markdown']
+    assert "<h1" in response['html']
+    assert "<h1" in response['rawHtml']
+    assert "https://" in response['screenshot']
+    assert len(response['links']) > 0
+    assert "https://" in response['links'][0]
+    assert 'metadata' in response
+    assert 'title' in response['metadata']
+    assert 'description' in response['metadata']
+    assert 'keywords' in response['metadata']
+    assert 'robots' in response['metadata']
+    assert 'ogTitle' in response['metadata']
+    assert 'ogDescription' in response['metadata']
+    assert 'ogUrl' in response['metadata']
+    assert 'ogImage' in response['metadata']
+    assert 'ogLocaleAlternate' in response['metadata']
+    assert 'ogSiteName' in response['metadata']
+    assert 'sourceURL' in response['metadata']
+    assert 'statusCode' in response['metadata']
+    assert 'pageStatusCode' not in response['metadata']
+    assert 'pageError' not in response['metadata']
+    assert 'error' not in response['metadata']
+    assert response['metadata']['title'] == "Roast My Website"
+    assert response['metadata']['description'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
+    assert response['metadata']['keywords'] == "Roast My Website,Roast,Website,GitHub,Firecrawl"
+    assert response['metadata']['robots'] == "follow, index"
+    assert response['metadata']['ogTitle'] == "Roast My Website"
+    assert response['metadata']['ogDescription'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
+    assert response['metadata']['ogUrl'] == "https://www.roastmywebsite.ai"
+    assert response['metadata']['ogImage'] == "https://www.roastmywebsite.ai/og.png"
+    assert response['metadata']['ogLocaleAlternate'] == []
+    assert response['metadata']['ogSiteName'] == "Roast My Website"
+    assert response['metadata']['sourceURL'] == "https://roastmywebsite.ai"
+    assert response['metadata']['statusCode'] == 200
+
+def test_successful_response_for_valid_scrape_with_pdf_file():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
+    assert response is not None
+    assert 'content' not in response
+    assert 'metadata' in response
+    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
+
+def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
+    time.sleep(1) # wait for 1 second
+    assert response is not None
+    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
+
+def test_crawl_url_invalid_api_key():
+    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    with pytest.raises(Exception) as excinfo:
+        invalid_app.crawl_url('https://firecrawl.dev')
+    assert "Unauthorized: Invalid token" in str(excinfo.value)
+
+def test_should_return_error_for_blocklisted_url():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    blocklisted_url = "https://twitter.com/fake-test"
+    with pytest.raises(Exception) as excinfo:
+        app.crawl_url(blocklisted_url)
+    assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
+
+def test_crawl_url_wait_for_completion_e2e():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, True, 30)
+    assert response is not None
+    assert 'total' in response
+    assert response['total'] > 0
+    assert 'creditsUsed' in response
+    assert response['creditsUsed'] > 0
+    assert 'expiresAt' in response
+    assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
+    assert 'status' in response
+    assert response['status'] == 'completed'
+    assert 'next' not in response
+    assert len(response['data']) > 0
+    assert 'markdown' in response['data'][0]
+    assert "_Roast_" in response['data'][0]['markdown']
+    assert 'content' not in response['data'][0]
+    assert 'html' not in response['data'][0]
+    assert 'rawHtml' not in response['data'][0]
+    assert 'screenshot' not in response['data'][0]
+    assert 'links' not in response['data'][0]
+    assert 'metadata' in response['data'][0]
+    assert 'title' in response['data'][0]['metadata']
+    assert 'description' in response['data'][0]['metadata']
+    assert 'language' in response['data'][0]['metadata']
+    assert 'sourceURL' in response['data'][0]['metadata']
+    assert 'statusCode' in response['data'][0]['metadata']
+    assert 'error' not in response['data'][0]['metadata']
+
+def test_crawl_url_with_options_and_wait_for_completion():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.crawl_url('https://roastmywebsite.ai', {
+        'excludePaths': ['blog/*'],
+        'includePaths': ['/'],
+        'maxDepth': 2,
+        'ignoreSitemap': True,
+        'limit': 10,
+        'allowBackwardLinks': True,
+        'allowExternalLinks': True,
+        'scrapeOptions': {
+            'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
+            'headers': {"x-key": "test"},
+            'includeTags': ['h1'],
+            'excludeTags': ['h2'],
+            'onlyMainContent': True,
+            'waitFor': 1000
+        }
+    }, True, 30)
+    assert response is not None
+    assert 'total' in response
+    assert response['total'] > 0
+    assert 'creditsUsed' in response
+    assert response['creditsUsed'] > 0
+    assert 'expiresAt' in response
+    assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
+    assert 'status' in response
+    assert response['status'] == 'completed'
+    assert 'next' not in response
+    assert len(response['data']) > 0
+    assert 'markdown' in response['data'][0]
+    assert "_Roast_" in response['data'][0]['markdown']
+    assert 'content' not in response['data'][0]
+    assert 'html' in response['data'][0]
+    assert "<h1" in response['data'][0]['html']
+    assert 'rawHtml' in response['data'][0]
+    assert "<h1" in response['data'][0]['rawHtml']
+    assert 'screenshot' in response['data'][0]
+    assert "https://" in response['data'][0]['screenshot']
+    assert 'links' in response['data'][0]
+    assert len(response['data'][0]['links']) > 0
+    assert 'metadata' in response['data'][0]
+    assert 'title' in response['data'][0]['metadata']
+    assert 'description' in response['data'][0]['metadata']
+    assert 'language' in response['data'][0]['metadata']
+    assert 'sourceURL' in response['data'][0]['metadata']
+    assert 'statusCode' in response['data'][0]['metadata']
+    assert 'error' not in response['data'][0]['metadata']
+
+def test_crawl_url_with_idempotency_key_e2e():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    uniqueIdempotencyKey = str(uuid4())
+    response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, False, 2, uniqueIdempotencyKey)
+    assert response is not None
+    assert 'id' in response
+
+    with pytest.raises(Exception) as excinfo:
+        app.crawl_url('https://firecrawl.dev', {'excludePaths': ['blog/*']}, True, 2, uniqueIdempotencyKey)
+    assert "Idempotency key already used" in str(excinfo.value)
+
+def test_check_crawl_status_e2e():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.crawl_url('https://firecrawl.dev', {'scrapeOptions': {'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}}, False)
+    assert response is not None
+    assert 'id' in response
+
+    max_checks = 15
+    checks = 0
+    status_response = app.check_crawl_status(response['id'])
+
+    while status_response['status'] == 'scraping' and checks < max_checks:
+        time.sleep(1) # wait for 1 second
+        assert 'partial_data' not in status_response
+        assert 'current' not in status_response
+        assert 'data' in status_response
+        assert 'total' in status_response
+        assert 'creditsUsed' in status_response
+        assert 'expiresAt' in status_response
+        assert 'status' in status_response
+        assert 'next' in status_response
+        assert status_response['total'] > 0
+        assert status_response['creditsUsed'] > 0
+        assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
+        assert status_response['status'] == 'scraping'
+        assert '/v1/crawl/' in status_response['next']
+        status_response = app.check_crawl_status(response['id'])
+        checks += 1
+
+    assert status_response is not None
+    assert 'total' in status_response
+    assert status_response['total'] > 0
+    assert 'creditsUsed' in status_response
+    assert status_response['creditsUsed'] > 0
+    assert 'expiresAt' in status_response
+    assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
+    assert 'status' in status_response
+    assert status_response['status'] == 'completed'
+    assert len(status_response['data']) > 0
+    assert 'markdown' in status_response['data'][0]
+    assert len(status_response['data'][0]['markdown']) > 10
+    assert 'content' not in status_response['data'][0]
+    assert 'html' in status_response['data'][0]
+    assert "<div" in status_response['data'][0]['html']
+    assert 'rawHtml' in status_response['data'][0]
+    assert "<div" in status_response['data'][0]['rawHtml']
+    assert 'screenshot' in status_response['data'][0]
+    assert "https://" in status_response['data'][0]['screenshot']
+    assert 'links' in status_response['data'][0]
+    assert status_response['data'][0]['links'] is not None
+    assert len(status_response['data'][0]['links']) > 0
+    assert 'metadata' in status_response['data'][0]
+    assert 'title' in status_response['data'][0]['metadata']
+    assert 'description' in status_response['data'][0]['metadata']
+    assert 'language' in status_response['data'][0]['metadata']
+    assert 'sourceURL' in status_response['data'][0]['metadata']
+    assert 'statusCode' in status_response['data'][0]['metadata']
+    assert 'error' not in status_response['data'][0]['metadata']
+
+def test_invalid_api_key_on_map():
+    invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
+    with pytest.raises(Exception) as excinfo:
+        invalid_app.map_url('https://roastmywebsite.ai')
+    assert "Unauthorized: Invalid token" in str(excinfo.value)
+
+def test_blocklisted_url_on_map():
+    app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
+    blocklisted_url = "https://facebook.com/fake-test"
+    with pytest.raises(Exception) as excinfo:
+        app.map_url(blocklisted_url)
+    assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
+
+def test_successful_response_with_valid_preview_token_on_map():
+    app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL)
+    response = app.map_url('https://roastmywebsite.ai')
+    assert response is not None
+    assert len(response) > 0
+
+def test_successful_response_for_valid_map():
+    app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
+    response = app.map_url('https://roastmywebsite.ai')
+    assert response is not None
+    assert len(response) > 0
+    assert any("https://" in link for link in response)
+    filtered_links = [link for link in response if "roastmywebsite.ai" in link]
+    assert len(filtered_links) > 0
+
+def test_search_e2e():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    with pytest.raises(NotImplementedError) as excinfo:
+        app.search("test query")
+    assert "Search is not supported in v1" in str(excinfo.value)
+
+# def test_llm_extraction():
+#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#     response = app.scrape_url("https://mendable.ai", {
+#         'extractorOptions': {
+#             'mode': 'llm-extraction',
+#             'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+#             'extractionSchema': {
+#                 'type': 'object',
+#                 'properties': {
+#                     'company_mission': {'type': 'string'},
+#                     'supports_sso': {'type': 'boolean'},
+#                     'is_open_source': {'type': 'boolean'}
+#                 },
+#                 'required': ['company_mission', 'supports_sso', 'is_open_source']
+#             }
+#         }
+#     })
+#     assert response is not None
+#     assert 'llm_extraction' in response
+#     llm_extraction = response['llm_extraction']
+#     assert 'company_mission' in llm_extraction
+#     assert isinstance(llm_extraction['supports_sso'], bool)
+#     assert isinstance(llm_extraction['is_open_source'], bool)
+
firecrawl/firecrawl.py CHANGED
@@ -12,31 +12,30 @@ Classes:
 import logging
 import os
 import time
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, List
+import asyncio
+import json
 
 import requests
+import websockets
 
 logger : logging.Logger = logging.getLogger("firecrawl")
 
 class FirecrawlApp:
-    """
-    Initialize the FirecrawlApp instance.
-
-    Args:
-        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
-        api_url (Optional[str]): Base URL for the Firecrawl API.
-    """
     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
-        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
-        if self.api_key is None:
-            logger.warning("No API key provided")
-            raise ValueError('No API key provided')
-        else:
-            logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
-
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
-        if self.api_url != 'https://api.firecrawl.dev':
-            logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
+        """
+        Initialize the FirecrawlApp instance with API key, API URL.
+
+        Args:
+            api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+            api_url (Optional[str]): Base URL for the Firecrawl API.
+        """
+        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+        if self.api_key is None:
+            logger.warning("No API key provided")
+            raise ValueError('No API key provided')
+        logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}")
 
     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         """
@@ -75,9 +74,11 @@ class FirecrawlApp:
             for key, value in params.items():
                 if key != 'extractorOptions':
                     scrape_params[key] = value
+
+        endpoint = f'/v1/scrape'
         # Make the POST request with the prepared headers and JSON data
         response = requests.post(
-            f'{self.api_url}/v0/scrape',
+            f'{self.api_url}{endpoint}',
            headers=headers,
            json=scrape_params,
        )
@@ -102,32 +103,14 @@ class FirecrawlApp:
            Any: The search results if the request is successful.
 
        Raises:
+            NotImplementedError: If the search request is attempted on API version v1.
            Exception: If the search request fails.
        """
-        headers = self._prepare_headers()
-        json_data = {'query': query}
-        if params:
-            json_data.update(params)
-        response = requests.post(
-            f'{self.api_url}/v0/search',
-            headers=headers,
-            json=json_data
-        )
-        if response.status_code == 200:
-            response = response.json()
-
-            if response['success'] and 'data' in response:
-                return response['data']
-            else:
-                raise Exception(f'Failed to search. Error: {response["error"]}')
-
-        else:
-            self._handle_error(response, 'search')
+        raise NotImplementedError("Search is not supported in v1.")
 
    def crawl_url(self, url: str,
                  params: Optional[Dict[str, Any]] = None,
-                  wait_until_done: bool = True,
-                  poll_interval: int = 2,
+                  poll_interval: Optional[int] = 2,
                  idempotency_key: Optional[str] = None) -> Any:
        """
        Initiate a crawl job for the specified URL using the Firecrawl API.
@@ -135,8 +118,7 @@ class FirecrawlApp:
        Args:
            url (str): The URL to crawl.
            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
-            wait_until_done (bool): Whether to wait until the crawl job is completed.
-            poll_interval (int): Time in seconds between status checks when waiting for job completion.
+            poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
 
        Returns:
@@ -145,26 +127,49 @@ class FirecrawlApp:
        Raises:
            Exception: If the crawl job initiation or monitoring fails.
        """
+        endpoint = f'/v1/crawl'
        headers = self._prepare_headers(idempotency_key)
        json_data = {'url': url}
        if params:
            json_data.update(params)
-        response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            job_id = response.json().get('jobId')
-            if wait_until_done:
-                return self._monitor_job_status(job_id, headers, poll_interval)
-            else:
-                return {'jobId': job_id}
+            id = response.json().get('id')
+            return self._monitor_job_status(id, headers, poll_interval)
+
        else:
            self._handle_error(response, 'start crawl job')
 
-    def check_crawl_status(self, job_id: str) -> Any:
+
+    def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Initiate a crawl job asynchronously.
+
+        Args:
+            url (str): The URL to crawl.
+            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
+            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, Any]: The response from the crawl initiation request.
+        """
+        endpoint = f'/v1/crawl'
+        headers = self._prepare_headers(idempotency_key)
+        json_data = {'url': url}
+        if params:
+            json_data.update(params)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
+        if response.status_code == 200:
+            return response.json()
+        else:
+            self._handle_error(response, 'start crawl job')
+
+    def check_crawl_status(self, id: str) -> Any:
        """
        Check the status of a crawl job using the Firecrawl API.
 
        Args:
-            job_id (str): The ID of the crawl job.
+            id (str): The ID of the crawl job.
 
        Returns:
            Any: The status of the crawl job.
@@ -172,13 +177,79 @@ class FirecrawlApp:
        Raises:
            Exception: If the status check request fails.
        """
+        endpoint = f'/v1/crawl/{id}'
+
        headers = self._prepare_headers()
-        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
+        response = self._get_request(f'{self.api_url}{endpoint}', headers)
        if response.status_code == 200:
-            return response.json()
+            data = response.json()
+            return {
+                'success': True,
+                'status': data.get('status'),
+                'total': data.get('total'),
+                'completed': data.get('completed'),
+                'creditsUsed': data.get('creditsUsed'),
+                'expiresAt': data.get('expiresAt'),
+                'next': data.get('next'),
+                'data': data.get('data'),
+                'error': data.get('error')
+            }
        else:
            self._handle_error(response, 'check crawl status')
 
+    def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
+        """
+        Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.
+
+        Args:
+            url (str): The URL to crawl.
+            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
+            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
+
+        Returns:
+            CrawlWatcher: An instance of CrawlWatcher to monitor the crawl job.
+        """
+        crawl_response = self.async_crawl_url(url, params, idempotency_key)
+        if crawl_response['success'] and 'id' in crawl_response:
+            return CrawlWatcher(crawl_response['id'], self)
+        else:
+            raise Exception("Crawl job failed to start")
+
+    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+        """
+        Perform a map search using the Firecrawl API.
+
+        Args:
+            url (str): The URL to perform the map search on.
+            params (Optional[Dict[str, Any]]): Additional parameters for the map search.
+
+        Returns:
+            Any: The result of the map search, typically a dictionary containing mapping data.
+        """
+        endpoint = f'/v1/map'
+        headers = self._prepare_headers()
+
+        # Prepare the base scrape parameters with the URL
+        json_data = {'url': url}
+        if params:
+            json_data.update(params)
+
+        # Make the POST request with the prepared headers and JSON data
+        response = requests.post(
+            f'{self.api_url}{endpoint}',
+            headers=headers,
+            json=json_data,
+        )
+        if response.status_code == 200:
+            response = response.json()
+            print(response)
+            if response['success'] and 'links' in response:
+                return response['links']
+            else:
+                raise Exception(f'Failed to map URL. Error: {response["error"]}')
+        else:
+            self._handle_error(response, 'map')
+
    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """
        Prepare the headers for API requests.
@@ -257,15 +328,14 @@ class FirecrawlApp:
                return response
        return response
 
-    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
+    def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int) -> Any:
        """
        Monitor the status of a crawl job until completion.
 
        Args:
-            job_id (str): The ID of the crawl job.
+            id (str): The ID of the crawl job.
            headers (Dict[str, str]): The headers to include in the status check requests.
            poll_interval (int): Seconds between status checks.
-
        Returns:
            Any: The crawl results if the job is completed successfully.
 
@@ -273,15 +343,17 @@ class FirecrawlApp:
            Exception: If the job fails or an error occurs during status checks.
        """
        while True:
-            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
+            api_url = f'{self.api_url}/v1/crawl/{id}'
+
+            status_response = self._get_request(api_url, headers)
            if status_response.status_code == 200:
                status_data = status_response.json()
                if status_data['status'] == 'completed':
                    if 'data' in status_data:
-                        return status_data['data']
+                        return status_data
                    else:
                        raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
+                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
                    poll_interval=max(poll_interval,2)
                    time.sleep(poll_interval) # Wait for the specified interval before checking again
                else:
@@ -300,19 +372,66 @@ class FirecrawlApp:
        Raises:
            Exception: An exception with a message containing the status code and error details from the response.
        """
-        error_message = response.json().get('error', 'No additional error details provided.')
+        error_message = response.json().get('error', 'No error message provided.')
+        error_details = response.json().get('details', 'No additional error details provided.')
 
        if response.status_code == 402:
-            message = f"Payment Required: Failed to {action}. {error_message}"
+            message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
        elif response.status_code == 408:
-            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message}"
+            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
        elif response.status_code == 409:
-            message = f"Conflict: Failed to {action} due to a conflict. {error_message}"
+            message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
        elif response.status_code == 500:
-            message = f"Internal Server Error: Failed to {action}. {error_message}"
+            message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
        else:
-            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message}"
+            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
 
        # Raise an HTTPError with the custom message and attach the response
        raise requests.exceptions.HTTPError(message, response=response)
-
+
+class CrawlWatcher:
+    def __init__(self, id: str, app: FirecrawlApp):
+        self.id = id
+        self.app = app
+        self.data: List[Dict[str, Any]] = []
+        self.status = "scraping"
+        self.ws_url = f"{app.api_url.replace('http', 'ws')}/v1/crawl/{id}"
+        self.event_handlers = {
+            'done': [],
+            'error': [],
+            'document': []
+        }
+
+    async def connect(self):
+        async with websockets.connect(self.ws_url, extra_headers={"Authorization": f"Bearer {self.app.api_key}"}) as websocket:
+            await self._listen(websocket)
+
+    async def _listen(self, websocket):
+        async for message in websocket:
+            msg = json.loads(message)
+            await self._handle_message(msg)
+
+    def add_event_listener(self, event_type: str, handler):
+        if event_type in self.event_handlers:
+            self.event_handlers[event_type].append(handler)
+
+    def dispatch_event(self, event_type: str, detail: Dict[str, Any]):
+        if event_type in self.event_handlers:
+            for handler in self.event_handlers[event_type]:
+                handler(detail)
+
+    async def _handle_message(self, msg: Dict[str, Any]):
+        if msg['type'] == 'done':
+            self.status = 'completed'
+            self.dispatch_event('done', {'status': self.status, 'data': self.data})
+        elif msg['type'] == 'error':
+            self.status = 'failed'
+            self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']})
+        elif msg['type'] == 'catchup':
+            self.status = msg['data']['status']
+            self.data.extend(msg['data'].get('data', []))
+            for doc in self.data:
+                self.dispatch_event('document', doc)
+        elif msg['type'] == 'document':
+            self.data.append(msg['data'])
+            self.dispatch_event('document', msg['data'])
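Taken together, the firecrawl.py changes swap the v0 job-ID workflow ('jobId', '/v0/crawl/status/...') for the v1 status-object workflow ('id', '/v1/crawl/{id}') and add map_url. The following is a minimal sketch of the new surface as it appears in this diff, not an excerpt from the package; the API key and URLs are placeholders, and it assumes a reachable Firecrawl API:

```python
# Sketch of the v1 client surface introduced in this diff.
# "fc-YOUR_API_KEY" and example.com are placeholders.
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

# v1 scraping takes flat options with 'formats' (v0 used 'pageOptions').
scrape = app.scrape_url("https://example.com", {"formats": ["markdown", "html"]})
print(scrape["markdown"])

# crawl_url now always polls to completion and returns the whole status
# object ('status', 'total', 'creditsUsed', 'data', ...), not a bare list.
crawl = app.crawl_url("https://example.com", {"excludePaths": ["blog/*"]}, poll_interval=2)
print(crawl["status"], len(crawl["data"]))

# Fire-and-forget: async_crawl_url returns {'success': ..., 'id': ...};
# poll it yourself with check_crawl_status.
job = app.async_crawl_url("https://example.com")
status = app.check_crawl_status(job["id"])

# New in v1: map_url returns the list of discovered links.
links = app.map_url("https://example.com")
```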
firecrawl-1.1.1.dist-info/LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Sideguide Technologies Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
firecrawl-0.0.19.dist-info/METADATA → firecrawl-1.1.1.dist-info/METADATA CHANGED
@@ -1,12 +1,12 @@
 Metadata-Version: 2.1
 Name: firecrawl
-Version: 0.0.19
+Version: 1.1.1
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai
 Author-email: "Mendable.ai" <nick@mendable.ai>
 Maintainer-email: "Mendable.ai" <nick@mendable.ai>
-License: GNU General Public License v3 (GPLv3)
+License: GNU Affero General Public License v3 (AGPLv3)
 Project-URL: Documentation, https://docs.firecrawl.dev
 Project-URL: Source, https://github.com/mendableai/firecrawl
 Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
@@ -30,9 +30,14 @@ Classifier: Topic :: Software Development :: Libraries
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Text Processing
 Classifier: Topic :: Text Processing :: Indexing
-Requires-Python: >=3.7
+Requires-Python: >=3.8
 Description-Content-Type: text/markdown
+License-File: LICENSE
 Requires-Dist: requests
+Requires-Dist: python-dotenv
+Requires-Dist: websockets
+Requires-Dist: asyncio
+Requires-Dist: nest-asyncio
 
 # Firecrawl Python SDK
 
@@ -51,27 +56,31 @@ pip install firecrawl-py
 1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
 
-
 Here's an example of how to use the SDK:
 
 ```python
-from firecrawl import FirecrawlApp
-
-# Initialize the FirecrawlApp with your API key
-app = FirecrawlApp(api_key='your_api_key')
-
-# Scrape a single URL
-url = 'https://mendable.ai'
-scraped_data = app.scrape_url(url)
-
-# Crawl a website
-crawl_url = 'https://mendable.ai'
-params = {
-    'pageOptions': {
-        'onlyMainContent': True
-    }
-}
-crawl_result = app.crawl_url(crawl_url, params=params)
+from firecrawl.firecrawl import FirecrawlApp
+
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+
+# Scrape a website:
+scrape_status = app.scrape_url(
+    'https://firecrawl.dev',
+    params={'formats': ['markdown', 'html']}
+)
+print(scrape_status)
+
+# Crawl a website:
+crawl_status = app.crawl_url(
+    'https://firecrawl.dev',
+    params={
+        'limit': 100,
+        'scrapeOptions': {'formats': ['markdown', 'html']}
+    },
+    poll_interval=30
+)
+print(crawl_status)
 ```
 
 ### Scraping a URL
@@ -82,6 +91,7 @@ To scrape a single URL, use the `scrape_url` method. It takes the URL as a param
 url = 'https://example.com'
 scraped_data = app.scrape_url(url)
 ```
+
 ### Extracting structured data from a URL
 
 With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how to use it:
@@ -89,7 +99,7 @@ With LLM extraction, you can easily extract structured data from any URL. We sup
 ```python
 class ArticleSchema(BaseModel):
     title: str
-    points: int 
+    points: int
     by: str
     commentsURL: str
 
@@ -108,45 +118,77 @@ data = app.scrape_url('https://news.ycombinator.com', {
 print(data["llm_extraction"])
 ```
 
-### Search for a query
+### Crawling a Website
 
-Used to search the web, get the most relevant results, scrap each page and return the markdown.
+To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
 
 ```python
-query = 'what is mendable?'
-search_result = app.search(query)
+idempotency_key = str(uuid.uuid4()) # optional idempotency key
+crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
+print(crawl_result)
 ```
 
-### Crawling a Website
-
-To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+### Asynchronous Crawl a Website
 
-The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
+To crawl a website asynchronously, use the `async_crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
 
 ```python
-crawl_url = 'https://example.com'
-params = {
-    'crawlerOptions': {
-        'excludes': ['blog/*'],
-        'includes': [], # leave empty for all pages
-        'limit': 1000,
-    },
-    'pageOptions': {
-        'onlyMainContent': True
-    }
-}
-crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
+crawl_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
+print(crawl_result)
 ```
 
-If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
-
 ### Checking Crawl Status
 
 To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
 
 ```python
-job_id = crawl_result['jobId']
-status = app.check_crawl_status(job_id)
+id = crawl_result['id']
+status = app.check_crawl_status(id)
+```
+
+### Map a Website
+
+Use `map_url` to generate a list of URLs from a website. The `params` argument lets you customize the mapping process, including options to exclude subdomains or to utilize the sitemap.
+
+```python
+# Map a website:
+map_result = app.map_url('https://example.com')
+print(map_result)
+```
+
+### Crawl a website with WebSockets
+
+To crawl a website with WebSockets, use the `crawl_url_and_watch` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+
+```python
+# inside an async function...
+nest_asyncio.apply()
+
+# Define event handlers
+def on_document(detail):
+    print("DOC", detail)
+
+def on_error(detail):
+    print("ERR", detail['error'])
+
+def on_done(detail):
+    print("DONE", detail['status'])
+
+# Function to start the crawl and watch process
+async def start_crawl_and_watch():
+    # Initiate the crawl job and get the watcher
+    watcher = app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 })
+
+    # Add event listeners
+    watcher.add_event_listener("document", on_document)
+    watcher.add_event_listener("error", on_error)
+    watcher.add_event_listener("done", on_done)
+
+    # Start the watcher
+    await watcher.connect()
+
+# Run the event loop
+await start_crawl_and_watch()
 ```
 
 ## Error Handling
@@ -162,20 +204,27 @@ To ensure the functionality of the Firecrawl Python SDK, we have included end-to
 To run the tests, execute the following commands:
 
 Install pytest:
+
 ```bash
 pip install pytest
 ```
 
 Run:
+
 ```bash
 pytest firecrawl/__tests__/e2e_withAuth/test.py
 ```
 
-
 ## Contributing
 
 Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
 
 ## License
 
-The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
+The Firecrawl Python SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions:
+
+- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details.
firecrawl-1.1.1.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+firecrawl/__init__.py,sha256=j1M2OH1eJLLBEooOLOVrfgqyDyuOqSHdSRbxKE6FH2E,1682
+firecrawl/firecrawl.py,sha256=XIO5Pyw2w9i32f-fIjO5hgqLvPV7ZSi66pR0JQjFzVI,17713
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=L-umFR3WyrJso1EwqkxjbTMr5AEI4t5zDfhQcCzitOI,7911
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=KQMmGAtJAIafja6EGtJ-W9162w2Hm6PNjqKl3_RQXLA,16456
+firecrawl-1.1.1.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-1.1.1.dist-info/METADATA,sha256=I7nFWbZ4zD9GRoeQDtGfYVEnGEKNPT7ltkA0D4gUH0g,8297
+firecrawl-1.1.1.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-1.1.1.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
+firecrawl-1.1.1.dist-info/RECORD,,
firecrawl-0.0.19.dist-info/WHEEL → firecrawl-1.1.1.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (70.2.0)
+Generator: bdist_wheel (0.38.4)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
firecrawl-0.0.19.dist-info/RECORD REMOVED
@@ -1,8 +0,0 @@
-firecrawl/__init__.py,sha256=ixdimo55Kkf1ccxo31pemLUm16hBOG5GvgEen9S3qNc,1683
-firecrawl/firecrawl.py,sha256=wfy2KXSrLoGlj8HKEP3VKG_u38kRbUbcmRQsMgza5I0,12767
-firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-firecrawl/__tests__/e2e_withAuth/test.py,sha256=vZlhdURkMXY1-2_FBrKwT6D4WbH7HEf_xMEd0X63krQ,7665
-firecrawl-0.0.19.dist-info/METADATA,sha256=M92W_xzUtd6k5-789ti_yyfC88ZXZAdXq7Beqqx6iqc,6264
-firecrawl-0.0.19.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
-firecrawl-0.0.19.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
-firecrawl-0.0.19.dist-info/RECORD,,