firecrawl 2.7.1__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl might be problematic. More details were linked from the original page ("Click here"); the link target is not preserved in this text export.

@@ -1,440 +0,0 @@
1
- import importlib.util
2
- import pytest
3
- import time
4
- import os
5
- from uuid import uuid4
6
- from dotenv import load_dotenv
7
- from datetime import datetime
8
-
9
- load_dotenv()
10
-
11
- API_URL = os.getenv('API_URL', 'http://127.0.0.1:3002')
12
- ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
13
- TEST_API_KEY = os.getenv('TEST_API_KEY')
14
-
15
- print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
16
-
17
- spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
18
- firecrawl = importlib.util.module_from_spec(spec)
19
- spec.loader.exec_module(firecrawl)
20
- FirecrawlApp = firecrawl.FirecrawlApp
21
-
22
- def test_no_api_key():
23
- if 'api.firecrawl.dev' in API_URL:
24
- with pytest.raises(Exception) as excinfo:
25
- invalid_app = FirecrawlApp(api_url=API_URL)
26
- assert "No API key provided" in str(excinfo.value)
27
- else:
28
- # Should not raise error for self-hosted
29
- app = FirecrawlApp(api_url=API_URL)
30
- assert app is not None
31
-
32
- def test_scrape_url_invalid_api_key():
33
- if 'api.firecrawl.dev' in API_URL:
34
- invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
35
- with pytest.raises(Exception) as excinfo:
36
- invalid_app.scrape_url('https://firecrawl.dev')
37
- assert "Unauthorized: Invalid token" in str(excinfo.value)
38
- else:
39
- # Should work without API key for self-hosted
40
- app = FirecrawlApp(api_url=API_URL)
41
- response = app.scrape_url('https://firecrawl.dev')
42
- assert response is not None
43
-
44
- # def test_blocklisted_url():
45
- # blocklisted_url = "https://facebook.com/fake-test"
46
- # app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
47
- # with pytest.raises(Exception) as excinfo:
48
- # app.scrape_url(blocklisted_url)
49
- # assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
50
-
51
- def test_successful_response_with_valid_preview_token():
52
- app = FirecrawlApp(api_url=API_URL, api_key=os.getenv('PREVIEW_TOKEN'))
53
- response = app.scrape_url('https://roastmywebsite.ai')
54
- assert response is not None
55
- assert "_Roast_" in response['markdown']
56
- assert "content" not in response
57
- assert "html" not in response
58
- assert "metadata" in response
59
- assert "links" not in response
60
- assert "rawHtml" not in response
61
-
62
- def test_successful_response_for_valid_scrape():
63
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
64
- response = app.scrape_url('https://roastmywebsite.ai')
65
- assert response is not None
66
- assert 'markdown' in response
67
- assert "_Roast_" in response['markdown']
68
- assert 'metadata' in response
69
- assert 'content' not in response
70
- assert 'html' not in response
71
- assert 'rawHtml' not in response
72
- assert 'screenshot' not in response
73
- assert 'links' not in response
74
-
75
- def test_successful_response_with_valid_api_key_and_options():
76
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
77
- params = {
78
- 'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
79
- 'headers': {'x-key': 'test'},
80
- 'includeTags': ['h1'],
81
- 'excludeTags': ['h2'],
82
- 'onlyMainContent': True,
83
- 'timeout': 30000,
84
- 'waitFor': 1000
85
- }
86
- response = app.scrape_url('https://roastmywebsite.ai', params)
87
- assert response is not None
88
- assert 'content' not in response
89
- assert 'markdown' in response
90
- assert 'html' in response
91
- assert 'rawHtml' in response
92
- assert 'screenshot' in response
93
- assert 'links' in response
94
- assert "_Roast_" in response['markdown']
95
- assert "<h1" in response['html']
96
- assert "<h1" in response['rawHtml']
97
- assert "https://" in response['screenshot']
98
- assert len(response['links']) > 0
99
- assert "https://" in response['links'][0]
100
- assert 'metadata' in response
101
- assert 'title' in response['metadata']
102
- assert 'description' in response['metadata']
103
- assert 'keywords' in response['metadata']
104
- assert 'robots' in response['metadata']
105
- assert 'ogTitle' in response['metadata']
106
- assert 'ogDescription' in response['metadata']
107
- assert 'ogUrl' in response['metadata']
108
- assert 'ogImage' in response['metadata']
109
- assert 'ogLocaleAlternate' in response['metadata']
110
- assert 'ogSiteName' in response['metadata']
111
- assert 'sourceURL' in response['metadata']
112
- assert 'statusCode' in response['metadata']
113
- assert 'pageStatusCode' not in response['metadata']
114
- assert 'pageError' not in response['metadata']
115
- assert 'error' not in response['metadata']
116
- assert response['metadata']['title'] == "Roast My Website"
117
- assert response['metadata']['description'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
118
- assert response['metadata']['keywords'] == "Roast My Website,Roast,Website,GitHub,Firecrawl"
119
- assert response['metadata']['robots'] == "follow, index"
120
- assert response['metadata']['ogTitle'] == "Roast My Website"
121
- assert response['metadata']['ogDescription'] == "Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
122
- assert response['metadata']['ogUrl'] == "https://www.roastmywebsite.ai"
123
- assert response['metadata']['ogImage'] == "https://www.roastmywebsite.ai/og.png"
124
- assert response['metadata']['ogLocaleAlternate'] == []
125
- assert response['metadata']['ogSiteName'] == "Roast My Website"
126
- assert response['metadata']['sourceURL'] == "https://roastmywebsite.ai"
127
- assert response['metadata']['statusCode'] == 200
128
-
129
- def test_successful_response_for_valid_scrape_with_pdf_file():
130
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
131
- response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
132
- assert response is not None
133
- assert 'content' not in response
134
- assert 'metadata' in response
135
- assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
136
-
137
- def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
138
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
139
- response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
140
- time.sleep(1) # wait for 1 second
141
- assert response is not None
142
- assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
143
-
144
- def test_crawl_url_invalid_api_key():
145
- if 'api.firecrawl.dev' in API_URL:
146
- invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
147
- with pytest.raises(Exception) as excinfo:
148
- invalid_app.crawl_url('https://firecrawl.dev')
149
- assert "Unauthorized: Invalid token" in str(excinfo.value)
150
- else:
151
- # Should work without API key for self-hosted
152
- app = FirecrawlApp(api_url=API_URL)
153
- response = app.crawl_url('https://firecrawl.dev')
154
- assert response is not None
155
-
156
- # def test_should_return_error_for_blocklisted_url():
157
- # app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
158
- # blocklisted_url = "https://twitter.com/fake-test"
159
- # with pytest.raises(Exception) as excinfo:
160
- # app.crawl_url(blocklisted_url)
161
- # assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
162
-
163
- def test_crawl_url_wait_for_completion_e2e():
164
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
165
- response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, True, 30)
166
- assert response is not None
167
- assert 'total' in response
168
- assert response['total'] > 0
169
- assert 'creditsUsed' in response
170
- assert response['creditsUsed'] > 0
171
- assert 'expiresAt' in response
172
- assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
173
- assert 'status' in response
174
- assert response['status'] == 'completed'
175
- assert 'next' not in response
176
- assert len(response['data']) > 0
177
- assert 'markdown' in response['data'][0]
178
- assert "_Roast_" in response['data'][0]['markdown']
179
- assert 'content' not in response['data'][0]
180
- assert 'html' not in response['data'][0]
181
- assert 'rawHtml' not in response['data'][0]
182
- assert 'screenshot' not in response['data'][0]
183
- assert 'links' not in response['data'][0]
184
- assert 'metadata' in response['data'][0]
185
- assert 'title' in response['data'][0]['metadata']
186
- assert 'description' in response['data'][0]['metadata']
187
- assert 'language' in response['data'][0]['metadata']
188
- assert 'sourceURL' in response['data'][0]['metadata']
189
- assert 'statusCode' in response['data'][0]['metadata']
190
- assert 'error' not in response['data'][0]['metadata']
191
-
192
- def test_crawl_url_with_options_and_wait_for_completion():
193
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
194
- response = app.crawl_url('https://roastmywebsite.ai', {
195
- 'excludePaths': ['blog/*'],
196
- 'includePaths': ['/'],
197
- 'maxDepth': 2,
198
- 'ignoreSitemap': True,
199
- 'limit': 10,
200
- 'allowBackwardLinks': True,
201
- 'allowExternalLinks': True,
202
- 'scrapeOptions': {
203
- 'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
204
- 'headers': {"x-key": "test"},
205
- 'includeTags': ['h1'],
206
- 'excludeTags': ['h2'],
207
- 'onlyMainContent': True,
208
- 'waitFor': 1000
209
- }
210
- }, True, 30)
211
- assert response is not None
212
- assert 'total' in response
213
- assert response['total'] > 0
214
- assert 'creditsUsed' in response
215
- assert response['creditsUsed'] > 0
216
- assert 'expiresAt' in response
217
- assert datetime.strptime(response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
218
- assert 'status' in response
219
- assert response['status'] == 'completed'
220
- assert 'next' not in response
221
- assert len(response['data']) > 0
222
- assert 'markdown' in response['data'][0]
223
- assert "_Roast_" in response['data'][0]['markdown']
224
- assert 'content' not in response['data'][0]
225
- assert 'html' in response['data'][0]
226
- assert "<h1" in response['data'][0]['html']
227
- assert 'rawHtml' in response['data'][0]
228
- assert "<h1" in response['data'][0]['rawHtml']
229
- assert 'screenshot' in response['data'][0]
230
- assert "https://" in response['data'][0]['screenshot']
231
- assert 'links' in response['data'][0]
232
- assert len(response['data'][0]['links']) > 0
233
- assert 'metadata' in response['data'][0]
234
- assert 'title' in response['data'][0]['metadata']
235
- assert 'description' in response['data'][0]['metadata']
236
- assert 'language' in response['data'][0]['metadata']
237
- assert 'sourceURL' in response['data'][0]['metadata']
238
- assert 'statusCode' in response['data'][0]['metadata']
239
- assert 'error' not in response['data'][0]['metadata']
240
-
241
- def test_crawl_url_with_idempotency_key_e2e():
242
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
243
- uniqueIdempotencyKey = str(uuid4())
244
- response = app.crawl_url('https://roastmywebsite.ai', {'excludePaths': ['blog/*']}, False, 2, uniqueIdempotencyKey)
245
- assert response is not None
246
- assert 'id' in response
247
-
248
- with pytest.raises(Exception) as excinfo:
249
- app.crawl_url('https://firecrawl.dev', {'excludePaths': ['blog/*']}, True, 2, uniqueIdempotencyKey)
250
- assert "Idempotency key already used" in str(excinfo.value)
251
-
252
- def test_check_crawl_status_e2e():
253
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
254
- response = app.crawl_url('https://firecrawl.dev', {'scrapeOptions': {'formats': ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}}, False)
255
- assert response is not None
256
- assert 'id' in response
257
-
258
- max_checks = 15
259
- checks = 0
260
- status_response = app.check_crawl_status(response['id'])
261
-
262
- while status_response['status'] == 'scraping' and checks < max_checks:
263
- time.sleep(1) # wait for 1 second
264
- assert 'partial_data' not in status_response
265
- assert 'current' not in status_response
266
- assert 'data' in status_response
267
- assert 'total' in status_response
268
- assert 'creditsUsed' in status_response
269
- assert 'expiresAt' in status_response
270
- assert 'status' in status_response
271
- assert 'next' in status_response
272
- assert status_response['total'] > 0
273
- assert status_response['creditsUsed'] > 0
274
- assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
275
- assert status_response['status'] == 'scraping'
276
- assert '/v1/crawl/' in status_response['next']
277
- status_response = app.check_crawl_status(response['id'])
278
- checks += 1
279
-
280
- assert status_response is not None
281
- assert 'total' in status_response
282
- assert status_response['total'] > 0
283
- assert 'creditsUsed' in status_response
284
- assert status_response['creditsUsed'] > 0
285
- assert 'expiresAt' in status_response
286
- assert datetime.strptime(status_response['expiresAt'], '%Y-%m-%dT%H:%M:%S.%fZ') > datetime.now()
287
- assert 'status' in status_response
288
- assert status_response['status'] == 'completed'
289
- assert len(status_response['data']) > 0
290
- assert 'markdown' in status_response['data'][0]
291
- assert len(status_response['data'][0]['markdown']) > 10
292
- assert 'content' not in status_response['data'][0]
293
- assert 'html' in status_response['data'][0]
294
- assert "<div" in status_response['data'][0]['html']
295
- assert 'rawHtml' in status_response['data'][0]
296
- assert "<div" in status_response['data'][0]['rawHtml']
297
- assert 'screenshot' in status_response['data'][0]
298
- assert "https://" in status_response['data'][0]['screenshot']
299
- assert 'links' in status_response['data'][0]
300
- assert status_response['data'][0]['links'] is not None
301
- assert len(status_response['data'][0]['links']) > 0
302
- assert 'metadata' in status_response['data'][0]
303
- assert 'title' in status_response['data'][0]['metadata']
304
- assert 'description' in status_response['data'][0]['metadata']
305
- assert 'language' in status_response['data'][0]['metadata']
306
- assert 'sourceURL' in status_response['data'][0]['metadata']
307
- assert 'statusCode' in status_response['data'][0]['metadata']
308
- assert 'error' not in status_response['data'][0]['metadata']
309
-
310
- def test_invalid_api_key_on_map():
311
- if 'api.firecrawl.dev' in API_URL:
312
- invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
313
- with pytest.raises(Exception) as excinfo:
314
- invalid_app.map_url('https://roastmywebsite.ai')
315
- assert "Unauthorized: Invalid token" in str(excinfo.value)
316
- else:
317
- # Should work without API key for self-hosted
318
- app = FirecrawlApp(api_url=API_URL)
319
- response = app.map_url('https://roastmywebsite.ai')
320
- assert response is not None
321
-
322
- # def test_blocklisted_url_on_map():
323
- # app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
324
- # blocklisted_url = "https://facebook.com/fake-test"
325
- # with pytest.raises(Exception) as excinfo:
326
- # app.map_url(blocklisted_url)
327
- # assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
328
-
329
- def test_successful_response_with_valid_preview_token_on_map():
330
- app = FirecrawlApp(api_key=os.getenv('PREVIEW_TOKEN'), api_url=API_URL)
331
- response = app.map_url('https://roastmywebsite.ai')
332
- assert response is not None
333
- assert len(response) > 0
334
-
335
- def test_successful_response_for_valid_map():
336
- app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
337
- response = app.map_url('https://roastmywebsite.ai')
338
- assert response is not None
339
- assert len(response) > 0
340
- assert any("https://" in link for link in response)
341
- filtered_links = [link for link in response if "roastmywebsite.ai" in link]
342
- assert len(filtered_links) > 0
343
-
344
- def test_search_e2e():
345
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
346
- with pytest.raises(NotImplementedError) as excinfo:
347
- app.search("test query")
348
- assert "Search is not supported in v1" in str(excinfo.value)
349
-
350
- # def test_llm_extraction():
351
- # app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
352
- # response = app.scrape_url("https://mendable.ai", {
353
- # 'extractorOptions': {
354
- # 'mode': 'llm-extraction',
355
- # 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
356
- # 'extractionSchema': {
357
- # 'type': 'object',
358
- # 'properties': {
359
- # 'company_mission': {'type': 'string'},
360
- # 'supports_sso': {'type': 'boolean'},
361
- # 'is_open_source': {'type': 'boolean'}
362
- # },
363
- # 'required': ['company_mission', 'supports_sso', 'is_open_source']
364
- # }
365
- # }
366
- # })
367
- # assert response is not None
368
- # assert 'llm_extraction' in response
369
- # llm_extraction = response['llm_extraction']
370
- # assert 'company_mission' in llm_extraction
371
- # assert isinstance(llm_extraction['supports_sso'], bool)
372
- # assert isinstance(llm_extraction['is_open_source'], bool)
373
-
374
- def test_search_with_string_query():
375
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
376
- response = app.search("firecrawl")
377
- assert response["success"] is True
378
- assert len(response["data"]) > 0
379
- assert response["data"][0]["markdown"] is not None
380
- assert response["data"][0]["metadata"] is not None
381
- assert response["data"][0]["metadata"]["title"] is not None
382
- assert response["data"][0]["metadata"]["description"] is not None
383
-
384
- def test_search_with_params_dict():
385
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
386
- response = app.search("firecrawl", {
387
- "limit": 3,
388
- "lang": "en",
389
- "country": "us",
390
- "scrapeOptions": {
391
- "formats": ["markdown", "html", "links"],
392
- "onlyMainContent": True
393
- }
394
- })
395
- assert response["success"] is True
396
- assert len(response["data"]) <= 3
397
- for doc in response["data"]:
398
- assert doc["markdown"] is not None
399
- assert doc["html"] is not None
400
- assert doc["links"] is not None
401
- assert doc["metadata"] is not None
402
- assert doc["metadata"]["title"] is not None
403
- assert doc["metadata"]["description"] is not None
404
-
405
- def test_search_with_params_object():
406
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
407
- params = SearchParams(
408
- query="firecrawl",
409
- limit=3,
410
- lang="en",
411
- country="us",
412
- scrapeOptions={
413
- "formats": ["markdown", "html", "links"],
414
- "onlyMainContent": True
415
- }
416
- )
417
- response = app.search(params.query, params)
418
- assert response["success"] is True
419
- assert len(response["data"]) <= 3
420
- for doc in response["data"]:
421
- assert doc["markdown"] is not None
422
- assert doc["html"] is not None
423
- assert doc["links"] is not None
424
- assert doc["metadata"] is not None
425
- assert doc["metadata"]["title"] is not None
426
- assert doc["metadata"]["description"] is not None
427
-
428
- def test_search_invalid_api_key():
429
- app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
430
- with pytest.raises(Exception) as e:
431
- app.search("test query")
432
- assert "404" in str(e.value)
433
-
434
- def test_search_with_invalid_params():
435
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
436
- with pytest.raises(Exception) as e:
437
- app.search("test query", {"invalid_param": "value"})
438
- assert "ValidationError" in str(e.value)
439
-
440
-