firecrawl-py 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
mendable/__init__.py,sha256=Z0tfQTnh-Rr7V-_3yjYlgxt3cREhhqV8s1LVXSmNCuo,31
|
|
2
|
+
mendable/firecrawl.py,sha256=ReCvqTWgxHEtgYakT8nOkWYv1KsEiLzkSSQ9-3HyZc8,3362
|
|
3
|
+
firecrawl_py-0.0.1.dist-info/METADATA,sha256=qGKynQcGLplfRP6XN5IUweLZp9E4N1M7cTx0edBef38,224
|
|
4
|
+
firecrawl_py-0.0.1.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
|
|
5
|
+
firecrawl_py-0.0.1.dist-info/top_level.txt,sha256=HV-vVOHTsAD2b_EZNXwuUyiCCD4Ex74C2SPkTf1h2eA,9
|
|
6
|
+
firecrawl_py-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
mendable
|
mendable/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Re-export the client class defined in firecrawl.py. That module defines
# FireCrawl (there is no ChatApp in it), so importing ChatApp made
# `import mendable` fail with ImportError.
from .firecrawl import FireCrawl
|
mendable/firecrawl.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import requests
|
|
3
|
+
|
|
4
|
+
class FireCrawl:
    """Small HTTP client for the Firecrawl v0 scrape and crawl endpoints.

    Every request is sent with a JSON content type and a bearer token built
    from ``api_key``. Failures are reported by raising ``Exception`` with a
    message that contains the HTTP status code.
    """

    # Root of the v0 API; all endpoint URLs below are derived from it.
    API_BASE_URL = 'https://api.firecrawl.dev/v0'

    # Status codes for which the error message includes the 'error' field
    # read from the JSON response body.
    _DETAILED_ERROR_CODES = (402, 409, 500)

    # Crawl job states that mean "not finished yet, poll again".
    _IN_PROGRESS_STATUSES = ('active', 'paused', 'pending', 'queued')

    # Fallback text used when the response carries no usable 'error' field.
    _UNKNOWN_ERROR = 'Unknown error occurred'

    def __init__(self, api_key=None):
        """Store the API key, falling back to ``FIRECRAWL_API_KEY``.

        Args:
            api_key: Key to use. When falsy, the ``FIRECRAWL_API_KEY``
                environment variable is read instead.

        Raises:
            ValueError: If no non-empty key is available from either source.
        """
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        # Reject empty strings too: an environment variable that is set but
        # empty would otherwise yield a useless "Bearer " header.
        if not self.api_key:
            raise ValueError('No API key provided')

    def scrape_url(self, url, params=None):
        """POST ``url`` to the scrape endpoint and return the decoded JSON.

        Args:
            url: Address to scrape.
            params: Optional mapping of extra fields merged into the request
                body after ``url`` (so a ``'url'`` key in it takes
                precedence). ``None`` means no extra fields.

        Returns:
            The decoded JSON body of a 200 response.

        Raises:
            Exception: For any non-200 response.
        """
        response = self._post_request(
            f'{self.API_BASE_URL}/scrape',
            {'url': url, **(params or {})},
            self._prepare_headers(),
        )
        if response.status_code == 200:
            return response.json()
        if response.status_code in self._DETAILED_ERROR_CODES:
            # Always raises; produces the same message scrape_url used to
            # build inline for these status codes.
            self._handle_error(response, 'scrape URL')
        raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')

    def crawl_url(self, url, params=None, poll_interval=2):
        """Start a crawl job for ``url`` and block until it completes.

        Args:
            url: Address to crawl.
            params: Optional mapping of extra fields merged into the request
                body after ``url``. ``None`` means no extra fields.
            poll_interval: Seconds to sleep between status checks.

        Returns:
            The decoded status payload once its ``'status'`` is
            ``'completed'``.

        Raises:
            Exception: If the job cannot be started, no job id is returned,
                a status check fails, or the job ends in any state other
                than ``'completed'``.
        """
        headers = self._prepare_headers()
        response = self._post_request(
            f'{self.API_BASE_URL}/crawl',
            {'url': url, **(params or {})},
            headers,
        )
        if response.status_code != 200:
            self._handle_error(response, 'start crawl job')
        job_id = response.json().get('jobId')
        if not job_id:
            # Without an id the status URL would end in "None" and polling
            # could never succeed.
            raise Exception('Failed to start crawl job. No job ID was returned.')
        return self._monitor_job_status(job_id, headers, poll_interval)

    def check_crawl_status(self, job_id):
        """Fetch the status payload of a crawl job once, without polling.

        Args:
            job_id: Identifier inserted into the status URL.

        Returns:
            The decoded JSON body of a 200 response.

        Raises:
            Exception: For any non-200 response.
        """
        response = self._get_request(
            f'{self.API_BASE_URL}/crawl/status/{job_id}',
            self._prepare_headers(),
        )
        if response.status_code != 200:
            self._handle_error(response, 'check crawl status')
        return response.json()

    def _prepare_headers(self):
        """Return the JSON content-type and bearer-token request headers."""
        return {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}',
        }

    def _post_request(self, url, data, headers):
        """POST ``data`` as JSON to ``url`` and return the response."""
        return requests.post(url, headers=headers, json=data)

    def _get_request(self, url, headers):
        """GET ``url`` and return the response."""
        return requests.get(url, headers=headers)

    def _monitor_job_status(self, job_id, headers, poll_interval=2):
        """Poll the status endpoint until the job leaves its in-progress states.

        Args:
            job_id: Identifier inserted into the status URL.
            headers: Request headers reused for every poll.
            poll_interval: Seconds to sleep between polls.

        Returns:
            The decoded status payload whose ``'status'`` is ``'completed'``.

        Raises:
            Exception: If a poll returns a non-200 response, or the reported
                status is neither ``'completed'`` nor an in-progress state
                (including a payload with no ``'status'`` key).
        """
        # Imported locally, as the original code did, so the block does not
        # depend on a module-level import of time.
        import time

        status_url = f'{self.API_BASE_URL}/crawl/status/{job_id}'
        # NOTE(review): there is no overall deadline; a job that stays in an
        # in-progress state keeps this loop running indefinitely.
        while True:
            status_response = self._get_request(status_url, headers)
            if status_response.status_code != 200:
                self._handle_error(status_response, 'check crawl status')
            status_data = status_response.json()
            # .get() so a payload without 'status' is reported through the
            # failure branch below instead of surfacing as a KeyError.
            status = status_data.get('status')
            if status == 'completed':
                return status_data
            if status not in self._IN_PROGRESS_STATUSES:
                raise Exception(f'Crawl job failed or was stopped. Status: {status}')
            time.sleep(poll_interval)

    def _error_message(self, response):
        """Return the body's ``'error'`` field, or a generic fallback.

        The fallback is used when the body is not valid JSON, is not a JSON
        object, or has no ``'error'`` key.
        """
        try:
            payload = response.json()
        except ValueError:
            # JSON decoding errors subclass ValueError; a non-JSON error body
            # must not hide the HTTP status the caller is about to report.
            return self._UNKNOWN_ERROR
        if isinstance(payload, dict):
            return payload.get('error', self._UNKNOWN_ERROR)
        return self._UNKNOWN_ERROR

    def _handle_error(self, response, action):
        """Raise an ``Exception`` describing a failed request; never returns.

        Args:
            response: The non-successful HTTP response.
            action: Short description of the attempted operation, inserted
                into the message.

        Raises:
            Exception: Always. For 402, 409 and 500 the message includes the
                error text taken from the response body.
        """
        if response.status_code in self._DETAILED_ERROR_CODES:
            error_message = self._error_message(response)
            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
        raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
|