firecrawl 1.6.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl might be problematic.

firecrawl/__init__.py CHANGED
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.6.0"
+__version__ = "1.10.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/__tests__/e2e_withAuth/test.py CHANGED
@@ -29,12 +29,12 @@ def test_scrape_url_invalid_api_key():
         invalid_app.scrape_url('https://firecrawl.dev')
     assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
-def test_blocklisted_url():
-    blocklisted_url = "https://facebook.com/fake-test"
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    with pytest.raises(Exception) as excinfo:
-        app.scrape_url(blocklisted_url)
-    assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
+# def test_blocklisted_url():
+#     blocklisted_url = "https://facebook.com/fake-test"
+#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
+#     with pytest.raises(Exception) as excinfo:
+#         app.scrape_url(blocklisted_url)
+#     assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
 def test_successful_response_with_valid_preview_token():
     app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
@@ -90,12 +90,12 @@ def test_crawl_url_invalid_api_key():
         invalid_app.crawl_url('https://firecrawl.dev')
     assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
-def test_should_return_error_for_blocklisted_url():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    blocklisted_url = "https://twitter.com/fake-test"
-    with pytest.raises(Exception) as excinfo:
-        app.crawl_url(blocklisted_url)
-    assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
+# def test_should_return_error_for_blocklisted_url():
+#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
+#     blocklisted_url = "https://twitter.com/fake-test"
+#     with pytest.raises(Exception) as excinfo:
+#         app.crawl_url(blocklisted_url)
+#     assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
 def test_crawl_url_wait_for_completion_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
firecrawl/__tests__/v1/e2e_withAuth/test.py CHANGED
@@ -8,7 +8,7 @@ from datetime import datetime
 
 load_dotenv()
 
-API_URL = "http://127.0.0.1:3002";
+API_URL = os.getenv('API_URL', 'http://127.0.0.1:3002')
 ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
 TEST_API_KEY = os.getenv('TEST_API_KEY')
 
@@ -20,22 +20,33 @@ spec.loader.exec_module(firecrawl)
 FirecrawlApp = firecrawl.FirecrawlApp
 
 def test_no_api_key():
-    with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL)
-    assert "No API key provided" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        with pytest.raises(Exception) as excinfo:
+            invalid_app = FirecrawlApp(api_url=API_URL)
+        assert "No API key provided" in str(excinfo.value)
+    else:
+        # Should not raise error for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        assert app is not None
 
 def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.scrape_url('https://firecrawl.dev')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.scrape_url('https://firecrawl.dev')
+        assert response is not None
 
-def test_blocklisted_url():
-    blocklisted_url = "https://facebook.com/fake-test"
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
-    with pytest.raises(Exception) as excinfo:
-        app.scrape_url(blocklisted_url)
-    assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
+# def test_blocklisted_url():
+#     blocklisted_url = "https://facebook.com/fake-test"
+#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#     with pytest.raises(Exception) as excinfo:
+#         app.scrape_url(blocklisted_url)
+#     assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
 
 def test_successful_response_with_valid_preview_token():
     app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@@ -131,17 +142,23 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
     assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
 
 def test_crawl_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.crawl_url('https://firecrawl.dev')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.crawl_url('https://firecrawl.dev')
+        assert response is not None
 
-def test_should_return_error_for_blocklisted_url():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
-    blocklisted_url = "https://twitter.com/fake-test"
-    with pytest.raises(Exception) as excinfo:
-        app.crawl_url(blocklisted_url)
-    assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
+# def test_should_return_error_for_blocklisted_url():
+#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#     blocklisted_url = "https://twitter.com/fake-test"
+#     with pytest.raises(Exception) as excinfo:
+#         app.crawl_url(blocklisted_url)
+#     assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
 
 def test_crawl_url_wait_for_completion_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -291,17 +308,23 @@ def test_check_crawl_status_e2e():
     assert 'error' not in status_response['data'][0]['metadata']
 
 def test_invalid_api_key_on_map():
-    invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.map_url('https://roastmywebsite.ai')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.map_url('https://roastmywebsite.ai')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.map_url('https://roastmywebsite.ai')
+        assert response is not None
 
-def test_blocklisted_url_on_map():
-    app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
-    blocklisted_url = "https://facebook.com/fake-test"
-    with pytest.raises(Exception) as excinfo:
-        app.map_url(blocklisted_url)
-    assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
+# def test_blocklisted_url_on_map():
+#     app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
+#     blocklisted_url = "https://facebook.com/fake-test"
+#     with pytest.raises(Exception) as excinfo:
+#         app.map_url(blocklisted_url)
+#     assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
 
 def test_successful_response_with_valid_preview_token_on_map():
     app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL)
@@ -348,5 +371,70 @@ def test_search_e2e():
     # assert isinstance(llm_extraction['supports_sso'], bool)
     # assert isinstance(llm_extraction['is_open_source'], bool)
 
+def test_search_with_string_query():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.search("firecrawl")
+    assert response["success"] is True
+    assert len(response["data"]) > 0
+    assert response["data"][0]["markdown"] is not None
+    assert response["data"][0]["metadata"] is not None
+    assert response["data"][0]["metadata"]["title"] is not None
+    assert response["data"][0]["metadata"]["description"] is not None
+
+def test_search_with_params_dict():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.search("firecrawl", {
+        "limit": 3,
+        "lang": "en",
+        "country": "us",
+        "scrapeOptions": {
+            "formats": ["markdown", "html", "links"],
+            "onlyMainContent": True
+        }
+    })
+    assert response["success"] is True
+    assert len(response["data"]) <= 3
+    for doc in response["data"]:
+        assert doc["markdown"] is not None
+        assert doc["html"] is not None
+        assert doc["links"] is not None
+        assert doc["metadata"] is not None
+        assert doc["metadata"]["title"] is not None
+        assert doc["metadata"]["description"] is not None
+
+def test_search_with_params_object():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    params = SearchParams(
+        query="firecrawl",
+        limit=3,
+        lang="en",
+        country="us",
+        scrapeOptions={
+            "formats": ["markdown", "html", "links"],
+            "onlyMainContent": True
+        }
+    )
+    response = app.search(params.query, params)
+    assert response["success"] is True
+    assert len(response["data"]) <= 3
+    for doc in response["data"]:
+        assert doc["markdown"] is not None
+        assert doc["html"] is not None
+        assert doc["links"] is not None
+        assert doc["metadata"] is not None
+        assert doc["metadata"]["title"] is not None
+        assert doc["metadata"]["description"] is not None
+
+def test_search_invalid_api_key():
+    app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    with pytest.raises(Exception) as e:
+        app.search("test query")
+    assert "404" in str(e.value)
+
+def test_search_with_invalid_params():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    with pytest.raises(Exception) as e:
+        app.search("test query", {"invalid_param": "value"})
+    assert "ValidationError" in str(e.value)
+
 
-
firecrawl/firecrawl.py CHANGED
@@ -21,13 +21,34 @@ import websockets
 
 logger : logging.Logger = logging.getLogger("firecrawl")
 
+class SearchParams(pydantic.BaseModel):
+    query: str
+    limit: Optional[int] = 5
+    tbs: Optional[str] = None
+    filter: Optional[str] = None
+    lang: Optional[str] = "en"
+    country: Optional[str] = "us"
+    location: Optional[str] = None
+    origin: Optional[str] = "api"
+    timeout: Optional[int] = 60000
+    scrapeOptions: Optional[Dict[str, Any]] = None
+
 class FirecrawlApp:
+    class SearchResponse(pydantic.BaseModel):
+        """
+        Response from the search operation.
+        """
+        success: bool
+        data: List[Dict[str, Any]]
+        warning: Optional[str] = None
+        error: Optional[str] = None
+
     class ExtractParams(pydantic.BaseModel):
         """
         Parameters for the extract operation.
         """
-        prompt: str
-        schema: Optional[Any] = None
+        prompt: Optional[str] = None
+        schema_: Optional[Any] = pydantic.Field(None, alias='schema')
         system_prompt: Optional[str] = None
         allow_external_links: Optional[bool] = False
 
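The new module-level SearchParams model gives search options pydantic validation before anything goes over the wire. A minimal sketch of building a payload with it (the import path is an assumption based on this file layout):

    from firecrawl.firecrawl import SearchParams

    # Defaults mirror the model above: limit=5, lang="en", country="us", origin="api".
    params = SearchParams(query="firecrawl", limit=3)
    payload = params.dict(exclude_none=True)  # the dict that search() posts to /v1/search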
@@ -39,27 +60,23 @@ class FirecrawlApp:
        data: Optional[Any] = None
        error: Optional[str] = None
 
-    class ErrorResponse(pydantic.BaseModel):
-        """
-        Error response.
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         """
-        success: bool
-        error: str
+        Initialize the FirecrawlApp instance with API key, API URL.
 
-    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
-        """
-        Initialize the FirecrawlApp instance with API key, API URL.
-
-        Args:
-            api_key (Optional[str]): API key for authenticating with the Firecrawl API.
-            api_url (Optional[str]): Base URL for the Firecrawl API.
-        """
-        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
-        if self.api_key is None:
-            logger.warning("No API key provided")
-            raise ValueError('No API key provided')
-        logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}")
+        Args:
+            api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+            api_url (Optional[str]): Base URL for the Firecrawl API.
+        """
+        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+
+        # Only require API key when using cloud service
+        if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
+            logger.warning("No API key provided for cloud service")
+            raise ValueError('No API key provided')
+
+        logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
 
     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         """
@@ -95,6 +112,18 @@ class FirecrawlApp:
                 if key not in ['extract']:
                     scrape_params[key] = value
 
+            json = params.get("jsonOptions", {})
+            if json:
+                if 'schema' in json and hasattr(json['schema'], 'schema'):
+                    json['schema'] = json['schema'].schema()
+                scrape_params['jsonOptions'] = json
+
+            # Include any other params directly at the top level of scrape_params
+            for key, value in params.items():
+                if key not in ['jsonOptions']:
+                    scrape_params[key] = value
+
+
        endpoint = f'/v1/scrape'
        # Make the POST request with the prepared headers and JSON data
        response = requests.post(
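scrape_url now forwards jsonOptions, converting a schema object into a plain JSON schema via its .schema() method when one is passed. A hedged sketch of a caller relying on that conversion (the model is hypothetical, and the service may also require a matching format option that this hunk does not show):

    from pydantic import BaseModel

    class ArticleSchema(BaseModel):  # hypothetical example model
        title: str
        author: str

    app = FirecrawlApp(api_key="YOUR-API-KEY")
    # The hasattr check above detects .schema() and serializes the model for us.
    data = app.scrape_url("https://firecrawl.dev", {
        "jsonOptions": {"schema": ArticleSchema},
    })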
@@ -103,7 +132,10 @@ class FirecrawlApp:
             json=scrape_params,
         )
         if response.status_code == 200:
-            response = response.json()
+            try:
+                response = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             if response['success'] and 'data' in response:
                 return response['data']
             elif "error" in response:
@@ -113,22 +145,39 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'scrape URL')
 
-    def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
+    def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]:
         """
-        Perform a search using the Firecrawl API.
+        Search for content using the Firecrawl API.
 
         Args:
-            query (str): The search query.
-            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
+            query (str): The search query string.
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters.
 
         Returns:
-            Any: The search results if the request is successful.
-
-        Raises:
-            NotImplementedError: If the search request is attempted on API version v1.
-            Exception: If the search request fails.
+            Dict[str, Any]: The search response containing success status and search results.
         """
-        raise NotImplementedError("Search is not supported in v1.")
+        if params is None:
+            params = {}
+
+        if isinstance(params, dict):
+            search_params = SearchParams(query=query, **params)
+        else:
+            search_params = params
+            search_params.query = query
+
+        response = requests.post(
+            f"{self.api_url}/v1/search",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json=search_params.dict(exclude_none=True)
+        )
+
+        if response.status_code != 200:
+            raise Exception(f"Request failed with status code {response.status_code}")
+
+        try:
+            return response.json()
+        except:
+            raise Exception(f'Failed to parse Firecrawl response as JSON.')
 
     def crawl_url(self, url: str,
                   params: Optional[Dict[str, Any]] = None,
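search() is live again on v1 (it previously raised NotImplementedError). A usage sketch mirroring the new e2e tests:

    app = FirecrawlApp(api_key="YOUR-API-KEY")

    # Plain string query; SearchParams fills in defaults such as limit=5.
    results = app.search("firecrawl")

    # Dict params are validated through SearchParams before the POST to /v1/search.
    results = app.search("firecrawl", {
        "limit": 3,
        "scrapeOptions": {"formats": ["markdown", "links"], "onlyMainContent": True},
    })
    for doc in results["data"]:
        print(doc["metadata"]["title"])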
@@ -163,7 +212,10 @@ class FirecrawlApp:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            id = response.json().get('id')
+            try:
+                id = response.json().get('id')
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            return self._monitor_job_status(id, headers, poll_interval)
 
        else:
@@ -192,7 +244,10 @@ class FirecrawlApp:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
            self._handle_error(response, 'start crawl job')
 
@@ -214,11 +269,16 @@ class FirecrawlApp:
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}{endpoint}', headers)
        if response.status_code == 200:
-            status_data = response.json()
+            try:
+                status_data = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            if status_data['status'] == 'completed':
                if 'data' in status_data:
                    data = status_data['data']
                    while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                        next_url = status_data.get('next')
                        if not next_url:
                            logger.warning("Expected 'next' URL is missing.")
@@ -228,28 +288,59 @@ class FirecrawlApp:
                            if status_response.status_code != 200:
                                logger.error(f"Failed to fetch next page: {status_response.status_code}")
                                break
-                            status_data = status_response.json()
-                            data.extend(status_data.get('data', []))
+                            try:
+                                next_data = status_response.json()
+                            except:
+                                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                            data.extend(next_data.get('data', []))
+                            status_data = next_data
                        except Exception as e:
                            logger.error(f"Error during pagination request: {e}")
                            break
-                    status_data.pop('next', None)
                    status_data['data'] = data
-
-            return {
-                'success': True,
+
+            response = {
                'status': status_data.get('status'),
                'total': status_data.get('total'),
                'completed': status_data.get('completed'),
                'creditsUsed': status_data.get('creditsUsed'),
                'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
            }
        else:
            self._handle_error(response, 'check crawl status')
 
+    def check_crawl_errors(self, id: str) -> Dict[str, Any]:
+        """
+        Returns information about crawl errors.
+
+        Args:
+            id (str): The ID of the crawl job.
+
+        Returns:
+            Dict[str, Any]: Information about crawl errors.
+        """
+        headers = self._prepare_headers()
+        response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
+        if response.status_code == 200:
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, "check crawl errors")
+
    def cancel_crawl(self, id: str) -> Dict[str, Any]:
        """
        Cancel an asynchronous crawl job using the Firecrawl API.
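check_crawl_errors pairs with the existing crawl entry points. A sketch (this SDK's async_crawl_url helper is assumed to return a dict containing at least an 'id'; the response shape of the errors endpoint is not shown in this diff):

    app = FirecrawlApp(api_key="YOUR-API-KEY")
    job = app.async_crawl_url("https://firecrawl.dev")
    errors = app.check_crawl_errors(job["id"])  # GET /v1/crawl/{id}/errors
    print(errors)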
@@ -263,7 +354,10 @@ class FirecrawlApp:
        headers = self._prepare_headers()
        response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
        if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
            self._handle_error(response, "cancel crawl job")
 
@@ -311,7 +405,10 @@ class FirecrawlApp:
            json=json_data,
        )
        if response.status_code == 200:
-            response = response.json()
+            try:
+                response = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            if response['success'] and 'links' in response:
                return response
            elif 'error' in response:
@@ -354,7 +451,10 @@ class FirecrawlApp:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            id = response.json().get('id')
+            try:
+                id = response.json().get('id')
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            return self._monitor_job_status(id, headers, poll_interval)
 
        else:
@@ -383,7 +483,10 @@ class FirecrawlApp:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
            self._handle_error(response, 'start batch scrape job')
 
@@ -423,11 +526,16 @@ class FirecrawlApp:
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}{endpoint}', headers)
        if response.status_code == 200:
-            status_data = response.json()
+            try:
+                status_data = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            if status_data['status'] == 'completed':
                if 'data' in status_data:
                    data = status_data['data']
                    while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                        next_url = status_data.get('next')
                        if not next_url:
                            logger.warning("Expected 'next' URL is missing.")
@@ -437,30 +545,60 @@ class FirecrawlApp:
                            if status_response.status_code != 200:
                                logger.error(f"Failed to fetch next page: {status_response.status_code}")
                                break
-                            status_data = status_response.json()
-                            data.extend(status_data.get('data', []))
+                            try:
+                                next_data = status_response.json()
+                            except:
+                                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                            data.extend(next_data.get('data', []))
+                            status_data = next_data
                        except Exception as e:
                            logger.error(f"Error during pagination request: {e}")
                            break
-                    status_data.pop('next', None)
                    status_data['data'] = data
 
-            return {
-                'success': True,
+            response = {
                'status': status_data.get('status'),
                'total': status_data.get('total'),
                'completed': status_data.get('completed'),
                'creditsUsed': status_data.get('creditsUsed'),
                'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
            }
        else:
            self._handle_error(response, 'check batch scrape status')
 
+    def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]:
+        """
+        Returns information about batch scrape errors.
+
+        Args:
+            id (str): The ID of the crawl job.
+
+        Returns:
+            Dict[str, Any]: Information about crawl errors.
+        """
+        headers = self._prepare_headers()
+        response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
+        if response.status_code == 200:
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, "check batch scrape errors")
 
-    def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Union[ExtractResponse, ErrorResponse]:
+    def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any:
        """
        Extracts information from a URL using the Firecrawl API.
 
@@ -473,33 +611,140 @@ class FirecrawlApp:
        """
        headers = self._prepare_headers()
 
-        if not params or not params.get('prompt'):
-            raise ValueError("Prompt is required")
+        if not params or (not params.get('prompt') and not params.get('schema')):
+            raise ValueError("Either prompt or schema is required")
 
-        if not params.get('schema'):
-            raise ValueError("Schema is required for extraction")
+        schema = params.get('schema')
+        if schema:
+            if hasattr(schema, 'model_json_schema'):
+                # Convert Pydantic model to JSON schema
+                schema = schema.model_json_schema()
+            # Otherwise assume it's already a JSON schema dict
 
        jsonData = {'urls': urls, **params}
-        jsonSchema = params['schema'].schema() if hasattr(params['schema'], 'schema') else None
+        request_data = {
+            **jsonData,
+            'allowExternalLinks': params.get('allow_external_links', False),
+            'schema': schema,
+            'origin': 'api-sdk'
+        }
 
        try:
+            # Send the initial extract request
            response = self._post_request(
                f'{self.api_url}/v1/extract',
-                {
-                    **jsonData,
-                    'allowExternalLinks': params.get('allow_external_links', False),
-                    'schema': jsonSchema
-                },
+                request_data,
                headers
            )
            if response.status_code == 200:
-                return response.json()
+                try:
+                    data = response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                if data['success']:
+                    job_id = data.get('id')
+                    if not job_id:
+                        raise Exception('Job ID not returned from extract request.')
+
+                    # Poll for the extract status
+                    while True:
+                        status_response = self._get_request(
+                            f'{self.api_url}/v1/extract/{job_id}',
+                            headers
+                        )
+                        if status_response.status_code == 200:
+                            try:
+                                status_data = status_response.json()
+                            except:
+                                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                            if status_data['status'] == 'completed':
+                                if status_data['success']:
+                                    return status_data
+                                else:
+                                    raise Exception(f'Failed to extract. Error: {status_data["error"]}')
+                            elif status_data['status'] in ['failed', 'cancelled']:
+                                raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
+                        else:
+                            self._handle_error(status_response, "extract-status")
+
+                        time.sleep(2)  # Polling interval
+                else:
+                    raise Exception(f'Failed to extract. Error: {data["error"]}')
            else:
                self._handle_error(response, "extract")
        except Exception as e:
            raise ValueError(str(e), 500)
 
        return {'success': False, 'error': "Internal server error."}
+
+    def get_extract_status(self, job_id: str) -> Dict[str, Any]:
+        """
+        Retrieve the status of an extract job.
+
+        Args:
+            job_id (str): The ID of the extract job.
+
+        Returns:
+            Dict[str, Any]: The status of the extract job.
+
+        Raises:
+            ValueError: If there is an error retrieving the status.
+        """
+        headers = self._prepare_headers()
+        try:
+            response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
+            if response.status_code == 200:
+                try:
+                    return response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
+            else:
+                self._handle_error(response, "get extract status")
+        except Exception as e:
+            raise ValueError(str(e), 500)
+
+    def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Initiate an asynchronous extract job.
+
+        Args:
+            urls (List[str]): The URLs to extract data from.
+            params (Optional[Dict[str, Any]]): Additional parameters for the extract request.
+            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, Any]: The response from the extract operation.
+
+        Raises:
+            ValueError: If there is an error initiating the extract job.
+        """
+        headers = self._prepare_headers(idempotency_key)
+
+        schema = params.get('schema') if params else None
+        if schema:
+            if hasattr(schema, 'model_json_schema'):
+                # Convert Pydantic model to JSON schema
+                schema = schema.model_json_schema()
+            # Otherwise assume it's already a JSON schema dict
+
+        jsonData = {'urls': urls, **(params or {})}
+        request_data = {
+            **jsonData,
+            'allowExternalLinks': params.get('allow_external_links', False) if params else False,
+            'schema': schema
+        }
+
+        try:
+            response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
+            if response.status_code == 200:
+                try:
+                    return response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
+            else:
+                self._handle_error(response, "async extract")
+        except Exception as e:
+            raise ValueError(str(e), 500)
 
    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """
@@ -625,14 +870,22 @@ class FirecrawlApp:
 
            status_response = self._get_request(api_url, headers)
            if status_response.status_code == 200:
-                status_data = status_response.json()
+                try:
+                    status_data = status_response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
                if status_data['status'] == 'completed':
                    if 'data' in status_data:
                        data = status_data['data']
                        while 'next' in status_data:
-                            status_response = self._get_request(status_data['next'], headers)
-                            status_data = status_response.json()
-                            data.extend(status_data['data'])
+                            if len(status_data['data']) == 0:
+                                break
+                            status_response = self._get_request(status_data['next'], headers)
+                            try:
+                                status_data = status_response.json()
+                            except:
+                                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                            data.extend(status_data.get('data', []))
                        status_data['data'] = data
                        return status_data
                    else:
@@ -656,8 +909,12 @@ class FirecrawlApp:
        Raises:
            Exception: An exception with a message containing the status code and error details from the response.
        """
-        error_message = response.json().get('error', 'No error message provided.')
-        error_details = response.json().get('details', 'No additional error details provided.')
+        try:
+            error_message = response.json().get('error', 'No error message provided.')
+            error_details = response.json().get('details', 'No additional error details provided.')
+        except:
+            raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
+
 
        if response.status_code == 402:
            message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
@@ -707,15 +964,15 @@ class CrawlWatcher:
    async def _handle_message(self, msg: Dict[str, Any]):
        if msg['type'] == 'done':
            self.status = 'completed'
-            self.dispatch_event('done', {'status': self.status, 'data': self.data})
+            self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
        elif msg['type'] == 'error':
            self.status = 'failed'
-            self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']})
+            self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
        elif msg['type'] == 'catchup':
            self.status = msg['data']['status']
            self.data.extend(msg['data'].get('data', []))
            for doc in self.data:
-                self.dispatch_event('document', doc)
+                self.dispatch_event('document', {'data': doc, 'id': self.id})
        elif msg['type'] == 'document':
            self.data.append(msg['data'])
-            self.dispatch_event('document', msg['data'])
+            self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
firecrawl-1.6.0.dist-info/METADATA → firecrawl-1.10.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: firecrawl
-Version: 1.6.0
+Version: 1.10.0
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai
@@ -37,6 +37,7 @@ Requires-Dist: requests
 Requires-Dist: python-dotenv
 Requires-Dist: websockets
 Requires-Dist: nest-asyncio
+Requires-Dist: pydantic (>=2.10.3)
 
 # Firecrawl Python SDK
 
firecrawl-1.10.0.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+firecrawl/__init__.py,sha256=jO4L4KZKDbIL-Gef19zkY5xiEFYxuZUBCxM4B-TGaBI,2544
+firecrawl/firecrawl.py,sha256=WpIBgsUTq8KWaZeaiJJnCjEh48-ObOlOOfXBRie1Quc,40493
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=6OawnVF4IPeGyXg_Izi3t8U7MyT90roaJBJIG5UfllM,7935
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=tL5kJJ4el37Wc-Z2TRSuSWwWG2M40h3VPxHYuWijD00,19888
+firecrawl-1.10.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-1.10.0.dist-info/METADATA,sha256=uYVHavePTK87Zo8Haw90Afdpkm4nkFGW5L4SuSP6u9I,10632
+firecrawl-1.10.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-1.10.0.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
+firecrawl-1.10.0.dist-info/RECORD,,
firecrawl-1.6.0.dist-info/RECORD DELETED
@@ -1,11 +0,0 @@
-firecrawl/__init__.py,sha256=9mQfSNKz0VYJilNxhiaYwxWw2gMvUA1Ql2SUnGXCivY,2543
-firecrawl/firecrawl.py,sha256=RcOaoGUs-JWvz2Xy8W5eEizoVZnE3RCbb8P75RAc1JQ,30207
-firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-firecrawl/__tests__/e2e_withAuth/test.py,sha256=L-umFR3WyrJso1EwqkxjbTMr5AEI4t5zDfhQcCzitOI,7911
-firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=KQMmGAtJAIafja6EGtJ-W9162w2Hm6PNjqKl3_RQXLA,16456
-firecrawl-1.6.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
-firecrawl-1.6.0.dist-info/METADATA,sha256=AvmxvRgdpvL-pTdz43kUd1DhgPX4evG1tV6yUJhUda8,10596
-firecrawl-1.6.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
-firecrawl-1.6.0.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
-firecrawl-1.6.0.dist-info/RECORD,,