firecrawl 1.7.1 (py3-none-any.whl) → 1.9.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of firecrawl has been flagged as potentially problematic; consult the package registry's advisory page for this release for further details.

firecrawl/__init__.py CHANGED
@@ -13,7 +13,7 @@ import os
13
13
 
14
14
  from .firecrawl import FirecrawlApp # noqa
15
15
 
16
- __version__ = "1.7.1"
16
+ __version__ = "1.9.0"
17
17
 
18
18
  # Define the logger for the Firecrawl project
19
19
  logger: logging.Logger = logging.getLogger("firecrawl")
@@ -371,4 +371,70 @@ def test_search_e2e():
371
371
  # assert isinstance(llm_extraction['supports_sso'], bool)
372
372
  # assert isinstance(llm_extraction['is_open_source'], bool)
373
373
 
374
+ def test_search_with_string_query():
375
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
376
+ response = app.search("firecrawl")
377
+ assert response["success"] is True
378
+ assert len(response["data"]) > 0
379
+ assert response["data"][0]["markdown"] is not None
380
+ assert response["data"][0]["metadata"] is not None
381
+ assert response["data"][0]["metadata"]["title"] is not None
382
+ assert response["data"][0]["metadata"]["description"] is not None
383
+
384
+ def test_search_with_params_dict():
385
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
386
+ response = app.search("firecrawl", {
387
+ "limit": 3,
388
+ "lang": "en",
389
+ "country": "us",
390
+ "scrapeOptions": {
391
+ "formats": ["markdown", "html", "links"],
392
+ "onlyMainContent": True
393
+ }
394
+ })
395
+ assert response["success"] is True
396
+ assert len(response["data"]) <= 3
397
+ for doc in response["data"]:
398
+ assert doc["markdown"] is not None
399
+ assert doc["html"] is not None
400
+ assert doc["links"] is not None
401
+ assert doc["metadata"] is not None
402
+ assert doc["metadata"]["title"] is not None
403
+ assert doc["metadata"]["description"] is not None
404
+
405
+ def test_search_with_params_object():
406
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
407
+ params = SearchParams(
408
+ query="firecrawl",
409
+ limit=3,
410
+ lang="en",
411
+ country="us",
412
+ scrapeOptions={
413
+ "formats": ["markdown", "html", "links"],
414
+ "onlyMainContent": True
415
+ }
416
+ )
417
+ response = app.search(params.query, params)
418
+ assert response["success"] is True
419
+ assert len(response["data"]) <= 3
420
+ for doc in response["data"]:
421
+ assert doc["markdown"] is not None
422
+ assert doc["html"] is not None
423
+ assert doc["links"] is not None
424
+ assert doc["metadata"] is not None
425
+ assert doc["metadata"]["title"] is not None
426
+ assert doc["metadata"]["description"] is not None
427
+
428
+ def test_search_invalid_api_key():
429
+ app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
430
+ with pytest.raises(Exception) as e:
431
+ app.search("test query")
432
+ assert "404" in str(e.value)
433
+
434
+ def test_search_with_invalid_params():
435
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
436
+ with pytest.raises(Exception) as e:
437
+ app.search("test query", {"invalid_param": "value"})
438
+ assert "ValidationError" in str(e.value)
439
+
374
440
 
firecrawl/firecrawl.py CHANGED
@@ -21,7 +21,28 @@ import websockets
21
21
 
22
22
  logger : logging.Logger = logging.getLogger("firecrawl")
23
23
 
24
+ class SearchParams(pydantic.BaseModel):
25
+ query: str
26
+ limit: Optional[int] = 5
27
+ tbs: Optional[str] = None
28
+ filter: Optional[str] = None
29
+ lang: Optional[str] = "en"
30
+ country: Optional[str] = "us"
31
+ location: Optional[str] = None
32
+ origin: Optional[str] = "api"
33
+ timeout: Optional[int] = 60000
34
+ scrapeOptions: Optional[Dict[str, Any]] = None
35
+
24
36
  class FirecrawlApp:
37
+ class SearchResponse(pydantic.BaseModel):
38
+ """
39
+ Response from the search operation.
40
+ """
41
+ success: bool
42
+ data: List[Dict[str, Any]]
43
+ warning: Optional[str] = None
44
+ error: Optional[str] = None
45
+
25
46
  class ExtractParams(pydantic.BaseModel):
26
47
  """
27
48
  Parameters for the extract operation.
@@ -109,22 +130,36 @@ class FirecrawlApp:
109
130
  else:
110
131
  self._handle_error(response, 'scrape URL')
111
132
 
112
- def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
133
+ def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]:
113
134
  """
114
- Perform a search using the Firecrawl API.
135
+ Search for content using the Firecrawl API.
115
136
 
116
137
  Args:
117
- query (str): The search query.
118
- params (Optional[Dict[str, Any]]): Additional parameters for the search request.
138
+ query (str): The search query string.
139
+ params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters.
119
140
 
120
141
  Returns:
121
- Any: The search results if the request is successful.
122
-
123
- Raises:
124
- NotImplementedError: If the search request is attempted on API version v1.
125
- Exception: If the search request fails.
142
+ Dict[str, Any]: The search response containing success status and search results.
126
143
  """
127
- raise NotImplementedError("Search is not supported in v1.")
144
+ if params is None:
145
+ params = {}
146
+
147
+ if isinstance(params, dict):
148
+ search_params = SearchParams(query=query, **params)
149
+ else:
150
+ search_params = params
151
+ search_params.query = query
152
+
153
+ response = requests.post(
154
+ f"{self.api_url}/v1/search",
155
+ headers={"Authorization": f"Bearer {self.api_key}"},
156
+ json=search_params.dict(exclude_none=True)
157
+ )
158
+
159
+ if response.status_code != 200:
160
+ raise Exception(f"Request failed with status code {response.status_code}")
161
+
162
+ return response.json()
128
163
 
129
164
  def crawl_url(self, url: str,
130
165
  params: Optional[Dict[str, Any]] = None,
@@ -215,6 +250,8 @@ class FirecrawlApp:
215
250
  if 'data' in status_data:
216
251
  data = status_data['data']
217
252
  while 'next' in status_data:
253
+ if len(status_data['data']) == 0:
254
+ break
218
255
  next_url = status_data.get('next')
219
256
  if not next_url:
220
257
  logger.warning("Expected 'next' URL is missing.")
@@ -231,17 +268,25 @@ class FirecrawlApp:
231
268
  logger.error(f"Error during pagination request: {e}")
232
269
  break
233
270
  status_data['data'] = data
234
-
235
- return {
236
- 'success': True,
271
+
272
+ response = {
237
273
  'status': status_data.get('status'),
238
274
  'total': status_data.get('total'),
239
275
  'completed': status_data.get('completed'),
240
276
  'creditsUsed': status_data.get('creditsUsed'),
241
277
  'expiresAt': status_data.get('expiresAt'),
242
- 'data': status_data.get('data'),
243
- 'error': status_data.get('error'),
244
- 'next': status_data.get('next', None)
278
+ 'data': status_data.get('data')
279
+ }
280
+
281
+ if 'error' in status_data:
282
+ response['error'] = status_data['error']
283
+
284
+ if 'next' in status_data:
285
+ response['next'] = status_data['next']
286
+
287
+ return {
288
+ 'success': False if 'error' in status_data else True,
289
+ **response
245
290
  }
246
291
  else:
247
292
  self._handle_error(response, 'check crawl status')
@@ -424,6 +469,8 @@ class FirecrawlApp:
424
469
  if 'data' in status_data:
425
470
  data = status_data['data']
426
471
  while 'next' in status_data:
472
+ if len(status_data['data']) == 0:
473
+ break
427
474
  next_url = status_data.get('next')
428
475
  if not next_url:
429
476
  logger.warning("Expected 'next' URL is missing.")
@@ -441,16 +488,24 @@ class FirecrawlApp:
441
488
  break
442
489
  status_data['data'] = data
443
490
 
444
- return {
445
- 'success': True,
491
+ response = {
446
492
  'status': status_data.get('status'),
447
493
  'total': status_data.get('total'),
448
494
  'completed': status_data.get('completed'),
449
495
  'creditsUsed': status_data.get('creditsUsed'),
450
496
  'expiresAt': status_data.get('expiresAt'),
451
- 'data': status_data.get('data'),
452
- 'error': status_data.get('error'),
453
- 'next': status_data.get('next', None)
497
+ 'data': status_data.get('data')
498
+ }
499
+
500
+ if 'error' in status_data:
501
+ response['error'] = status_data['error']
502
+
503
+ if 'next' in status_data:
504
+ response['next'] = status_data['next']
505
+
506
+ return {
507
+ 'success': False if 'error' in status_data else True,
508
+ **response
454
509
  }
455
510
  else:
456
511
  self._handle_error(response, 'check batch scrape status')
@@ -483,10 +538,12 @@ class FirecrawlApp:
483
538
  request_data = {
484
539
  **jsonData,
485
540
  'allowExternalLinks': params.get('allow_external_links', False),
486
- 'schema': schema
541
+ 'schema': schema,
542
+ 'origin': 'api-sdk'
487
543
  }
488
544
 
489
545
  try:
546
+ # Send the initial extract request
490
547
  response = self._post_request(
491
548
  f'{self.api_url}/v1/extract',
492
549
  request_data,
@@ -495,7 +552,29 @@ class FirecrawlApp:
495
552
  if response.status_code == 200:
496
553
  data = response.json()
497
554
  if data['success']:
498
- return data
555
+ job_id = data.get('id')
556
+ if not job_id:
557
+ raise Exception('Job ID not returned from extract request.')
558
+
559
+ # Poll for the extract status
560
+ while True:
561
+ status_response = self._get_request(
562
+ f'{self.api_url}/v1/extract/{job_id}',
563
+ headers
564
+ )
565
+ if status_response.status_code == 200:
566
+ status_data = status_response.json()
567
+ if status_data['status'] == 'completed':
568
+ if status_data['success']:
569
+ return status_data
570
+ else:
571
+ raise Exception(f'Failed to extract. Error: {status_data["error"]}')
572
+ elif status_data['status'] in ['failed', 'cancelled']:
573
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
574
+ else:
575
+ self._handle_error(status_response, "extract-status")
576
+
577
+ time.sleep(2) # Polling interval
499
578
  else:
500
579
  raise Exception(f'Failed to extract. Error: {data["error"]}')
501
580
  else:
@@ -504,6 +583,69 @@ class FirecrawlApp:
504
583
  raise ValueError(str(e), 500)
505
584
 
506
585
  return {'success': False, 'error': "Internal server error."}
586
+
587
+ def get_extract_status(self, job_id: str) -> Dict[str, Any]:
588
+ """
589
+ Retrieve the status of an extract job.
590
+
591
+ Args:
592
+ job_id (str): The ID of the extract job.
593
+
594
+ Returns:
595
+ Dict[str, Any]: The status of the extract job.
596
+
597
+ Raises:
598
+ ValueError: If there is an error retrieving the status.
599
+ """
600
+ headers = self._prepare_headers()
601
+ try:
602
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
603
+ if response.status_code == 200:
604
+ return response.json()
605
+ else:
606
+ self._handle_error(response, "get extract status")
607
+ except Exception as e:
608
+ raise ValueError(str(e), 500)
609
+
610
+ def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
611
+ """
612
+ Initiate an asynchronous extract job.
613
+
614
+ Args:
615
+ urls (List[str]): The URLs to extract data from.
616
+ params (Optional[Dict[str, Any]]): Additional parameters for the extract request.
617
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
618
+
619
+ Returns:
620
+ Dict[str, Any]: The response from the extract operation.
621
+
622
+ Raises:
623
+ ValueError: If there is an error initiating the extract job.
624
+ """
625
+ headers = self._prepare_headers(idempotency_key)
626
+
627
+ schema = params.get('schema') if params else None
628
+ if schema:
629
+ if hasattr(schema, 'model_json_schema'):
630
+ # Convert Pydantic model to JSON schema
631
+ schema = schema.model_json_schema()
632
+ # Otherwise assume it's already a JSON schema dict
633
+
634
+ jsonData = {'urls': urls, **(params or {})}
635
+ request_data = {
636
+ **jsonData,
637
+ 'allowExternalLinks': params.get('allow_external_links', False) if params else False,
638
+ 'schema': schema
639
+ }
640
+
641
+ try:
642
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
643
+ if response.status_code == 200:
644
+ return response.json()
645
+ else:
646
+ self._handle_error(response, "async extract")
647
+ except Exception as e:
648
+ raise ValueError(str(e), 500)
507
649
 
508
650
  def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
509
651
  """
@@ -634,6 +776,8 @@ class FirecrawlApp:
634
776
  if 'data' in status_data:
635
777
  data = status_data['data']
636
778
  while 'next' in status_data:
779
+ if len(status_data['data']) == 0:
780
+ break
637
781
  status_response = self._get_request(status_data['next'], headers)
638
782
  status_data = status_response.json()
639
783
  data.extend(status_data.get('data', []))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: firecrawl
3
- Version: 1.7.1
3
+ Version: 1.9.0
4
4
  Summary: Python SDK for Firecrawl API
5
5
  Home-page: https://github.com/mendableai/firecrawl
6
6
  Author: Mendable.ai
@@ -0,0 +1,11 @@
1
+ firecrawl/__init__.py,sha256=5ubhPauv4SGtK_XBudjfb2AgdfGzSMetytrO2nb9QII,2543
2
+ firecrawl/firecrawl.py,sha256=VuSKgvzxF3G-1MWK7INR1NBae3jYx6kES-kDyqkqD40,35962
3
+ firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ firecrawl/__tests__/e2e_withAuth/test.py,sha256=6OawnVF4IPeGyXg_Izi3t8U7MyT90roaJBJIG5UfllM,7935
5
+ firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=tL5kJJ4el37Wc-Z2TRSuSWwWG2M40h3VPxHYuWijD00,19888
7
+ firecrawl-1.9.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
8
+ firecrawl-1.9.0.dist-info/METADATA,sha256=Hz7bNsZqTKMLpZ-wP3myJXRSM1MOUbCoouQy9DIk78c,10631
9
+ firecrawl-1.9.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
10
+ firecrawl-1.9.0.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
11
+ firecrawl-1.9.0.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- firecrawl/__init__.py,sha256=tZdRrVCtEZ7K0rU3ivzNX_EfEElo3y5zBhk9Had3W3o,2543
2
- firecrawl/firecrawl.py,sha256=MFbF6gxXFQXe2kKeEE-PoQZnAbR00Ip163GD9MZh9ZM,30578
3
- firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- firecrawl/__tests__/e2e_withAuth/test.py,sha256=6OawnVF4IPeGyXg_Izi3t8U7MyT90roaJBJIG5UfllM,7935
5
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=Qad0xRPboRdlH6Q5o2985b4xjpjw2jr9LCik-GbXaZ0,17470
7
- firecrawl-1.7.1.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
8
- firecrawl-1.7.1.dist-info/METADATA,sha256=AleNQvsesEq0Uzt1R51p6mqWc43O5JQi3B_YYG6xr84,10631
9
- firecrawl-1.7.1.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
10
- firecrawl-1.7.1.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
11
- firecrawl-1.7.1.dist-info/RECORD,,