firecrawl 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl has been flagged as potentially problematic; consult the registry's advisory page for more details.

firecrawl/__init__.py CHANGED
@@ -13,7 +13,7 @@ import os
13
13
 
14
14
  from .firecrawl import FirecrawlApp # noqa
15
15
 
16
- __version__ = "1.8.0"
16
+ __version__ = "1.9.0"
17
17
 
18
18
  # Define the logger for the Firecrawl project
19
19
  logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/firecrawl.py CHANGED
@@ -250,6 +250,8 @@ class FirecrawlApp:
250
250
  if 'data' in status_data:
251
251
  data = status_data['data']
252
252
  while 'next' in status_data:
253
+ if len(status_data['data']) == 0:
254
+ break
253
255
  next_url = status_data.get('next')
254
256
  if not next_url:
255
257
  logger.warning("Expected 'next' URL is missing.")
@@ -266,17 +268,25 @@ class FirecrawlApp:
266
268
  logger.error(f"Error during pagination request: {e}")
267
269
  break
268
270
  status_data['data'] = data
269
-
270
- return {
271
- 'success': True,
271
+
272
+ response = {
272
273
  'status': status_data.get('status'),
273
274
  'total': status_data.get('total'),
274
275
  'completed': status_data.get('completed'),
275
276
  'creditsUsed': status_data.get('creditsUsed'),
276
277
  'expiresAt': status_data.get('expiresAt'),
277
- 'data': status_data.get('data'),
278
- 'error': status_data.get('error'),
279
- 'next': status_data.get('next', None)
278
+ 'data': status_data.get('data')
279
+ }
280
+
281
+ if 'error' in status_data:
282
+ response['error'] = status_data['error']
283
+
284
+ if 'next' in status_data:
285
+ response['next'] = status_data['next']
286
+
287
+ return {
288
+ 'success': False if 'error' in status_data else True,
289
+ **response
280
290
  }
281
291
  else:
282
292
  self._handle_error(response, 'check crawl status')
@@ -459,6 +469,8 @@ class FirecrawlApp:
459
469
  if 'data' in status_data:
460
470
  data = status_data['data']
461
471
  while 'next' in status_data:
472
+ if len(status_data['data']) == 0:
473
+ break
462
474
  next_url = status_data.get('next')
463
475
  if not next_url:
464
476
  logger.warning("Expected 'next' URL is missing.")
@@ -476,16 +488,24 @@ class FirecrawlApp:
476
488
  break
477
489
  status_data['data'] = data
478
490
 
479
- return {
480
- 'success': True,
491
+ response = {
481
492
  'status': status_data.get('status'),
482
493
  'total': status_data.get('total'),
483
494
  'completed': status_data.get('completed'),
484
495
  'creditsUsed': status_data.get('creditsUsed'),
485
496
  'expiresAt': status_data.get('expiresAt'),
486
- 'data': status_data.get('data'),
487
- 'error': status_data.get('error'),
488
- 'next': status_data.get('next', None)
497
+ 'data': status_data.get('data')
498
+ }
499
+
500
+ if 'error' in status_data:
501
+ response['error'] = status_data['error']
502
+
503
+ if 'next' in status_data:
504
+ response['next'] = status_data['next']
505
+
506
+ return {
507
+ 'success': False if 'error' in status_data else True,
508
+ **response
489
509
  }
490
510
  else:
491
511
  self._handle_error(response, 'check batch scrape status')
@@ -518,10 +538,12 @@ class FirecrawlApp:
518
538
  request_data = {
519
539
  **jsonData,
520
540
  'allowExternalLinks': params.get('allow_external_links', False),
521
- 'schema': schema
541
+ 'schema': schema,
542
+ 'origin': 'api-sdk'
522
543
  }
523
544
 
524
545
  try:
546
+ # Send the initial extract request
525
547
  response = self._post_request(
526
548
  f'{self.api_url}/v1/extract',
527
549
  request_data,
@@ -530,7 +552,29 @@ class FirecrawlApp:
530
552
  if response.status_code == 200:
531
553
  data = response.json()
532
554
  if data['success']:
533
- return data
555
+ job_id = data.get('id')
556
+ if not job_id:
557
+ raise Exception('Job ID not returned from extract request.')
558
+
559
+ # Poll for the extract status
560
+ while True:
561
+ status_response = self._get_request(
562
+ f'{self.api_url}/v1/extract/{job_id}',
563
+ headers
564
+ )
565
+ if status_response.status_code == 200:
566
+ status_data = status_response.json()
567
+ if status_data['status'] == 'completed':
568
+ if status_data['success']:
569
+ return status_data
570
+ else:
571
+ raise Exception(f'Failed to extract. Error: {status_data["error"]}')
572
+ elif status_data['status'] in ['failed', 'cancelled']:
573
+ raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
574
+ else:
575
+ self._handle_error(status_response, "extract-status")
576
+
577
+ time.sleep(2) # Polling interval
534
578
  else:
535
579
  raise Exception(f'Failed to extract. Error: {data["error"]}')
536
580
  else:
@@ -539,6 +583,69 @@ class FirecrawlApp:
539
583
  raise ValueError(str(e), 500)
540
584
 
541
585
  return {'success': False, 'error': "Internal server error."}
586
+
587
+ def get_extract_status(self, job_id: str) -> Dict[str, Any]:
588
+ """
589
+ Retrieve the status of an extract job.
590
+
591
+ Args:
592
+ job_id (str): The ID of the extract job.
593
+
594
+ Returns:
595
+ Dict[str, Any]: The status of the extract job.
596
+
597
+ Raises:
598
+ ValueError: If there is an error retrieving the status.
599
+ """
600
+ headers = self._prepare_headers()
601
+ try:
602
+ response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
603
+ if response.status_code == 200:
604
+ return response.json()
605
+ else:
606
+ self._handle_error(response, "get extract status")
607
+ except Exception as e:
608
+ raise ValueError(str(e), 500)
609
+
610
+ def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
611
+ """
612
+ Initiate an asynchronous extract job.
613
+
614
+ Args:
615
+ urls (List[str]): The URLs to extract data from.
616
+ params (Optional[Dict[str, Any]]): Additional parameters for the extract request.
617
+ idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
618
+
619
+ Returns:
620
+ Dict[str, Any]: The response from the extract operation.
621
+
622
+ Raises:
623
+ ValueError: If there is an error initiating the extract job.
624
+ """
625
+ headers = self._prepare_headers(idempotency_key)
626
+
627
+ schema = params.get('schema') if params else None
628
+ if schema:
629
+ if hasattr(schema, 'model_json_schema'):
630
+ # Convert Pydantic model to JSON schema
631
+ schema = schema.model_json_schema()
632
+ # Otherwise assume it's already a JSON schema dict
633
+
634
+ jsonData = {'urls': urls, **(params or {})}
635
+ request_data = {
636
+ **jsonData,
637
+ 'allowExternalLinks': params.get('allow_external_links', False) if params else False,
638
+ 'schema': schema
639
+ }
640
+
641
+ try:
642
+ response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
643
+ if response.status_code == 200:
644
+ return response.json()
645
+ else:
646
+ self._handle_error(response, "async extract")
647
+ except Exception as e:
648
+ raise ValueError(str(e), 500)
542
649
 
543
650
  def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
544
651
  """
@@ -669,6 +776,8 @@ class FirecrawlApp:
669
776
  if 'data' in status_data:
670
777
  data = status_data['data']
671
778
  while 'next' in status_data:
779
+ if len(status_data['data']) == 0:
780
+ break
672
781
  status_response = self._get_request(status_data['next'], headers)
673
782
  status_data = status_response.json()
674
783
  data.extend(status_data.get('data', []))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: firecrawl
3
- Version: 1.8.0
3
+ Version: 1.9.0
4
4
  Summary: Python SDK for Firecrawl API
5
5
  Home-page: https://github.com/mendableai/firecrawl
6
6
  Author: Mendable.ai
@@ -0,0 +1,11 @@
1
+ firecrawl/__init__.py,sha256=5ubhPauv4SGtK_XBudjfb2AgdfGzSMetytrO2nb9QII,2543
2
+ firecrawl/firecrawl.py,sha256=VuSKgvzxF3G-1MWK7INR1NBae3jYx6kES-kDyqkqD40,35962
3
+ firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ firecrawl/__tests__/e2e_withAuth/test.py,sha256=6OawnVF4IPeGyXg_Izi3t8U7MyT90roaJBJIG5UfllM,7935
5
+ firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=tL5kJJ4el37Wc-Z2TRSuSWwWG2M40h3VPxHYuWijD00,19888
7
+ firecrawl-1.9.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
8
+ firecrawl-1.9.0.dist-info/METADATA,sha256=Hz7bNsZqTKMLpZ-wP3myJXRSM1MOUbCoouQy9DIk78c,10631
9
+ firecrawl-1.9.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
10
+ firecrawl-1.9.0.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
11
+ firecrawl-1.9.0.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- firecrawl/__init__.py,sha256=3jDnDwAg-3SU8XRq2E8HWtJ0Umi4PLKGf4JEsR7ESig,2543
2
- firecrawl/firecrawl.py,sha256=0l5WOmiy5OxEwZvgIS0TpsFx39F3X6zADjHMzg6Q8iI,31650
3
- firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- firecrawl/__tests__/e2e_withAuth/test.py,sha256=6OawnVF4IPeGyXg_Izi3t8U7MyT90roaJBJIG5UfllM,7935
5
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=tL5kJJ4el37Wc-Z2TRSuSWwWG2M40h3VPxHYuWijD00,19888
7
- firecrawl-1.8.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
8
- firecrawl-1.8.0.dist-info/METADATA,sha256=FCEt8ZVtXgyaGc2bNXssb2AC4hLDVI-LPoa-Qa_s2cM,10631
9
- firecrawl-1.8.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
10
- firecrawl-1.8.0.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
11
- firecrawl-1.8.0.dist-info/RECORD,,