PyPI - firecrawl - Versions diffs - 1.8.0__tar.gz → 1.9.0__tar.gz - Mend

firecrawl 1.8.0.tar.gz → 1.9.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of firecrawl might be problematic. Click here for more details.

Files changed (17)

{firecrawl-1.8.0 → firecrawl-1.9.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: firecrawl
-Version: 1.8.0
+Version: 1.9.0
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai

{firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl/__init__.py RENAMED Viewed

@@ -13,7 +13,7 @@ import os
 from .firecrawl import FirecrawlApp # noqa
-__version__ = "1.8.0"
+__version__ = "1.9.0"
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

{firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl/firecrawl.py RENAMED Viewed

@@ -250,6 +250,8 @@ class FirecrawlApp:
                 if 'data' in status_data:
                     data = status_data['data']
                     while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                         next_url = status_data.get('next')
                         if not next_url:
                             logger.warning("Expected 'next' URL is missing.")
@@ -266,17 +268,25 @@ class FirecrawlApp:
                             logger.error(f"Error during pagination request: {e}")
                             break
                     status_data['data'] = data
-            return {
-                'success': True,
+            response = {
                 'status': status_data.get('status'),
                 'total': status_data.get('total'),
                 'completed': status_data.get('completed'),
                 'creditsUsed': status_data.get('creditsUsed'),
                 'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
             }
         else:
             self._handle_error(response, 'check crawl status')
@@ -459,6 +469,8 @@ class FirecrawlApp:
                 if 'data' in status_data:
                     data = status_data['data']
                     while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                         next_url = status_data.get('next')
                         if not next_url:
                             logger.warning("Expected 'next' URL is missing.")
@@ -476,16 +488,24 @@ class FirecrawlApp:
                             break
                     status_data['data'] = data
-            return {
-                'success': True,
+            response = {
                 'status': status_data.get('status'),
                 'total': status_data.get('total'),
                 'completed': status_data.get('completed'),
                 'creditsUsed': status_data.get('creditsUsed'),
                 'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
             }
         else:
             self._handle_error(response, 'check batch scrape status')
@@ -518,10 +538,12 @@ class FirecrawlApp:
         request_data = {
             **jsonData,
             'allowExternalLinks': params.get('allow_external_links', False),
-            'schema': schema
+            'schema': schema,
+            'origin': 'api-sdk'
         }
         try:
+            # Send the initial extract request
             response = self._post_request(
                 f'{self.api_url}/v1/extract',
                 request_data,
@@ -530,7 +552,29 @@ class FirecrawlApp:
             if response.status_code == 200:
                 data = response.json()
                 if data['success']:
-                    return data
+                    job_id = data.get('id')
+                    if not job_id:
+                        raise Exception('Job ID not returned from extract request.')
+                    # Poll for the extract status
+                    while True:
+                        status_response = self._get_request(
+                            f'{self.api_url}/v1/extract/{job_id}',
+                            headers
+                        )
+                        if status_response.status_code == 200:
+                            status_data = status_response.json()
+                            if status_data['status'] == 'completed':
+                                if status_data['success']:
+                                    return status_data
+                                else:
+                                    raise Exception(f'Failed to extract. Error: {status_data["error"]}')
+                            elif status_data['status'] in ['failed', 'cancelled']:
+                                raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
+                        else:
+                            self._handle_error(status_response, "extract-status")
+                        time.sleep(2)  # Polling interval
                 else:
                     raise Exception(f'Failed to extract. Error: {data["error"]}')
             else:
@@ -539,6 +583,69 @@ class FirecrawlApp:
             raise ValueError(str(e), 500)
         return {'success': False, 'error': "Internal server error."}
+    def get_extract_status(self, job_id: str) -> Dict[str, Any]:
+        """
+        Retrieve the status of an extract job.
+        Args:
+            job_id (str): The ID of the extract job.
+        Returns:
+            Dict[str, Any]: The status of the extract job.
+        Raises:
+            ValueError: If there is an error retrieving the status.
+        """
+        headers = self._prepare_headers()
+        try:
+            response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
+            if response.status_code == 200:
+                return response.json()
+            else:
+                self._handle_error(response, "get extract status")
+        except Exception as e:
+            raise ValueError(str(e), 500)
+    def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Initiate an asynchronous extract job.
+        Args:
+            urls (List[str]): The URLs to extract data from.
+            params (Optional[Dict[str, Any]]): Additional parameters for the extract request.
+            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
+        Returns:
+            Dict[str, Any]: The response from the extract operation.
+        Raises:
+            ValueError: If there is an error initiating the extract job.
+        """
+        headers = self._prepare_headers(idempotency_key)
+        schema = params.get('schema') if params else None
+        if schema:
+            if hasattr(schema, 'model_json_schema'):
+                # Convert Pydantic model to JSON schema
+                schema = schema.model_json_schema()
+            # Otherwise assume it's already a JSON schema dict
+        jsonData = {'urls': urls, **(params or {})}
+        request_data = {
+            **jsonData,
+            'allowExternalLinks': params.get('allow_external_links', False) if params else False,
+            'schema': schema
+        }
+        try:
+            response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
+            if response.status_code == 200:
+                return response.json()
+            else:
+                self._handle_error(response, "async extract")
+        except Exception as e:
+            raise ValueError(str(e), 500)
     def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
@@ -669,6 +776,8 @@ class FirecrawlApp:
                     if 'data' in status_data:
                         data = status_data['data']
                         while 'next' in status_data:
+                          if len(status_data['data']) == 0:
+                              break
                           status_response = self._get_request(status_data['next'], headers)
                           status_data = status_response.json()
                           data.extend(status_data.get('data', []))

{firecrawl-1.8.0 → firecrawl-1.9.0}/firecrawl.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: firecrawl
-Version: 1.8.0
+Version: 1.9.0
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai