firecrawl 1.6.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl might be problematic.

firecrawl/__init__.py CHANGED
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.6.0"
+__version__ = "1.10.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/__tests__/e2e_withAuth/test.py CHANGED
@@ -29,12 +29,12 @@ def test_scrape_url_invalid_api_key():
         invalid_app.scrape_url('https://firecrawl.dev')
     assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
-def test_blocklisted_url():
-    blocklisted_url = "https://facebook.com/fake-test"
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    with pytest.raises(Exception) as excinfo:
-        app.scrape_url(blocklisted_url)
-    assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
+# def test_blocklisted_url():
+#     blocklisted_url = "https://facebook.com/fake-test"
+#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
+#     with pytest.raises(Exception) as excinfo:
+#         app.scrape_url(blocklisted_url)
+#     assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
 def test_successful_response_with_valid_preview_token():
     app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
@@ -90,12 +90,12 @@ def test_crawl_url_invalid_api_key():
         invalid_app.crawl_url('https://firecrawl.dev')
     assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
 
-def test_should_return_error_for_blocklisted_url():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    blocklisted_url = "https://twitter.com/fake-test"
-    with pytest.raises(Exception) as excinfo:
-        app.crawl_url(blocklisted_url)
-    assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
+# def test_should_return_error_for_blocklisted_url():
+#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
+#     blocklisted_url = "https://twitter.com/fake-test"
+#     with pytest.raises(Exception) as excinfo:
+#         app.crawl_url(blocklisted_url)
+#     assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
 
 def test_crawl_url_wait_for_completion_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
firecrawl/__tests__/v1/e2e_withAuth/test.py CHANGED
@@ -8,7 +8,7 @@ from datetime import datetime
 
 load_dotenv()
 
-API_URL = "http://127.0.0.1:3002";
+API_URL = os.getenv('API_URL', 'http://127.0.0.1:3002')
 ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
 TEST_API_KEY = os.getenv('TEST_API_KEY')
 
@@ -20,22 +20,33 @@ spec.loader.exec_module(firecrawl)
 FirecrawlApp = firecrawl.FirecrawlApp
 
 def test_no_api_key():
-    with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL)
-    assert "No API key provided" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        with pytest.raises(Exception) as excinfo:
+            invalid_app = FirecrawlApp(api_url=API_URL)
+        assert "No API key provided" in str(excinfo.value)
+    else:
+        # Should not raise error for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        assert app is not None
 
 def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.scrape_url('https://firecrawl.dev')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.scrape_url('https://firecrawl.dev')
+        assert response is not None
 
-def test_blocklisted_url():
-    blocklisted_url = "https://facebook.com/fake-test"
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
-    with pytest.raises(Exception) as excinfo:
-        app.scrape_url(blocklisted_url)
-    assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
+# def test_blocklisted_url():
+#     blocklisted_url = "https://facebook.com/fake-test"
+#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#     with pytest.raises(Exception) as excinfo:
+#         app.scrape_url(blocklisted_url)
+#     assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
 
 def test_successful_response_with_valid_preview_token():
     app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
@@ -131,17 +142,23 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
     assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
 
 def test_crawl_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.crawl_url('https://firecrawl.dev')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.crawl_url('https://firecrawl.dev')
+        assert response is not None
 
-def test_should_return_error_for_blocklisted_url():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
-    blocklisted_url = "https://twitter.com/fake-test"
-    with pytest.raises(Exception) as excinfo:
-        app.crawl_url(blocklisted_url)
-    assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
+# def test_should_return_error_for_blocklisted_url():
+#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#     blocklisted_url = "https://twitter.com/fake-test"
+#     with pytest.raises(Exception) as excinfo:
+#         app.crawl_url(blocklisted_url)
+#     assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
 
 def test_crawl_url_wait_for_completion_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
@@ -291,17 +308,23 @@ def test_check_crawl_status_e2e():
     assert 'error' not in status_response['data'][0]['metadata']
 
 def test_invalid_api_key_on_map():
-    invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.map_url('https://roastmywebsite.ai')
-    assert "Unauthorized: Invalid token" in str(excinfo.value)
+    if 'api.firecrawl.dev' in API_URL:
+        invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
+        with pytest.raises(Exception) as excinfo:
+            invalid_app.map_url('https://roastmywebsite.ai')
+        assert "Unauthorized: Invalid token" in str(excinfo.value)
+    else:
+        # Should work without API key for self-hosted
+        app = FirecrawlApp(api_url=API_URL)
+        response = app.map_url('https://roastmywebsite.ai')
+        assert response is not None
 
-def test_blocklisted_url_on_map():
-    app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
-    blocklisted_url = "https://facebook.com/fake-test"
-    with pytest.raises(Exception) as excinfo:
-        app.map_url(blocklisted_url)
-    assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
+# def test_blocklisted_url_on_map():
+#     app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)
+#     blocklisted_url = "https://facebook.com/fake-test"
+#     with pytest.raises(Exception) as excinfo:
+#         app.map_url(blocklisted_url)
+#     assert "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." in str(excinfo.value)
 
 def test_successful_response_with_valid_preview_token_on_map():
     app = FirecrawlApp(api_key="this_is_just_a_preview_token", api_url=API_URL)
@@ -348,5 +371,70 @@ def test_search_e2e():
     # assert isinstance(llm_extraction['supports_sso'], bool)
     # assert isinstance(llm_extraction['is_open_source'], bool)
 
+def test_search_with_string_query():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.search("firecrawl")
+    assert response["success"] is True
+    assert len(response["data"]) > 0
+    assert response["data"][0]["markdown"] is not None
+    assert response["data"][0]["metadata"] is not None
+    assert response["data"][0]["metadata"]["title"] is not None
+    assert response["data"][0]["metadata"]["description"] is not None
+
+def test_search_with_params_dict():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.search("firecrawl", {
+        "limit": 3,
+        "lang": "en",
+        "country": "us",
+        "scrapeOptions": {
+            "formats": ["markdown", "html", "links"],
+            "onlyMainContent": True
+        }
+    })
+    assert response["success"] is True
+    assert len(response["data"]) <= 3
+    for doc in response["data"]:
+        assert doc["markdown"] is not None
+        assert doc["html"] is not None
+        assert doc["links"] is not None
+        assert doc["metadata"] is not None
+        assert doc["metadata"]["title"] is not None
+        assert doc["metadata"]["description"] is not None
+
+def test_search_with_params_object():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    params = SearchParams(
+        query="firecrawl",
+        limit=3,
+        lang="en",
+        country="us",
+        scrapeOptions={
+            "formats": ["markdown", "html", "links"],
+            "onlyMainContent": True
+        }
+    )
+    response = app.search(params.query, params)
+    assert response["success"] is True
+    assert len(response["data"]) <= 3
+    for doc in response["data"]:
+        assert doc["markdown"] is not None
+        assert doc["html"] is not None
+        assert doc["links"] is not None
+        assert doc["metadata"] is not None
+        assert doc["metadata"]["title"] is not None
+        assert doc["metadata"]["description"] is not None
+
+def test_search_invalid_api_key():
+    app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    with pytest.raises(Exception) as e:
+        app.search("test query")
+    assert "404" in str(e.value)
+
+def test_search_with_invalid_params():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    with pytest.raises(Exception) as e:
+        app.search("test query", {"invalid_param": "value"})
+    assert "ValidationError" in str(e.value)
+
 
-
firecrawl/firecrawl.py CHANGED
@@ -21,13 +21,34 @@ import websockets
 
 logger : logging.Logger = logging.getLogger("firecrawl")
 
+class SearchParams(pydantic.BaseModel):
+    query: str
+    limit: Optional[int] = 5
+    tbs: Optional[str] = None
+    filter: Optional[str] = None
+    lang: Optional[str] = "en"
+    country: Optional[str] = "us"
+    location: Optional[str] = None
+    origin: Optional[str] = "api"
+    timeout: Optional[int] = 60000
+    scrapeOptions: Optional[Dict[str, Any]] = None
+
 class FirecrawlApp:
+    class SearchResponse(pydantic.BaseModel):
+        """
+        Response from the search operation.
+        """
+        success: bool
+        data: List[Dict[str, Any]]
+        warning: Optional[str] = None
+        error: Optional[str] = None
+
     class ExtractParams(pydantic.BaseModel):
         """
         Parameters for the extract operation.
         """
-        prompt: str
-        schema: Optional[Any] = None
+        prompt: Optional[str] = None
+        schema_: Optional[Any] = pydantic.Field(None, alias='schema')
         system_prompt: Optional[str] = None
         allow_external_links: Optional[bool] = False
 
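The new module-level SearchParams model gives search options pydantic validation before anything goes over the wire. A minimal sketch of building a payload with it (the import path is an assumption based on this file layout):

    from firecrawl.firecrawl import SearchParams

    # Defaults mirror the model above: limit=5, lang="en", country="us", origin="api".
    params = SearchParams(query="firecrawl", limit=3)
    payload = params.dict(exclude_none=True)  # the dict that search() posts to /v1/search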
@@ -39,27 +60,23 @@ class FirecrawlApp:
        data: Optional[Any] = None
        error: Optional[str] = None
 
-    class ErrorResponse(pydantic.BaseModel):
-        """
-        Error response.
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         """
-        success: bool
-        error: str
+        Initialize the FirecrawlApp instance with API key, API URL.
 
-    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
-        """
-        Initialize the FirecrawlApp instance with API key, API URL.
-
-        Args:
-            api_key (Optional[str]): API key for authenticating with the Firecrawl API.
-            api_url (Optional[str]): Base URL for the Firecrawl API.
-        """
-        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
-        if self.api_key is None:
-            logger.warning("No API key provided")
-            raise ValueError('No API key provided')
-        logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}")
+        Args:
+            api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+            api_url (Optional[str]): Base URL for the Firecrawl API.
+        """
+        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+
+        # Only require API key when using cloud service
+        if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
+            logger.warning("No API key provided for cloud service")
+            raise ValueError('No API key provided')
+
+        logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
 
     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         """
@@ -95,6 +112,18 @@ class FirecrawlApp:
                 if key not in ['extract']:
                     scrape_params[key] = value
 
+            json = params.get("jsonOptions", {})
+            if json:
+                if 'schema' in json and hasattr(json['schema'], 'schema'):
+                    json['schema'] = json['schema'].schema()
+                scrape_params['jsonOptions'] = json
+
+            # Include any other params directly at the top level of scrape_params
+            for key, value in params.items():
+                if key not in ['jsonOptions']:
+                    scrape_params[key] = value
+
+
        endpoint = f'/v1/scrape'
        # Make the POST request with the prepared headers and JSON data
        response = requests.post(
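scrape_url now forwards jsonOptions, converting a schema object into a plain JSON schema via its .schema() method when one is passed. A hedged sketch of a caller relying on that conversion (the model is hypothetical, and the service may also require a matching format option that this hunk does not show):

    from pydantic import BaseModel

    class ArticleSchema(BaseModel):  # hypothetical example model
        title: str
        author: str

    app = FirecrawlApp(api_key="YOUR-API-KEY")
    # The hasattr check above detects .schema() and serializes the model for us.
    data = app.scrape_url("https://firecrawl.dev", {
        "jsonOptions": {"schema": ArticleSchema},
    })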
@@ -103,7 +132,10 @@ class FirecrawlApp:
             json=scrape_params,
         )
         if response.status_code == 200:
-            response = response.json()
+            try:
+                response = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
             if response['success'] and 'data' in response:
                 return response['data']
             elif "error" in response:
@@ -113,22 +145,39 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'scrape URL')
 
-    def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
+    def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]:
         """
-        Perform a search using the Firecrawl API.
+        Search for content using the Firecrawl API.
 
         Args:
-            query (str): The search query.
-            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
+            query (str): The search query string.
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters.
 
         Returns:
-            Any: The search results if the request is successful.
-
-        Raises:
-            NotImplementedError: If the search request is attempted on API version v1.
-            Exception: If the search request fails.
+            Dict[str, Any]: The search response containing success status and search results.
         """
-        raise NotImplementedError("Search is not supported in v1.")
+        if params is None:
+            params = {}
+
+        if isinstance(params, dict):
+            search_params = SearchParams(query=query, **params)
+        else:
+            search_params = params
+            search_params.query = query
+
+        response = requests.post(
+            f"{self.api_url}/v1/search",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json=search_params.dict(exclude_none=True)
+        )
+
+        if response.status_code != 200:
+            raise Exception(f"Request failed with status code {response.status_code}")
+
+        try:
+            return response.json()
+        except:
+            raise Exception(f'Failed to parse Firecrawl response as JSON.')
 
     def crawl_url(self, url: str,
                   params: Optional[Dict[str, Any]] = None,
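search() is live again on v1 (it previously raised NotImplementedError). A usage sketch mirroring the new e2e tests:

    app = FirecrawlApp(api_key="YOUR-API-KEY")

    # Plain string query; SearchParams fills in defaults such as limit=5.
    results = app.search("firecrawl")

    # Dict params are validated through SearchParams before the POST to /v1/search.
    results = app.search("firecrawl", {
        "limit": 3,
        "scrapeOptions": {"formats": ["markdown", "links"], "onlyMainContent": True},
    })
    for doc in results["data"]:
        print(doc["metadata"]["title"])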
@@ -163,7 +212,10 @@ class FirecrawlApp:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            id = response.json().get('id')
+            try:
+                id = response.json().get('id')
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            return self._monitor_job_status(id, headers, poll_interval)
 
        else:
@@ -192,7 +244,10 @@ class FirecrawlApp:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
            self._handle_error(response, 'start crawl job')
 
@@ -214,11 +269,16 @@ class FirecrawlApp:
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}{endpoint}', headers)
        if response.status_code == 200:
-            status_data = response.json()
+            try:
+                status_data = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            if status_data['status'] == 'completed':
                if 'data' in status_data:
                    data = status_data['data']
                    while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                        next_url = status_data.get('next')
                        if not next_url:
                            logger.warning("Expected 'next' URL is missing.")
@@ -228,28 +288,59 @@ class FirecrawlApp:
                            if status_response.status_code != 200:
                                logger.error(f"Failed to fetch next page: {status_response.status_code}")
                                break
-                            status_data = status_response.json()
-                            data.extend(status_data.get('data', []))
+                            try:
+                                next_data = status_response.json()
+                            except:
+                                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                            data.extend(next_data.get('data', []))
+                            status_data = next_data
                        except Exception as e:
                            logger.error(f"Error during pagination request: {e}")
                            break
-                    status_data.pop('next', None)
                    status_data['data'] = data
-
-            return {
-                'success': True,
+
+            response = {
                'status': status_data.get('status'),
                'total': status_data.get('total'),
                'completed': status_data.get('completed'),
                'creditsUsed': status_data.get('creditsUsed'),
                'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
            }
        else:
            self._handle_error(response, 'check crawl status')
 
+    def check_crawl_errors(self, id: str) -> Dict[str, Any]:
+        """
+        Returns information about crawl errors.
+
+        Args:
+            id (str): The ID of the crawl job.
+
+        Returns:
+            Dict[str, Any]: Information about crawl errors.
+        """
+        headers = self._prepare_headers()
+        response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
+        if response.status_code == 200:
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, "check crawl errors")
+
    def cancel_crawl(self, id: str) -> Dict[str, Any]:
        """
        Cancel an asynchronous crawl job using the Firecrawl API.
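check_crawl_errors pairs with the existing crawl entry points. A sketch (this SDK's async_crawl_url helper is assumed to return a dict containing at least an 'id'; the response shape of the errors endpoint is not shown in this diff):

    app = FirecrawlApp(api_key="YOUR-API-KEY")
    job = app.async_crawl_url("https://firecrawl.dev")
    errors = app.check_crawl_errors(job["id"])  # GET /v1/crawl/{id}/errors
    print(errors)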
@@ -263,7 +354,10 @@ class FirecrawlApp:
        headers = self._prepare_headers()
        response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
        if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
            self._handle_error(response, "cancel crawl job")
 
@@ -311,7 +405,10 @@ class FirecrawlApp:
            json=json_data,
        )
        if response.status_code == 200:
-            response = response.json()
+            try:
+                response = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            if response['success'] and 'links' in response:
                return response
            elif 'error' in response:
@@ -354,7 +451,10 @@ class FirecrawlApp:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            id = response.json().get('id')
+            try:
+                id = response.json().get('id')
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            return self._monitor_job_status(id, headers, poll_interval)
 
        else:
@@ -383,7 +483,10 @@ class FirecrawlApp:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
        if response.status_code == 200:
-            return response.json()
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
            self._handle_error(response, 'start batch scrape job')
 
@@ -423,11 +526,16 @@ class FirecrawlApp:
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}{endpoint}', headers)
        if response.status_code == 200:
-            status_data = response.json()
+            try:
+                status_data = response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            if status_data['status'] == 'completed':
                if 'data' in status_data:
                    data = status_data['data']
                    while 'next' in status_data:
+                        if len(status_data['data']) == 0:
+                            break
                        next_url = status_data.get('next')
                        if not next_url:
                            logger.warning("Expected 'next' URL is missing.")
@@ -437,30 +545,60 @@ class FirecrawlApp:
                            if status_response.status_code != 200:
                                logger.error(f"Failed to fetch next page: {status_response.status_code}")
                                break
-                            status_data = status_response.json()
-                            data.extend(status_data.get('data', []))
+                            try:
+                                next_data = status_response.json()
+                            except:
+                                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                            data.extend(next_data.get('data', []))
+                            status_data = next_data
                        except Exception as e:
                            logger.error(f"Error during pagination request: {e}")
                            break
-                    status_data.pop('next', None)
                    status_data['data'] = data
 
-            return {
-                'success': True,
+            response = {
                'status': status_data.get('status'),
                'total': status_data.get('total'),
                'completed': status_data.get('completed'),
                'creditsUsed': status_data.get('creditsUsed'),
                'expiresAt': status_data.get('expiresAt'),
-                'data': status_data.get('data'),
-                'error': status_data.get('error'),
-                'next': status_data.get('next', None)
+                'data': status_data.get('data')
+            }
+
+            if 'error' in status_data:
+                response['error'] = status_data['error']
+
+            if 'next' in status_data:
+                response['next'] = status_data['next']
+
+            return {
+                'success': False if 'error' in status_data else True,
+                **response
            }
        else:
            self._handle_error(response, 'check batch scrape status')
 
+    def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]:
+        """
+        Returns information about batch scrape errors.
+
+        Args:
+            id (str): The ID of the crawl job.
+
+        Returns:
+            Dict[str, Any]: Information about crawl errors.
+        """
+        headers = self._prepare_headers()
+        response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
+        if response.status_code == 200:
+            try:
+                return response.json()
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, "check batch scrape errors")
 
-    def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Union[ExtractResponse, ErrorResponse]:
+    def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any:
        """
        Extracts information from a URL using the Firecrawl API.
 
@@ -473,33 +611,140 @@ class FirecrawlApp:
        """
        headers = self._prepare_headers()
 
-        if not params or not params.get('prompt'):
-            raise ValueError("Prompt is required")
+        if not params or (not params.get('prompt') and not params.get('schema')):
+            raise ValueError("Either prompt or schema is required")
 
-        if not params.get('schema'):
-            raise ValueError("Schema is required for extraction")
+        schema = params.get('schema')
+        if schema:
+            if hasattr(schema, 'model_json_schema'):
+                # Convert Pydantic model to JSON schema
+                schema = schema.model_json_schema()
+            # Otherwise assume it's already a JSON schema dict
 
        jsonData = {'urls': urls, **params}
-        jsonSchema = params['schema'].schema() if hasattr(params['schema'], 'schema') else None
+        request_data = {
+            **jsonData,
+            'allowExternalLinks': params.get('allow_external_links', False),
+            'schema': schema,
+            'origin': 'api-sdk'
+        }
 
        try:
+            # Send the initial extract request
            response = self._post_request(
                f'{self.api_url}/v1/extract',
-                {
-                    **jsonData,
-                    'allowExternalLinks': params.get('allow_external_links', False),
-                    'schema': jsonSchema
-                },
+                request_data,
                headers
            )
            if response.status_code == 200:
-                return response.json()
+                try:
+                    data = response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                if data['success']:
+                    job_id = data.get('id')
+                    if not job_id:
+                        raise Exception('Job ID not returned from extract request.')
+
+                    # Poll for the extract status
+                    while True:
+                        status_response = self._get_request(
+                            f'{self.api_url}/v1/extract/{job_id}',
+                            headers
+                        )
+                        if status_response.status_code == 200:
+                            try:
+                                status_data = status_response.json()
+                            except:
+                                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                            if status_data['status'] == 'completed':
+                                if status_data['success']:
+                                    return status_data
+                                else:
+                                    raise Exception(f'Failed to extract. Error: {status_data["error"]}')
+                            elif status_data['status'] in ['failed', 'cancelled']:
+                                raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
+                        else:
+                            self._handle_error(status_response, "extract-status")
+
+                        time.sleep(2)  # Polling interval
+                else:
+                    raise Exception(f'Failed to extract. Error: {data["error"]}')
            else:
                self._handle_error(response, "extract")
        except Exception as e:
            raise ValueError(str(e), 500)
 
        return {'success': False, 'error': "Internal server error."}
+
+    def get_extract_status(self, job_id: str) -> Dict[str, Any]:
+        """
+        Retrieve the status of an extract job.
+
+        Args:
+            job_id (str): The ID of the extract job.
+
+        Returns:
+            Dict[str, Any]: The status of the extract job.
+
+        Raises:
+            ValueError: If there is an error retrieving the status.
+        """
+        headers = self._prepare_headers()
+        try:
+            response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
+            if response.status_code == 200:
+                try:
+                    return response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
+            else:
+                self._handle_error(response, "get extract status")
+        except Exception as e:
+            raise ValueError(str(e), 500)
+
+    def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Initiate an asynchronous extract job.
+
+        Args:
+            urls (List[str]): The URLs to extract data from.
+            params (Optional[Dict[str, Any]]): Additional parameters for the extract request.
+            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, Any]: The response from the extract operation.
+
+        Raises:
+            ValueError: If there is an error initiating the extract job.
+        """
+        headers = self._prepare_headers(idempotency_key)
+
+        schema = params.get('schema') if params else None
+        if schema:
+            if hasattr(schema, 'model_json_schema'):
+                # Convert Pydantic model to JSON schema
+                schema = schema.model_json_schema()
+            # Otherwise assume it's already a JSON schema dict
+
+        jsonData = {'urls': urls, **(params or {})}
+        request_data = {
+            **jsonData,
+            'allowExternalLinks': params.get('allow_external_links', False) if params else False,
+            'schema': schema
+        }
+
+        try:
+            response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
+            if response.status_code == 200:
+                try:
+                    return response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
+            else:
+                self._handle_error(response, "async extract")
+        except Exception as e:
+            raise ValueError(str(e), 500)
 
    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """
@@ -625,14 +870,22 @@ class FirecrawlApp:
 
            status_response = self._get_request(api_url, headers)
            if status_response.status_code == 200:
-                status_data = status_response.json()
+                try:
+                    status_data = status_response.json()
+                except:
+                    raise Exception(f'Failed to parse Firecrawl response as JSON.')
                if status_data['status'] == 'completed':
                    if 'data' in status_data:
                        data = status_data['data']
                        while 'next' in status_data:
-                            status_response = self._get_request(status_data['next'], headers)
-                            status_data = status_response.json()
-                            data.extend(status_data['data'])
+                            if len(status_data['data']) == 0:
+                                break
+                            status_response = self._get_request(status_data['next'], headers)
+                            try:
+                                status_data = status_response.json()
+                            except:
+                                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+                            data.extend(status_data.get('data', []))
                        status_data['data'] = data
                        return status_data
                    else:
@@ -656,8 +909,12 @@ class FirecrawlApp:
        Raises:
            Exception: An exception with a message containing the status code and error details from the response.
        """
-        error_message = response.json().get('error', 'No error message provided.')
-        error_details = response.json().get('details', 'No additional error details provided.')
+        try:
+            error_message = response.json().get('error', 'No error message provided.')
+            error_details = response.json().get('details', 'No additional error details provided.')
+        except:
+            raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
+
 
        if response.status_code == 402:
            message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
@@ -707,15 +964,15 @@ class CrawlWatcher:
    async def _handle_message(self, msg: Dict[str, Any]):
        if msg['type'] == 'done':
            self.status = 'completed'
-            self.dispatch_event('done', {'status': self.status, 'data': self.data})
+            self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
        elif msg['type'] == 'error':
            self.status = 'failed'
-            self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error']})
+            self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id})
        elif msg['type'] == 'catchup':
            self.status = msg['data']['status']
            self.data.extend(msg['data'].get('data', []))
            for doc in self.data:
-                self.dispatch_event('document', doc)
+                self.dispatch_event('document', {'data': doc, 'id': self.id})
        elif msg['type'] == 'document':
            self.data.append(msg['data'])
-            self.dispatch_event('document', msg['data'])
+            self.dispatch_event('document', {'data': msg['data'], 'id': self.id})
firecrawl-1.6.0.dist-info/METADATA → firecrawl-1.10.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: firecrawl
-Version: 1.6.0
+Version: 1.10.0
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai
@@ -37,6 +37,7 @@ Requires-Dist: requests
 Requires-Dist: python-dotenv
 Requires-Dist: websockets
 Requires-Dist: nest-asyncio
+Requires-Dist: pydantic (>=2.10.3)
 
 # Firecrawl Python SDK
 
firecrawl-1.10.0.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+firecrawl/__init__.py,sha256=jO4L4KZKDbIL-Gef19zkY5xiEFYxuZUBCxM4B-TGaBI,2544
+firecrawl/firecrawl.py,sha256=WpIBgsUTq8KWaZeaiJJnCjEh48-ObOlOOfXBRie1Quc,40493
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=6OawnVF4IPeGyXg_Izi3t8U7MyT90roaJBJIG5UfllM,7935
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=tL5kJJ4el37Wc-Z2TRSuSWwWG2M40h3VPxHYuWijD00,19888
+firecrawl-1.10.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-1.10.0.dist-info/METADATA,sha256=uYVHavePTK87Zo8Haw90Afdpkm4nkFGW5L4SuSP6u9I,10632
+firecrawl-1.10.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-1.10.0.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
+firecrawl-1.10.0.dist-info/RECORD,,
firecrawl-1.6.0.dist-info/RECORD DELETED
@@ -1,11 +0,0 @@
-firecrawl/__init__.py,sha256=9mQfSNKz0VYJilNxhiaYwxWw2gMvUA1Ql2SUnGXCivY,2543
-firecrawl/firecrawl.py,sha256=RcOaoGUs-JWvz2Xy8W5eEizoVZnE3RCbb8P75RAc1JQ,30207
-firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-firecrawl/__tests__/e2e_withAuth/test.py,sha256=L-umFR3WyrJso1EwqkxjbTMr5AEI4t5zDfhQcCzitOI,7911
-firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=KQMmGAtJAIafja6EGtJ-W9162w2Hm6PNjqKl3_RQXLA,16456
-firecrawl-1.6.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
-firecrawl-1.6.0.dist-info/METADATA,sha256=AvmxvRgdpvL-pTdz43kUd1DhgPX4evG1tV6yUJhUda8,10596
-firecrawl-1.6.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
-firecrawl-1.6.0.dist-info/top_level.txt,sha256=jTvz79zWhiyAezfmmHe4FQ-hR60C59UU5FrjMjijLu8,10
-firecrawl-1.6.0.dist-info/RECORD,,