firecrawl 4.4.0__tar.gz → 4.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl might be problematic.
- {firecrawl-4.4.0 → firecrawl-4.6.0}/PKG-INFO +1 -1
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__init__.py +1 -1
- firecrawl-4.6.0/firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_pagination.py +70 -1
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/client.py +30 -17
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/client_async.py +77 -14
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/crawl.py +18 -9
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/crawl.py +68 -37
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/types.py +21 -2
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/http_client.py +5 -3
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/http_client_async.py +9 -5
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/normalize.py +7 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/PKG-INFO +1 -1
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/SOURCES.txt +2 -0
- firecrawl-4.6.0/tests/test_api_key_handling.py +44 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/LICENSE +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/README.md +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_extract.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_map.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_search.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/client.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/firecrawl.backup.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/types.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v1/__init__.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v1/client.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/__init__.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/__init__.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/batch.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/extract.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/map.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/search.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/aio/usage.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/batch.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/extract.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/map.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/scrape.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/search.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/methods/usage.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/__init__.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/error_handler.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/get_version.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/utils/validation.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/watcher.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl/v2/watcher_async.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/dependency_links.txt +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/requires.txt +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/firecrawl.egg-info/top_level.txt +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/pyproject.toml +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/setup.cfg +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/setup.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/tests/test_change_tracking.py +0 -0
- {firecrawl-4.4.0 → firecrawl-4.6.0}/tests/test_timeout_conversion.py +0 -0
firecrawl-4.6.0/firecrawl/__tests__/unit/v2/methods/test_branding.py (new file)
@@ -0,0 +1,214 @@
+import pytest
+from unittest.mock import Mock, MagicMock
+from firecrawl.v2.methods.scrape import scrape
+from firecrawl.v2.types import ScrapeOptions, Document
+
+
+class TestBrandingFormat:
+    """Unit tests for branding format support."""
+
+    def test_scrape_with_branding_format_returns_branding_data(self):
+        """Test that scraping with branding format returns branding data."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "markdown": "# Example",
+                "branding": {
+                    "colorScheme": "light",
+                    "colors": {
+                        "primary": "#E11D48",
+                        "secondary": "#3B82F6",
+                        "accent": "#F59E0B"
+                    },
+                    "typography": {
+                        "fontFamilies": {
+                            "primary": "Inter",
+                            "heading": "Poppins"
+                        },
+                        "fontSizes": {
+                            "h1": "2.5rem",
+                            "body": "1rem"
+                        }
+                    },
+                    "spacing": {
+                        "baseUnit": 8
+                    },
+                    "components": {
+                        "buttonPrimary": {
+                            "background": "#E11D48",
+                            "textColor": "#FFFFFF",
+                            "borderRadius": "0.5rem"
+                        }
+                    }
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))
+
+        assert result.branding is not None
+        assert result.branding.color_scheme == "light"
+        assert result.branding.colors["primary"] == "#E11D48"
+        assert result.branding.typography["fontFamilies"]["primary"] == "Inter"
+        assert result.branding.spacing["baseUnit"] == 8
+        assert result.branding.components["buttonPrimary"]["background"] == "#E11D48"
+
+    def test_scrape_with_branding_and_markdown_formats_returns_both(self):
+        """Test that scraping with both branding and markdown formats returns both."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "markdown": "# Example Content",
+                "branding": {
+                    "colorScheme": "dark",
+                    "colors": {
+                        "primary": "#10B981"
+                    },
+                    "typography": {
+                        "fontFamilies": {
+                            "primary": "Roboto"
+                        }
+                    }
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["markdown", "branding"]))
+
+        assert result.markdown == "# Example Content"
+        assert result.branding is not None
+        assert result.branding.color_scheme == "dark"
+        assert result.branding.colors["primary"] == "#10B981"
+
+    def test_scrape_without_branding_format_does_not_return_branding(self):
+        """Test that scraping without branding format does not return branding."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "markdown": "# Example"
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["markdown"]))
+
+        assert result.markdown == "# Example"
+        assert result.branding is None
+
+    def test_branding_format_with_all_nested_fields(self):
+        """Test branding format with all nested fields populated."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "branding": {
+                    "colorScheme": "light",
+                    "logo": "https://example.com/logo.png",
+                    "fonts": [
+                        {"family": "Inter", "weight": 400},
+                        {"family": "Poppins", "weight": 700}
+                    ],
+                    "colors": {
+                        "primary": "#E11D48",
+                        "background": "#FFFFFF"
+                    },
+                    "typography": {
+                        "fontFamilies": {"primary": "Inter"},
+                        "fontStacks": {"body": ["Inter", "sans-serif"]},
+                        "fontSizes": {"h1": "2.5rem"},
+                        "lineHeights": {"body": 1.5},
+                        "fontWeights": {"regular": 400}
+                    },
+                    "spacing": {
+                        "baseUnit": 8,
+                        "padding": {"sm": 8, "md": 16}
+                    },
+                    "components": {
+                        "buttonPrimary": {
+                            "background": "#E11D48",
+                            "textColor": "#FFFFFF"
+                        }
+                    },
+                    "icons": {
+                        "style": "outline",
+                        "primaryColor": "#E11D48"
+                    },
+                    "images": {
+                        "logo": "https://example.com/logo.png",
+                        "favicon": "https://example.com/favicon.ico"
+                    },
+                    "animations": {
+                        "transitionDuration": "200ms",
+                        "easing": "ease-in-out"
+                    },
+                    "layout": {
+                        "grid": {"columns": 12, "maxWidth": "1200px"},
+                        "headerHeight": "64px"
+                    },
+                    "tone": {
+                        "voice": "professional",
+                        "emojiUsage": "minimal"
+                    },
+                    "personality": {
+                        "tone": "professional",
+                        "energy": "medium",
+                        "targetAudience": "developers"
+                    }
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))
+
+        assert result.branding is not None
+        assert result.branding.color_scheme == "light"
+        assert result.branding.logo == "https://example.com/logo.png"
+        assert len(result.branding.fonts) == 2
+        assert result.branding.typography["fontStacks"]["body"] == ["Inter", "sans-serif"]
+        assert result.branding.spacing["padding"] == {"sm": 8, "md": 16}
+        assert result.branding.icons["style"] == "outline"
+        assert result.branding.images["favicon"] == "https://example.com/favicon.ico"
+        assert result.branding.animations["easing"] == "ease-in-out"
+        assert result.branding.layout["grid"]["columns"] == 12
+        assert result.branding.personality["tone"] == "professional"
+
+    def test_branding_colorscheme_normalization(self):
+        """Test that colorScheme is normalized to color_scheme."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "data": {
+                "branding": {
+                    "colorScheme": "dark",
+                    "colors": {"primary": "#000000"}
+                }
+            }
+        }
+
+        mock_client = Mock()
+        mock_client.post.return_value = mock_response
+
+        result = scrape(mock_client, "https://example.com", ScrapeOptions(formats=["branding"]))
+
+        assert result.branding is not None
+        assert result.branding.color_scheme == "dark"
+        assert not hasattr(result.branding, "colorScheme")
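
The tests above exercise the new "branding" scrape format. A minimal usage sketch, assuming the public client exposes the same format name and the normalized snake_case fields these tests assert on (the URL and printed fields are illustrative only):

    # Sketch only: scrape with the "branding" format and read the normalized fields.
    from firecrawl import Firecrawl  # assumed public entry point for the v2 client

    client = Firecrawl(api_key="fc-YOUR-KEY")
    doc = client.scrape("https://example.com", formats=["markdown", "branding"])

    if doc.branding is not None:
        print(doc.branding.color_scheme)                     # normalized from "colorScheme"
        print(doc.branding.colors.get("primary"))            # colors/typography/spacing are plain dicts in the tests
        print(doc.branding.typography.get("fontFamilies"))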

firecrawl/__tests__/unit/v2/methods/test_pagination.py
@@ -89,6 +89,40 @@ class TestCrawlPagination:
         assert result.next == "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
         assert len(result.data) == 1
         assert isinstance(result.data[0], Document)
+
+    def test_get_crawl_status_propagates_request_timeout(self):
+        """Ensure request_timeout is forwarded to the HTTP client."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        timeout_seconds = 5.5
+        import firecrawl.v2.methods.crawl as crawl_module
+
+        assert crawl_module.__file__.endswith("firecrawl/v2/methods/crawl.py")
+        assert crawl_module.get_crawl_status.__kwdefaults__ is not None
+        assert "request_timeout" in crawl_module.get_crawl_status.__kwdefaults__
+        result = get_crawl_status(
+            self.mock_client,
+            self.job_id,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_called_with(
+            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
+        )

     def test_get_crawl_status_with_pagination(self):
         """Test get_crawl_status with auto_paginate=True."""
@@ -423,7 +457,42 @@ class TestAsyncPagination:
         assert result.next is None
         assert len(result.data) == 2
         assert self.mock_client.get.call_count == 2
-
+
+    @pytest.mark.asyncio
+    async def test_get_crawl_status_async_propagates_request_timeout(self):
+        """Ensure async request_timeout is forwarded to the HTTP client."""
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        timeout_seconds = 3.3
+        import firecrawl.v2.methods.aio.crawl as crawl_module_async
+
+        assert crawl_module_async.__file__.endswith("firecrawl/v2/methods/aio/crawl.py")
+        assert crawl_module_async.get_crawl_status.__kwdefaults__ is not None
+        assert "request_timeout" in crawl_module_async.get_crawl_status.__kwdefaults__
+        result = await get_crawl_status_async(
+            self.mock_client,
+            self.job_id,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_awaited_with(
+            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
+        )
+
     @pytest.mark.asyncio
     async def test_get_batch_scrape_status_async_with_pagination(self):
         """Test async get_batch_scrape_status with pagination."""

firecrawl/v2/client.py
@@ -54,10 +54,14 @@ from .watcher import Watcher
 class FirecrawlClient:
     """
     Main Firecrawl v2 API client.
-
+
     This client provides a clean, modular interface to all Firecrawl functionality.
     """
-
+
+    @staticmethod
+    def _is_cloud_service(url: str) -> bool:
+        return "api.firecrawl.dev" in url.lower()
+
     def __init__(
         self,
         api_key: Optional[str] = None,
@@ -68,7 +72,7 @@ class FirecrawlClient:
     ):
         """
         Initialize the Firecrawl client.
-
+
         Args:
             api_key: Firecrawl API key (or set FIRECRAWL_API_KEY env var)
             api_url: Base URL for the Firecrawl API
@@ -78,13 +82,13 @@ class FirecrawlClient:
         """
         if api_key is None:
             api_key = os.getenv("FIRECRAWL_API_KEY")
-
-        if not api_key:
+
+        if self._is_cloud_service(api_url) and not api_key:
             raise ValueError(
-                "API key is required. Set FIRECRAWL_API_KEY environment variable "
+                "API key is required for the cloud API. Set FIRECRAWL_API_KEY environment variable "
                 "or pass api_key parameter."
             )
-
+
         self.config = ClientConfig(
             api_key=api_key,
             api_url=api_url,
@@ -92,7 +96,7 @@ class FirecrawlClient:
             max_retries=max_retries,
             backoff_factor=backoff_factor
         )
-
+
         self.http_client = HttpClient(api_key, api_url)

     def scrape(
@@ -236,6 +240,7 @@ class FirecrawlClient:
         zero_data_retention: bool = False,
         poll_interval: int = 2,
         timeout: Optional[int] = None,
+        request_timeout: Optional[float] = None,
         integration: Optional[str] = None,
     ) -> CrawlJob:
         """
@@ -259,7 +264,8 @@ class FirecrawlClient:
             scrape_options: Page scraping configuration
             zero_data_retention: Whether to delete data after 24 hours
             poll_interval: Seconds between status checks
-            timeout: Maximum seconds to wait (None for no timeout)
+            timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
+            request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout

         Returns:
             CrawlJob when job completes
@@ -290,10 +296,11 @@ class FirecrawlClient:
         )

         return crawl_module.crawl(
-            self.http_client,
-            request,
-            poll_interval=poll_interval,
-            timeout=timeout
+            self.http_client,
+            request,
+            poll_interval=poll_interval,
+            timeout=timeout,
+            request_timeout=request_timeout,
         )

     def start_crawl(
@@ -368,9 +375,11 @@ class FirecrawlClient:
         return crawl_module.start_crawl(self.http_client, request)

     def get_crawl_status(
-        self,
+        self,
         job_id: str,
-        pagination_config: Optional[PaginationConfig] = None
+        pagination_config: Optional[PaginationConfig] = None,
+        *,
+        request_timeout: Optional[float] = None,
     ) -> CrawlJob:
         """
         Get the status of a crawl job.
@@ -378,6 +387,9 @@ class FirecrawlClient:
         Args:
             job_id: ID of the crawl job
             pagination_config: Optional configuration for pagination behavior
+            request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+                is enabled (default) and there are multiple pages of results, this timeout applies to
+                each page request separately, not to the entire operation

         Returns:
             CrawlJob with current status and data
@@ -386,9 +398,10 @@ class FirecrawlClient:
             Exception: If the status check fails
         """
         return crawl_module.get_crawl_status(
-            self.http_client,
+            self.http_client,
             job_id,
-            pagination_config=pagination_config
+            pagination_config=pagination_config,
+            request_timeout=request_timeout,
         )

     def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
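
Taken together, the synchronous client changes above relax the API-key requirement for self-hosted instances and thread a per-request timeout through crawl status calls. A hedged sketch of both, using the constructor and method signatures shown in this diff (base URL and job ID are placeholders):

    from firecrawl.v2.client import FirecrawlClient  # module path as shown above

    # Self-hosted API URLs no longer require an API key; api.firecrawl.dev still does.
    client = FirecrawlClient(api_url="http://localhost:3002")

    # request_timeout bounds each HTTP call, including every pagination page fetch.
    status = client.get_crawl_status("some-job-id", request_timeout=5.5)

    # On a blocking crawl, timeout bounds the whole job; request_timeout bounds each request.
    job = client.crawl(
        url="https://example.com",
        poll_interval=2,
        timeout=300,
        request_timeout=30.0,
    )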

firecrawl/v2/client_async.py
@@ -4,6 +4,7 @@ Async v2 client mirroring the regular client surface using true async HTTP trans

 import os
 import asyncio
+import time
 from typing import Optional, List, Dict, Any, Union, Callable, Literal
 from .types import (
     ScrapeOptions,
@@ -47,11 +48,15 @@ from .methods.aio import extract as async_extract  # type: ignore[attr-defined]
 from .watcher_async import AsyncWatcher

 class AsyncFirecrawlClient:
+    @staticmethod
+    def _is_cloud_service(url: str) -> bool:
+        return "api.firecrawl.dev" in url.lower()
+
     def __init__(self, api_key: Optional[str] = None, api_url: str = "https://api.firecrawl.dev"):
         if api_key is None:
             api_key = os.getenv("FIRECRAWL_API_KEY")
-        if not api_key:
-            raise ValueError("API key is required. Set FIRECRAWL_API_KEY or pass api_key.")
+        if self._is_cloud_service(api_url) and not api_key:
+            raise ValueError("API key is required for the cloud API. Set FIRECRAWL_API_KEY or pass api_key.")
         self.http_client = HttpClient(api_key, api_url)
         self.async_http_client = AsyncHttpClient(api_key, api_url)

@@ -77,33 +82,91 @@ class AsyncFirecrawlClient:
         request = CrawlRequest(url=url, **kwargs)
         return await async_crawl.start_crawl(self.async_http_client, request)

-    async def wait_crawl(
-
-
+    async def wait_crawl(
+        self,
+        job_id: str,
+        poll_interval: int = 2,
+        timeout: Optional[int] = None,
+        *,
+        request_timeout: Optional[float] = None,
+    ) -> CrawlJob:
+        """
+        Polls the status of a crawl job until it reaches a terminal state.
+
+        Args:
+            job_id (str): The ID of the crawl job to poll.
+            poll_interval (int, optional): Number of seconds to wait between polling attempts. Defaults to 2.
+            timeout (Optional[int], optional): Maximum number of seconds to wait for the entire crawl job to complete before timing out. If None, waits indefinitely. Defaults to None.
+            request_timeout (Optional[float], optional): Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout. If None, no per-request timeout is set. Defaults to None.
+
+        Returns:
+            CrawlJob: The final status of the crawl job when it reaches a terminal state.
+
+        Raises:
+            TimeoutError: If the crawl does not reach a terminal state within the specified timeout.
+
+        Terminal states:
+            - "completed": The crawl finished successfully.
+            - "failed": The crawl finished with an error.
+            - "cancelled": The crawl was cancelled.
+        """
+        start = time.monotonic()
         while True:
-            status = await async_crawl.get_crawl_status(
-
+            status = await async_crawl.get_crawl_status(
+                self.async_http_client,
+                job_id,
+                request_timeout=request_timeout,
+            )
+            if status.status in ["completed", "failed", "cancelled"]:
                 return status
-            if timeout and (
+            if timeout and (time.monotonic() - start) > timeout:
                 raise TimeoutError("Crawl wait timed out")
             await asyncio.sleep(poll_interval)

     async def crawl(self, **kwargs) -> CrawlJob:
         # wrapper combining start and wait
-        resp = await self.start_crawl(
+        resp = await self.start_crawl(
+            **{k: v for k, v in kwargs.items() if k not in ("poll_interval", "timeout", "request_timeout")}
+        )
         poll_interval = kwargs.get("poll_interval", 2)
         timeout = kwargs.get("timeout")
-
+        request_timeout = kwargs.get("request_timeout")
+        effective_request_timeout = request_timeout if request_timeout is not None else timeout
+        return await self.wait_crawl(
+            resp.id,
+            poll_interval=poll_interval,
+            timeout=timeout,
+            request_timeout=effective_request_timeout,
+        )

     async def get_crawl_status(
-        self,
+        self,
         job_id: str,
-        pagination_config: Optional[PaginationConfig] = None
+        pagination_config: Optional[PaginationConfig] = None,
+        *,
+        request_timeout: Optional[float] = None,
     ) -> CrawlJob:
+        """
+        Get the status of a crawl job.
+
+        Args:
+            job_id: ID of the crawl job
+            pagination_config: Optional configuration for pagination behavior
+            request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+                is enabled (default) and there are multiple pages of results, this timeout applies to
+                each page request separately, not to the entire operation
+
+        Returns:
+            CrawlJob with current status and data
+
+        Raises:
+            Exception: If the status check fails
+        """
         return await async_crawl.get_crawl_status(
-            self.async_http_client,
+            self.async_http_client,
             job_id,
-            pagination_config=pagination_config
+            pagination_config=pagination_config,
+            request_timeout=request_timeout,
         )

     async def cancel_crawl(self, job_id: str) -> bool:
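
The async client mirrors the same surface. A sketch under the same assumptions (placeholder URL, illustrative values), also showing that crawl() falls back to timeout as the per-request timeout when request_timeout is not given:

    import asyncio
    from firecrawl.v2.client_async import AsyncFirecrawlClient  # module path as shown above

    async def main() -> None:
        # No API key needed when pointing at a self-hosted URL.
        client = AsyncFirecrawlClient(api_url="http://localhost:3002")

        # request_timeout is forwarded to every status/pagination request made by wait_crawl.
        job = await client.crawl(
            url="https://example.com",
            poll_interval=2,
            timeout=300,
            request_timeout=30.0,
        )
        print(job.status, len(job.data or []))

    asyncio.run(main())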

firecrawl/v2/methods/aio/crawl.py
@@ -87,9 +87,11 @@ async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlRe


 async def get_crawl_status(
-    client: AsyncHttpClient,
+    client: AsyncHttpClient,
     job_id: str,
-    pagination_config: Optional[PaginationConfig] = None
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> CrawlJob:
     """
     Get the status of a crawl job.
@@ -98,6 +100,9 @@ async def get_crawl_status(
         client: Async HTTP client instance
         job_id: ID of the crawl job
         pagination_config: Optional configuration for pagination limits
+        request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+            is enabled (default) and there are multiple pages of results, this timeout applies to
+            each page request separately, not to the entire operation

     Returns:
         CrawlJob with job information
@@ -105,7 +110,7 @@ async def get_crawl_status(
     Raises:
         Exception: If the status check fails
     """
-    response = await client.get(f"/v2/crawl/{job_id}")
+    response = await client.get(f"/v2/crawl/{job_id}", timeout=request_timeout)
     if response.status_code >= 400:
         handle_response_error(response, "get crawl status")
     body = response.json()
@@ -120,10 +125,11 @@ async def get_crawl_status(
     auto_paginate = pagination_config.auto_paginate if pagination_config else True
     if auto_paginate and body.get("next"):
         documents = await _fetch_all_pages_async(
-            client,
-            body.get("next"),
-            documents,
-            pagination_config
+            client,
+            body.get("next"),
+            documents,
+            pagination_config,
+            request_timeout=request_timeout,
         )

     return CrawlJob(
@@ -142,7 +148,9 @@ async def _fetch_all_pages_async(
     client: AsyncHttpClient,
     next_url: str,
     initial_documents: List[Document],
-    pagination_config: Optional[PaginationConfig] = None
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> List[Document]:
     """
     Fetch all pages of crawl results asynchronously.
@@ -152,6 +160,7 @@ async def _fetch_all_pages_async(
         next_url: URL for the next page
         initial_documents: Documents from the first page
         pagination_config: Optional configuration for pagination limits
+        request_timeout: Optional timeout (in seconds) for the underlying HTTP request

     Returns:
         List of all documents from all pages
@@ -176,7 +185,7 @@ async def _fetch_all_pages_async(
             break

         # Fetch next page
-        response = await client.get(current_url)
+        response = await client.get(current_url, timeout=request_timeout)

         if response.status_code >= 400:
             # Log error but continue with what we have