firecrawl 3.4.0__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -31,6 +31,7 @@ from .types import (
     ExecuteJavascriptAction,
     PDFAction,
     Location,
+    PaginationConfig,
 )
 from .utils.http_client import HttpClient
 from .utils.http_client_async import AsyncHttpClient
@@ -94,8 +95,16 @@ class AsyncFirecrawlClient:
         timeout = kwargs.get("timeout")
         return await self.wait_crawl(resp.id, poll_interval=poll_interval, timeout=timeout)
 
-    async def get_crawl_status(self, job_id: str) -> CrawlJob:
-        return await async_crawl.get_crawl_status(self.async_http_client, job_id)
+    async def get_crawl_status(
+        self,
+        job_id: str,
+        pagination_config: Optional[PaginationConfig] = None
+    ) -> CrawlJob:
+        return await async_crawl.get_crawl_status(
+            self.async_http_client,
+            job_id,
+            pagination_config=pagination_config
+        )
 
     async def cancel_crawl(self, job_id: str) -> bool:
         return await async_crawl.cancel_crawl(self.async_http_client, job_id)
@@ -154,8 +163,16 @@ class AsyncFirecrawlClient:
         timeout = kwargs.get("timeout")
         return await self.wait_batch_scrape(job_id, poll_interval=poll_interval, timeout=timeout)
 
-    async def get_batch_scrape_status(self, job_id: str):
-        return await async_batch.get_batch_scrape_status(self.async_http_client, job_id)
+    async def get_batch_scrape_status(
+        self,
+        job_id: str,
+        pagination_config: Optional[PaginationConfig] = None
+    ):
+        return await async_batch.get_batch_scrape_status(
+            self.async_http_client,
+            job_id,
+            pagination_config=pagination_config
+        )
 
     async def cancel_batch_scrape(self, job_id: str) -> bool:
         return await async_batch.cancel_batch_scrape(self.async_http_client, job_id)
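
The client-facing effect of the three hunks above: get_crawl_status and get_batch_scrape_status now accept an optional PaginationConfig. A minimal usage sketch, assuming the package exports AsyncFirecrawlClient and PaginationConfig at the paths shown (neither the public import paths nor the client constructor appear in this diff), with the config fields taken from the attribute accesses in the hunks below:

import asyncio

from firecrawl import AsyncFirecrawlClient      # assumed top-level export
from firecrawl.types import PaginationConfig    # assumed location of PaginationConfig


async def main() -> None:
    client = AsyncFirecrawlClient(api_key="fc-...")  # hypothetical constructor arguments

    # Cap auto-pagination instead of following every "next" link.
    config = PaginationConfig(max_pages=5, max_results=200, max_wait_time=30)
    crawl_job = await client.get_crawl_status("crawl-job-id", pagination_config=config)
    print(crawl_job.status, len(crawl_job.data))

    # With no config, the default remains auto_paginate=True: all pages are merged into .data.
    batch_job = await client.get_batch_scrape_status("batch-job-id")
    print(batch_job.completed, "/", batch_job.total)


asyncio.run(main())
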
@@ -1,8 +1,10 @@
 from typing import Optional, List, Dict, Any
-from ...types import ScrapeOptions, WebhookConfig, Document, BatchScrapeResponse, BatchScrapeJob
+from ...types import ScrapeOptions, WebhookConfig, Document, BatchScrapeResponse, BatchScrapeJob, PaginationConfig
 from ...utils.http_client_async import AsyncHttpClient
 from ...utils.validation import prepare_scrape_options
 from ...utils.error_handler import handle_response_error
+from ...utils.normalize import normalize_document_input
+import time
 
 
 def _prepare(urls: List[str], *, options: Optional[ScrapeOptions] = None, **kwargs) -> Dict[str, Any]:
@@ -39,7 +41,25 @@ async def start_batch_scrape(client: AsyncHttpClient, urls: List[str], **kwargs)
     return BatchScrapeResponse(id=body.get("id"), url=body.get("url"), invalid_urls=body.get("invalidURLs"))
 
 
-async def get_batch_scrape_status(client: AsyncHttpClient, job_id: str) -> BatchScrapeJob:
+async def get_batch_scrape_status(
+    client: AsyncHttpClient,
+    job_id: str,
+    pagination_config: Optional[PaginationConfig] = None
+) -> BatchScrapeJob:
+    """
+    Get the status of a batch scrape job.
+
+    Args:
+        client: Async HTTP client instance
+        job_id: ID of the batch scrape job
+        pagination_config: Optional configuration for pagination behavior
+
+    Returns:
+        BatchScrapeJob containing job status and data
+
+    Raises:
+        Exception: If the status check fails
+    """
     response = await client.get(f"/v2/batch/scrape/{job_id}")
     if response.status_code >= 400:
         handle_response_error(response, "get batch scrape status")
@@ -49,23 +69,102 @@ async def get_batch_scrape_status(client: AsyncHttpClient, job_id: str) -> Batch
     docs: List[Document] = []
     for doc in body.get("data", []) or []:
         if isinstance(doc, dict):
-            normalized = dict(doc)
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc)
             docs.append(Document(**normalized))
+
+    # Handle pagination if requested
+    auto_paginate = pagination_config.auto_paginate if pagination_config else True
+    if auto_paginate and body.get("next"):
+        docs = await _fetch_all_batch_pages_async(
+            client,
+            body.get("next"),
+            docs,
+            pagination_config
+        )
+
     return BatchScrapeJob(
         status=body.get("status"),
         completed=body.get("completed", 0),
         total=body.get("total", 0),
         credits_used=body.get("creditsUsed"),
         expires_at=body.get("expiresAt"),
-        next=body.get("next"),
+        next=body.get("next") if not auto_paginate else None,
         data=docs,
     )
 
 
+async def _fetch_all_batch_pages_async(
+    client: AsyncHttpClient,
+    next_url: str,
+    initial_documents: List[Document],
+    pagination_config: Optional[PaginationConfig] = None
+) -> List[Document]:
+    """
+    Fetch all pages of batch scrape results asynchronously.
+
+    Args:
+        client: Async HTTP client instance
+        next_url: URL for the next page
+        initial_documents: Documents from the first page
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        List of all documents from all pages
+    """
+    documents = initial_documents.copy()
+    current_url = next_url
+    page_count = 0
+
+    # Apply pagination limits
+    max_pages = pagination_config.max_pages if pagination_config else None
+    max_results = pagination_config.max_results if pagination_config else None
+    max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+    start_time = time.monotonic()
+
+    while current_url:
+        # Check pagination limits
+        if (max_pages is not None) and (page_count >= max_pages):
+            break
+
+        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+            break
+
+        # Fetch next page
+        response = await client.get(current_url)
+
+        if response.status_code >= 400:
+            # Log error but continue with what we have
+            import logging
+            logger = logging.getLogger("firecrawl")
+            logger.warning(f"Failed to fetch next page: {response.status_code}")
+            break
+
+        page_data = response.json()
+
+        if not page_data.get("success"):
+            break
+
+        # Add documents from this page
+        for doc in page_data.get("data", []) or []:
+            if isinstance(doc, dict):
+                # Check max_results limit
+                if (max_results is not None) and (len(documents) >= max_results):
+                    break
+                normalized = normalize_document_input(doc)
+                documents.append(Document(**normalized))
+
+        # Check if we hit max_results limit
+        if (max_results is not None) and (len(documents) >= max_results):
+            break
+
+        # Get next URL
+        current_url = page_data.get("next")
+        page_count += 1
+
+    return documents
+
+
 async def cancel_batch_scrape(client: AsyncHttpClient, job_id: str) -> bool:
     response = await client.delete(f"/v2/batch/scrape/{job_id}")
     if response.status_code >= 400:
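
The hunk above also drops the inline rawHtml/changeTracking key renaming in favor of the shared normalize_document_input helper. That helper's body is not part of this diff; judging only from the removed inline code, a behaviorally equivalent sketch for those two keys would be:

from typing import Any, Dict

# Sketch only: the real ...utils.normalize helper in firecrawl may handle additional keys.
_KEY_MAP = {
    "rawHtml": "raw_html",
    "changeTracking": "change_tracking",
}


def normalize_document_input(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of doc with known camelCase keys renamed to snake_case."""
    normalized = dict(doc)
    for camel, snake in _KEY_MAP.items():
        if camel in normalized and snake not in normalized:
            normalized[snake] = normalized.pop(camel)
    return normalized
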
@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, List
 from ...types import (
     CrawlRequest,
     CrawlJob,
@@ -10,11 +10,13 @@ from ...types import (
     CrawlErrorsResponse,
     ActiveCrawlsResponse,
     ActiveCrawl,
+    PaginationConfig,
 )
 from ...utils.error_handler import handle_response_error
 from ...utils.validation import prepare_scrape_options
 from ...utils.http_client_async import AsyncHttpClient
 from ...utils.normalize import normalize_document_input
+import time
 
 
 def _prepare_crawl_request(request: CrawlRequest) -> dict:
@@ -58,6 +60,20 @@ def _prepare_crawl_request(request: CrawlRequest) -> dict:
 
 
 async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlResponse:
+    """
+    Start a crawl job for a website.
+
+    Args:
+        client: Async HTTP client instance
+        request: CrawlRequest containing URL and options
+
+    Returns:
+        CrawlResponse with job information
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the crawl operation fails to start
+    """
     payload = _prepare_crawl_request(request)
     response = await client.post("/v2/crawl", payload)
     if response.status_code >= 400:
@@ -68,7 +84,25 @@ async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlRe
     raise Exception(body.get("error", "Unknown error occurred"))
 
 
-async def get_crawl_status(client: AsyncHttpClient, job_id: str) -> CrawlJob:
+async def get_crawl_status(
+    client: AsyncHttpClient,
+    job_id: str,
+    pagination_config: Optional[PaginationConfig] = None
+) -> CrawlJob:
+    """
+    Get the status of a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        job_id: ID of the crawl job
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        CrawlJob with job information
+
+    Raises:
+        Exception: If the status check fails
+    """
     response = await client.get(f"/v2/crawl/{job_id}")
     if response.status_code >= 400:
         handle_response_error(response, "get crawl status")
@@ -79,19 +113,115 @@ async def get_crawl_status(client: AsyncHttpClient, job_id: str) -> CrawlJob:
             if isinstance(doc_data, dict):
                 normalized = normalize_document_input(doc_data)
                 documents.append(Document(**normalized))
+
+        # Handle pagination if requested
+        auto_paginate = pagination_config.auto_paginate if pagination_config else True
+        if auto_paginate and body.get("next"):
+            documents = await _fetch_all_pages_async(
+                client,
+                body.get("next"),
+                documents,
+                pagination_config
+            )
+
         return CrawlJob(
             status=body.get("status"),
             completed=body.get("completed", 0),
             total=body.get("total", 0),
             credits_used=body.get("creditsUsed", 0),
             expires_at=body.get("expiresAt"),
-            next=body.get("next"),
+            next=body.get("next") if not auto_paginate else None,
             data=documents,
         )
     raise Exception(body.get("error", "Unknown error occurred"))
 
 
+async def _fetch_all_pages_async(
+    client: AsyncHttpClient,
+    next_url: str,
+    initial_documents: List[Document],
+    pagination_config: Optional[PaginationConfig] = None
+) -> List[Document]:
+    """
+    Fetch all pages of crawl results asynchronously.
+
+    Args:
+        client: Async HTTP client instance
+        next_url: URL for the next page
+        initial_documents: Documents from the first page
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        List of all documents from all pages
+    """
+    documents = initial_documents.copy()
+    current_url = next_url
+    page_count = 0
+
+    # Apply pagination limits
+    max_pages = pagination_config.max_pages if pagination_config else None
+    max_results = pagination_config.max_results if pagination_config else None
+    max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+    start_time = time.monotonic()
+
+    while current_url:
+        # Check pagination limits (treat 0 as a valid limit)
+        if (max_pages is not None) and page_count >= max_pages:
+            break
+
+        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+            break
+
+        # Fetch next page
+        response = await client.get(current_url)
+
+        if response.status_code >= 400:
+            # Log error but continue with what we have
+            import logging
+            logger = logging.getLogger("firecrawl")
+            logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
+            break
+
+        page_data = response.json()
+
+        if not page_data.get("success"):
+            break
+
+        # Add documents from this page
+        for doc_data in page_data.get("data", []):
+            if isinstance(doc_data, dict):
+                # Check max_results limit
+                if (max_results is not None) and (len(documents) >= max_results):
+                    break
+                normalized = normalize_document_input(doc_data)
+                documents.append(Document(**normalized))
+
+        # Check if we hit max_results limit
+        if (max_results is not None) and (len(documents) >= max_results):
+            break
+
+        # Get next URL
+        current_url = page_data.get("next")
+        page_count += 1
+
+    return documents
+
+
 async def cancel_crawl(client: AsyncHttpClient, job_id: str) -> bool:
+    """
+    Cancel a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        job_id: ID of the crawl job
+
+    Returns:
+        True if cancellation was successful
+
+    Raises:
+        Exception: If the cancellation operation fails
+    """
     response = await client.delete(f"/v2/crawl/{job_id}")
     if response.status_code >= 400:
         handle_response_error(response, "cancel crawl")
@@ -100,6 +230,20 @@ async def cancel_crawl(client: AsyncHttpClient, job_id: str) -> bool:
 
 
 async def crawl_params_preview(client: AsyncHttpClient, request: CrawlParamsRequest) -> CrawlParamsData:
+    """
+    Preview crawl parameters before starting a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        request: CrawlParamsRequest containing URL and prompt
+
+    Returns:
+        CrawlParamsData containing crawl configuration
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the parameter preview fails
+    """
     if not request.url or not request.url.strip():
         raise ValueError("URL cannot be empty")
     if not request.prompt or not request.prompt.strip():
@@ -138,6 +282,19 @@ async def crawl_params_preview(client: AsyncHttpClient, request: CrawlParamsRequ
 
 
 async def get_crawl_errors(client: AsyncHttpClient, crawl_id: str) -> CrawlErrorsResponse:
+    """
+    Get errors from a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        crawl_id: ID of the crawl job
+
+    Returns:
+        CrawlErrorsResponse with errors and robots blocked
+
+    Raises:
+        Exception: If the error check operation fails
+    """
     response = await client.get(f"/v2/crawl/{crawl_id}/errors")
     if response.status_code >= 400:
         handle_response_error(response, "check crawl errors")
@@ -151,6 +308,18 @@ async def get_crawl_errors(client: AsyncHttpClient, crawl_id: str) -> CrawlError
 
 
 async def get_active_crawls(client: AsyncHttpClient) -> ActiveCrawlsResponse:
+    """
+    Get active crawl jobs.
+
+    Args:
+        client: Async HTTP client instance
+
+    Returns:
+        ActiveCrawlsResponse with active crawl jobs
+
+    Raises:
+        Exception: If the active crawl jobs operation fails
+    """
     response = await client.get("/v2/crawl/active")
     if response.status_code >= 400:
         handle_response_error(response, "get active crawls")
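
The crawl and batch helpers above only ever read four attributes from PaginationConfig (auto_paginate, max_pages, max_results, max_wait_time); the model itself lives in the types module, which this diff does not include. An illustrative definition consistent with that usage, with the Pydantic base class and defaults being assumptions:

from typing import Optional

from pydantic import BaseModel  # base class assumed; the SDK's types module is not shown here


class PaginationConfig(BaseModel):
    """Controls how get_crawl_status / get_batch_scrape_status follow "next" links."""

    auto_paginate: bool = True              # False: return only the first page and keep "next" set
    max_pages: Optional[int] = None         # stop after fetching this many additional pages
    max_results: Optional[int] = None       # stop once this many documents have been collected
    max_wait_time: Optional[float] = None   # stop after roughly this many seconds of pagination
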
@@ -11,6 +11,7 @@ from ..types import (
     ScrapeOptions,
     Document,
     WebhookConfig,
+    PaginationConfig,
 )
 from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
 from ..utils.normalize import normalize_document_input
@@ -77,7 +78,8 @@ def start_batch_scrape(
 
 def get_batch_scrape_status(
     client: HttpClient,
-    job_id: str
+    job_id: str,
+    pagination_config: Optional[PaginationConfig] = None
 ) -> BatchScrapeJob:
     """
     Get the status of a batch scrape job.
@@ -85,9 +87,10 @@ def get_batch_scrape_status(
     Args:
         client: HTTP client instance
         job_id: ID of the batch scrape job
+        pagination_config: Optional configuration for pagination behavior
 
     Returns:
-        BatchScrapeStatusResponse containing job status and data
+        BatchScrapeJob containing job status and data
 
     Raises:
         FirecrawlError: If the status check fails
@@ -111,17 +114,99 @@ def get_batch_scrape_status(
             normalized = normalize_document_input(doc)
             documents.append(Document(**normalized))
 
+    # Handle pagination if requested
+    auto_paginate = pagination_config.auto_paginate if pagination_config else True
+    if auto_paginate and body.get("next"):
+        documents = _fetch_all_batch_pages(
+            client,
+            body.get("next"),
+            documents,
+            pagination_config
+        )
+
     return BatchScrapeJob(
         status=body.get("status"),
         completed=body.get("completed", 0),
         total=body.get("total", 0),
         credits_used=body.get("creditsUsed"),
         expires_at=body.get("expiresAt"),
-        next=body.get("next"),
+        next=body.get("next") if not auto_paginate else None,
        data=documents,
     )
 
 
+def _fetch_all_batch_pages(
+    client: HttpClient,
+    next_url: str,
+    initial_documents: List[Document],
+    pagination_config: Optional[PaginationConfig] = None
+) -> List[Document]:
+    """
+    Fetch all pages of batch scrape results.
+
+    Args:
+        client: HTTP client instance
+        next_url: URL for the next page
+        initial_documents: Documents from the first page
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        List of all documents from all pages
+    """
+    documents = initial_documents.copy()
+    current_url = next_url
+    page_count = 0
+
+    # Apply pagination limits
+    max_pages = pagination_config.max_pages if pagination_config else None
+    max_results = pagination_config.max_results if pagination_config else None
+    max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+    start_time = time.monotonic()
+
+    while current_url:
+        # Check pagination limits (treat 0 as a valid limit)
+        if (max_pages is not None) and page_count >= max_pages:
+            break
+
+        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+            break
+
+        # Fetch next page
+        response = client.get(current_url)
+
+        if not response.ok:
+            # Log error but continue with what we have
+            import logging
+            logger = logging.getLogger("firecrawl")
+            logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
+            break
+
+        page_data = response.json()
+
+        if not page_data.get("success"):
+            break
+
+        # Add documents from this page
+        for doc in page_data.get("data", []) or []:
+            if isinstance(doc, dict):
+                # Check max_results limit
+                if max_results is not None and len(documents) >= max_results:
+                    break
+                normalized = normalize_document_input(doc)
+                documents.append(Document(**normalized))
+
+        # Check if we hit max_results limit after adding all docs from this page
+        if max_results is not None and len(documents) >= max_results:
+            break
+
+        # Get next URL
+        current_url = page_data.get("next")
+        page_count += 1
+
+    return documents
+
+
 def cancel_batch_scrape(
     client: HttpClient,
     job_id: str
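
When auto_paginate is disabled, get_batch_scrape_status returns only the first page and leaves the job's next URL populated, so callers can walk the remaining pages themselves. A sketch of that pattern, assuming client is an already-configured HttpClient and that PaginationConfig and get_batch_scrape_status are in scope as defined above:

def collect_first_pages(client, job_id: str, pages: int = 3):
    """Fetch the first page via the helper, then follow up to pages - 1 "next" links manually."""
    config = PaginationConfig(auto_paginate=False)
    job = get_batch_scrape_status(client, job_id, pagination_config=config)

    documents = list(job.data)
    next_url, fetched = job.next, 1
    while next_url and fetched < pages:
        page = client.get(next_url).json()            # same HttpClient call the helper above uses
        documents.extend(page.get("data", []) or [])  # raw dicts; normalize before building Documents
        next_url = page.get("next")
        fetched += 1
    return documents
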
@@ -173,7 +258,7 @@ def wait_for_batch_completion(
         FirecrawlError: If the job fails or timeout is reached
         TimeoutError: If timeout is reached
     """
-    start_time = time.time()
+    start_time = time.monotonic()
 
     while True:
         status_job = get_batch_scrape_status(client, job_id)
@@ -183,7 +268,7 @@
             return status_job
 
         # Check timeout
-        if timeout and (time.time() - start_time) > timeout:
+        if timeout and (time.monotonic() - start_time) > timeout:
             raise TimeoutError(f"Batch scrape job {job_id} did not complete within {timeout} seconds")
 
         # Wait before next poll
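
The final two hunks swap time.time() for time.monotonic() in the polling loop. A monotonic clock is unaffected by wall-clock adjustments (NTP corrections, manual clock changes), so an elapsed-time timeout can neither fire early nor stall. The general pattern, shown here as a standalone illustration rather than SDK code:

import time


def wait_until(predicate, timeout: float, poll_interval: float = 1.0) -> bool:
    """Poll predicate() until it returns True, using a monotonic clock for the timeout."""
    start = time.monotonic()
    while True:
        if predicate():
            return True
        if (time.monotonic() - start) > timeout:
            raise TimeoutError(f"condition not met within {timeout} seconds")
        time.sleep(poll_interval)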