firecrawl-py 3.1.1__tar.gz → 3.2.1__tar.gz
This diff shows the content of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.
Potentially problematic release: this version of firecrawl-py might be problematic.
- {firecrawl_py-3.1.1/firecrawl_py.egg-info → firecrawl_py-3.2.1}/PKG-INFO +2 -2
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__init__.py +1 -1
- firecrawl_py-3.2.1/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +248 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_search.py +10 -6
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/types.py +6 -2
- firecrawl_py-3.2.1/firecrawl/v2/methods/aio/search.py +172 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/search.py +52 -43
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/types.py +30 -6
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/http_client_async.py +1 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1/firecrawl_py.egg-info}/PKG-INFO +2 -2
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl_py.egg-info/requires.txt +1 -1
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/pyproject.toml +1 -1
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/setup.py +1 -1
- firecrawl_py-3.1.1/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -183
- firecrawl_py-3.1.1/firecrawl/v2/methods/aio/search.py +0 -55
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/LICENSE +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/README.md +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_extract.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_map.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/client.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/firecrawl.backup.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v1/__init__.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v1/client.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/__init__.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/client.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/client_async.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/__init__.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/batch.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/crawl.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/extract.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/map.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/scrape.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/aio/usage.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/batch.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/crawl.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/extract.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/map.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/scrape.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/methods/usage.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/__init__.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/error_handler.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/get_version.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/http_client.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/normalize.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/utils/validation.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/watcher.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl/v2/watcher_async.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl_py.egg-info/SOURCES.txt +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl_py.egg-info/dependency_links.txt +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/firecrawl_py.egg-info/top_level.txt +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/setup.cfg +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/tests/test_change_tracking.py +0 -0
- {firecrawl_py-3.1.1 → firecrawl_py-3.2.1}/tests/test_timeout_conversion.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: firecrawl-py
-Version: 3.1.1
+Version: 3.2.1
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/firecrawl/firecrawl
 Author: Mendable.ai
@@ -38,7 +38,7 @@ Requires-Dist: httpx
 Requires-Dist: python-dotenv
 Requires-Dist: websockets
 Requires-Dist: nest-asyncio
-Requires-Dist: pydantic
+Requires-Dist: pydantic>=2.0
 Requires-Dist: aiohttp
 Dynamic: author
 Dynamic: home-page
firecrawl/__tests__/e2e/v2/aio/test_aio_search.py (new file)

@@ -0,0 +1,248 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawl
+from firecrawl.types import (
+    SearchData,
+    Document,
+    ScrapeOptions,
+    ScrapeFormats,
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
+)
+
+load_dotenv()
+
+firecrawl = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+def _collect_texts(entries):
+    texts = []
+    for r in entries or []:
+        title = getattr(r, 'title', None) if hasattr(r, 'title') else None
+        desc = getattr(r, 'description', None) if hasattr(r, 'description') else None
+        if title:
+            texts.append(str(title).lower())
+        if desc:
+            texts.append(str(desc).lower())
+    return texts
+
+def _is_document(entry) -> bool:
+    try:
+        from firecrawl.v2.types import Document
+        return isinstance(entry, Document) or \
+            hasattr(entry, 'markdown') or \
+            hasattr(entry, 'html') or \
+            hasattr(entry, 'raw_html') or \
+            hasattr(entry, 'json') or \
+            hasattr(entry, 'screenshot') or \
+            hasattr(entry, 'change_tracking') or \
+            hasattr(entry, 'summary')
+    except Exception:
+        return hasattr(entry, 'markdown') or \
+            hasattr(entry, 'html') or \
+            hasattr(entry, 'raw_html') or \
+            hasattr(entry, 'json') or \
+            hasattr(entry, 'screenshot') or \
+            hasattr(entry, 'change_tracking') or \
+            hasattr(entry, 'summary')
+
+@pytest.mark.asyncio
+async def test_async_search_minimal_request():
+    results = await firecrawl.search(
+        query="What is the capital of France?"
+    )
+    assert isinstance(results, SearchData)
+    assert hasattr(results, 'web')
+    assert results.web is not None
+    assert len(results.web) > 0
+    assert hasattr(results, 'news')
+    assert results.news is None
+    assert hasattr(results, 'images')
+    assert results.images is None
+
+    for result in results.web:
+        assert isinstance(result, SearchResultWeb)
+        assert hasattr(result, 'url')
+        assert hasattr(result, 'title')
+        assert hasattr(result, 'description')
+        assert result.url.startswith('http')
+        assert result.title is not None
+        assert result.description is not None
+
+    all_text = ' '.join(_collect_texts(results.web))
+    assert 'paris' in all_text
+
+    assert results.news is None
+    assert results.images is None
+
+@pytest.mark.asyncio
+async def test_async_search_with_sources():
+    results = await firecrawl.search(
+        query="firecrawl",
+        sources=["web", "news", "images"],
+        limit=3
+    )
+    assert isinstance(results, SearchData)
+    assert results.web is not None
+    assert len(results.web) <= 3
+    assert isinstance(results.web[0], SearchResultWeb)
+
+    if results.news is not None:
+        assert len(results.news) <= 3
+        assert isinstance(results.news[0], SearchResultNews)
+
+    if results.images is not None:
+        assert len(results.images) <= 3
+        assert isinstance(results.images[0], SearchResultImages)
+
+    web_titles = [result.title.lower() for result in results.web]
+    web_descriptions = [result.description.lower() for result in results.web]
+    all_web_text = ' '.join(web_titles + web_descriptions)
+    assert 'firecrawl' in all_web_text
+
+@pytest.mark.asyncio
+async def test_async_search_result_structure():
+    results = await firecrawl.search(
+        query="test query",
+        limit=1
+    )
+    if results.web and len(results.web) > 0:
+        result = results.web[0]
+        assert hasattr(result, 'url')
+        assert hasattr(result, 'title')
+        assert hasattr(result, 'description')
+        assert isinstance(result.url, str)
+        assert isinstance(result.title, str) or result.title is None
+        assert isinstance(result.description, str) or result.description is None
+        assert result.url.startswith('http')
+
+@pytest.mark.asyncio
+async def test_async_search_all_parameters():
+    from firecrawl.types import ScrapeOptions, Location, WaitAction
+    schema = {
+        "type": "object",
+        "properties": {
+            "title": {"type": "string"},
+            "description": {"type": "string"},
+            "url": {"type": "string"}
+        },
+        "required": ["title", "description"]
+    }
+    results = await firecrawl.search(
+        query="artificial intelligence",
+        sources=[
+            {"type": "web"},
+            {"type": "news"}
+        ],
+        limit=3,
+        tbs="qdr:m",
+        location="US",
+        ignore_invalid_urls=True,
+        timeout=60000,
+        scrape_options=ScrapeOptions(
+            formats=[
+                "markdown",
+                "html",
+                {
+                    "type": "json",
+                    "prompt": "Extract the title and description from the page",
+                    "schema": schema
+                },
+                {"type": "summary"}
+            ],
+            headers={"User-Agent": "Firecrawl-Test/1.0"},
+            include_tags=["h1", "h2", "p"],
+            exclude_tags=["nav", "footer"],
+            only_main_content=True,
+            wait_for=2000,
+            mobile=False,
+            skip_tls_verification=False,
+            remove_base64_images=True,
+            block_ads=True,
+            proxy="basic",
+            max_age=3600000,
+            store_in_cache=True,
+            location=Location(
+                country="US",
+                languages=["en"]
+            ),
+            actions=[
+                WaitAction(milliseconds=1000)
+            ]
+        )
+    )
+    assert isinstance(results, SearchData)
+    assert hasattr(results, 'web')
+    assert hasattr(results, 'news')
+    assert hasattr(results, 'images')
+    assert results.web is not None
+    assert len(results.web) <= 3
+
+    non_doc_entries = [r for r in (results.web or []) if not _is_document(r)]
+    if non_doc_entries:
+        all_web_text = ' '.join(_collect_texts(non_doc_entries))
+        ai_terms = ['artificial', 'intelligence', 'ai', 'machine', 'learning']
+        assert any(term in all_web_text for term in ai_terms)
+
+    for result in results.web:
+        assert isinstance(result, (SearchResultWeb, Document))
+        if isinstance(result, Document):
+            assert (result.markdown is not None) or (result.html is not None)
+        else:
+            assert hasattr(result, 'url')
+            assert isinstance(result.url, str) and result.url.startswith('http')
+
+    if results.news is not None:
+        assert len(results.news) <= 3
+        for result in results.news:
+            assert isinstance(result, (SearchResultNews, Document))
+            if isinstance(result, Document):
+                assert (result.markdown is not None) or (result.html is not None)
+            else:
+                assert hasattr(result, 'url')
+                assert isinstance(result.url, str) and result.url.startswith('http')
+
+    assert results.images is None
+
+@pytest.mark.asyncio
+async def test_async_search_formats_flexibility():
+    # Test with list format
+    results1 = await firecrawl.search(
+        query="python programming",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=["markdown"]
+        )
+    )
+    # Test with ScrapeFormats object
+    results2 = await firecrawl.search(
+        query="python programming",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=ScrapeFormats(markdown=True)
+        )
+    )
+    assert isinstance(results1, SearchData)
+    assert isinstance(results2, SearchData)
+    assert results1.web is not None
+    assert results2.web is not None
+
+@pytest.mark.asyncio
+async def test_async_search_with_json_format_object():
+    json_schema = {
+        "type": "object",
+        "properties": {
+            "title": {"type": "string"}
+        },
+        "required": ["title"],
+    }
+    results = await firecrawl.search(
+        query="site:docs.firecrawl.dev",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}]
+        ),
+    )
+    assert isinstance(results, SearchData)
+    assert results.web is not None and len(results.web) >= 0
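For orientation, the async search surface that this new test module exercises can be driven with a short script along the following lines. This is a minimal sketch inferred from the tests above, not documentation: the API_KEY/API_URL environment variables and the exact result fields are assumptions carried over from the test code.

import asyncio
import os

from firecrawl import AsyncFirecrawl

async def main():
    # Same construction as in the tests above; API_KEY/API_URL are assumed to be set.
    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))

    # Search across several sources; results come back grouped per source.
    results = await client.search(
        query="firecrawl",
        sources=["web", "news", "images"],
        limit=3,
    )

    for entry in results.web or []:
        # Plain hits expose url/title/description; when scrape_options are passed,
        # entries may instead be full Document objects with markdown/html content.
        print(entry.url, "-", entry.title)

if __name__ == "__main__":
    asyncio.run(main())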
firecrawl/__tests__/e2e/v2/test_search.py

@@ -1,7 +1,7 @@
 from firecrawl import Firecrawl
 import os
 from dotenv import load_dotenv
-from firecrawl.types import SearchData,
+from firecrawl.types import SearchData, Document, ScrapeOptions, SearchResultWeb, SearchResultNews, SearchResultImages
 
 load_dotenv()
 
@@ -53,7 +53,7 @@ def test_search_minimal_request():
     assert results.images is None
 
     for result in results.web:
-        assert isinstance(result,
+        assert isinstance(result, SearchResultWeb)
         assert hasattr(result, 'url')
         assert hasattr(result, 'title')
         assert hasattr(result, 'description')
@@ -73,7 +73,7 @@ def test_search_with_sources():
     """Test search with specific sources."""
     results = firecrawl.search(
        query="firecrawl",
-        sources=["web", "news"],
+        sources=["web", "news", "images"],
         limit=3
     )
 
@@ -81,11 +81,15 @@ def test_search_with_sources():
 
     assert results.web is not None
     assert len(results.web) <= 3
+    assert isinstance(results.web[0], SearchResultWeb)
 
     if results.news is not None:
         assert len(results.news) <= 3
+        assert isinstance(results.news[0], SearchResultNews)
 
-
+    if results.images is not None:
+        assert len(results.images) <= 3
+        assert isinstance(results.images[0], SearchResultImages)
 
     web_titles = [result.title.lower() for result in results.web]
     web_descriptions = [result.description.lower() for result in results.web]
@@ -193,7 +197,7 @@ def test_search_all_parameters():
 
     # Test that each result has proper structure
     for result in results.web:
-        assert isinstance(result, (
+        assert isinstance(result, (SearchResultWeb, Document))
         if isinstance(result, Document):
             # Document path: ensure content present
             assert (result.markdown is not None) or (result.html is not None)
@@ -206,7 +210,7 @@ def test_search_all_parameters():
     if results.news is not None:
         assert len(results.news) <= 3
         for result in results.news:
-            assert isinstance(result, (
+            assert isinstance(result, (SearchResultNews, Document))
             if isinstance(result, Document):
                 assert (result.markdown is not None) or (result.html is not None)
             else:
firecrawl/types.py

@@ -48,7 +48,9 @@ from .v2.types import (
     JsonFormat,
     FormatOption,
     SearchRequest,
-
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
     SearchData,
     SearchResponse,
 
@@ -124,7 +126,9 @@ __all__ = [
     'JsonFormat',
     'FormatOption',
     'SearchRequest',
-    '
+    'SearchResultWeb',
+    'SearchResultNews',
+    'SearchResultImages',
     'SearchData',
     'SearchResponse',
 
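With these re-exports in place, the per-source result types can be imported straight from firecrawl.types. Below is a small sketch of the kind of isinstance-based handling the updated e2e tests rely on; the describe helper is purely illustrative and not part of the package.

from firecrawl.types import Document, SearchResultImages, SearchResultNews, SearchResultWeb

def describe(entry):
    # Search entries are either lightweight per-source results or full
    # Documents when scrape_options were supplied with the search request.
    if isinstance(entry, Document):
        return "document with scraped content"
    if isinstance(entry, (SearchResultWeb, SearchResultNews, SearchResultImages)):
        return f"search hit: {entry.url}"
    return "unknown entry type"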
firecrawl/v2/methods/aio/search.py (new file)

@@ -0,0 +1,172 @@
+import re
+from typing import Dict, Any, Union, List, TypeVar, Type
+from ...types import (
+    SearchRequest,
+    SearchData,
+    Document,
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
+)
+from ...utils.http_client_async import AsyncHttpClient
+from ...utils.error_handler import handle_response_error
+from ...utils.validation import validate_scrape_options, prepare_scrape_options
+
+T = TypeVar("T")
+
+async def search(
+    client: AsyncHttpClient,
+    request: SearchRequest
+) -> SearchData:
+    """
+    Async search for documents.
+
+    Args:
+        client: Async HTTP client instance
+        request: Search request
+
+    Returns:
+        SearchData with search results grouped by source type
+
+    Raises:
+        FirecrawlError: If the search operation fails
+    """
+    request_data = _prepare_search_request(request)
+    try:
+        response = await client.post("/v2/search", request_data)
+        if response.status_code != 200:
+            handle_response_error(response, "search")
+        response_data = response.json()
+        if not response_data.get("success"):
+            handle_response_error(response, "search")
+        data = response_data.get("data", {}) or {}
+        out = SearchData()
+        if "web" in data:
+            out.web = _transform_array(data["web"], SearchResultWeb)
+        if "news" in data:
+            out.news = _transform_array(data["news"], SearchResultNews)
+        if "images" in data:
+            out.images = _transform_array(data["images"], SearchResultImages)
+        return out
+    except Exception as err:
+        if hasattr(err, "response"):
+            handle_response_error(getattr(err, "response"), "search")
+        raise err
+
+def _transform_array(arr: List[Any], result_type: Type[T]) -> List[Union[T, Document]]:
+    """
+    Transforms an array of items into a list of result_type or Document.
+    If the item dict contains any of the special keys, it is treated as a Document.
+    Otherwise, it is treated as result_type.
+    If the item is not a dict, it is wrapped as result_type with url=item.
+    """
+    results: List[Union[T, Document]] = []
+    for item in arr:
+        if item and isinstance(item, dict):
+            if (
+                "markdown" in item or
+                "html" in item or
+                "rawHtml" in item or
+                "links" in item or
+                "screenshot" in item or
+                "changeTracking" in item or
+                "summary" in item or
+                "json" in item
+            ):
+                results.append(Document(**item))
+            else:
+                results.append(result_type(**item))
+        else:
+            results.append(result_type(url=item))
+    return results
+
+def _validate_search_request(request: SearchRequest) -> SearchRequest:
+    """
+    Validate and normalize search request.
+
+    Args:
+        request: Search request to validate
+
+    Returns:
+        Validated request
+
+    Raises:
+        ValueError: If request is invalid
+    """
+    if not request.query or not request.query.strip():
+        raise ValueError("Query cannot be empty")
+
+    if request.limit is not None:
+        if request.limit <= 0:
+            raise ValueError("Limit must be positive")
+        if request.limit > 100:
+            raise ValueError("Limit cannot exceed 100")
+
+    if request.timeout is not None:
+        if request.timeout <= 0:
+            raise ValueError("Timeout must be positive")
+        if request.timeout > 300000:
+            raise ValueError("Timeout cannot exceed 300000ms (5 minutes)")
+
+    if request.sources is not None:
+        valid_sources = {"web", "news", "images"}
+        for source in request.sources:
+            if isinstance(source, str):
+                if source not in valid_sources:
+                    raise ValueError(f"Invalid source type: {source}. Valid types: {valid_sources}")
+            elif hasattr(source, 'type'):
+                if source.type not in valid_sources:
+                    raise ValueError(f"Invalid source type: {source.type}. Valid types: {valid_sources}")
+
+    if request.location is not None:
+        if not isinstance(request.location, str) or len(request.location.strip()) == 0:
+            raise ValueError("Location must be a non-empty string")
+
+    if request.tbs is not None:
+        valid_tbs_values = {
+            "qdr:h", "qdr:d", "qdr:w", "qdr:m", "qdr:y",
+            "d", "w", "m", "y"
+        }
+        if request.tbs in valid_tbs_values:
+            pass
+        elif request.tbs.startswith("cdr:"):
+            custom_date_pattern = r"^cdr:1,cd_min:\d{1,2}/\d{1,2}/\d{4},cd_max:\d{1,2}/\d{1,2}/\d{4}$"
+            if not re.match(custom_date_pattern, request.tbs):
+                raise ValueError(f"Invalid custom date range format: {request.tbs}. Expected format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
+        else:
+            raise ValueError(f"Invalid tbs value: {request.tbs}. Valid values: {valid_tbs_values} or custom date range format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
+
+    if request.scrape_options is not None:
+        validate_scrape_options(request.scrape_options)
+
+    return request
+
+def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
+    """
+    Prepare a search request payload.
+
+    Args:
+        request: Search request
+
+    Returns:
+        Request payload dictionary
+    """
+    validated_request = _validate_search_request(request)
+    data = validated_request.model_dump(exclude_none=True, by_alias=True)
+
+    if "limit" not in data and validated_request.limit is not None:
+        data["limit"] = validated_request.limit
+    if "timeout" not in data and validated_request.timeout is not None:
+        data["timeout"] = validated_request.timeout
+
+    if validated_request.ignore_invalid_urls is not None:
+        data["ignoreInvalidURLs"] = validated_request.ignore_invalid_urls
+        data.pop("ignore_invalid_urls", None)
+
+    if validated_request.scrape_options is not None:
+        scrape_data = prepare_scrape_options(validated_request.scrape_options)
+        if scrape_data:
+            data["scrapeOptions"] = scrape_data
+        data.pop("scrape_options", None)
+
+    return data
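The classification rule in _transform_array is the core of this new module: a dict entry is promoted to a Document as soon as it carries any scraped-content key, other dicts are parsed as the per-source result model, and bare strings are wrapped as result_type(url=item). Below is a rough, self-contained illustration of that rule; the payloads are hypothetical and the loop re-states the helper's logic rather than calling the SDK.

# Hypothetical raw entries, shaped like items in a response's "web" array.
entries = [
    {"url": "https://example.com", "title": "Example", "description": "Plain hit"},
    {"url": "https://example.com/page", "markdown": "# Example"},  # carries scraped content
    "https://example.com/bare-url",                                # bare URL string
]

# Mirrors the helper's rule: any scraped-content key promotes a dict to Document;
# other dicts map to the per-source result model; non-dict items are wrapped
# as result_type(url=item).
content_keys = {"markdown", "html", "rawHtml", "links", "screenshot", "changeTracking", "summary", "json"}
for item in entries:
    if isinstance(item, dict) and content_keys & item.keys():
        kind = "Document"
    elif isinstance(item, dict):
        kind = "SearchResultWeb"
    else:
        kind = "SearchResultWeb(url=item)"
    print(item, "->", kind)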
firecrawl/v2/methods/search.py

@@ -3,11 +3,12 @@ Search functionality for Firecrawl v2 API.
 """
 
 import re
-from typing import
-from ..types import SearchRequest, SearchData,
+from typing import Dict, Any, Union, List, TypeVar, Type
+from ..types import SearchRequest, SearchData, Document, SearchResultWeb, SearchResultNews, SearchResultImages
 from ..utils.normalize import normalize_document_input
 from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
 
+T = TypeVar("T")
 
 def search(
     client: HttpClient,
@@ -27,48 +28,56 @@ def search(
         FirecrawlError: If the search operation fails
     """
     request_data = _prepare_search_request(request)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            if request.scrape_options is not None and any(
-                key in doc_data for key in ['markdown', 'html', 'rawHtml', 'links', 'summary', 'screenshot', 'changeTracking']
-            ):
-                normalized = normalize_document_input(doc_data)
-                results.append(Document(**normalized))
-            else:
-                # Minimal search result shape
-                results.append(SearchResult(
-                    url=doc_data.get('url', ''),
-                    title=doc_data.get('title'),
-                    description=doc_data.get('description')
-                ))
-        elif isinstance(doc_data, str):
-            results.append(SearchResult(url=doc_data))
-
-        if hasattr(search_data, source_type):
-            setattr(search_data, source_type, results)
-
-    return search_data
+    try:
+        response = client.post("/v2/search", request_data)
+        if response.status_code != 200:
+            handle_response_error(response, "search")
+        response_data = response.json()
+        if not response_data.get("success"):
+            handle_response_error(response, "search")
+        data = response_data.get("data", {}) or {}
+        out = SearchData()
+        if "web" in data:
+            out.web = _transform_array(data["web"], SearchResultWeb)
+        if "news" in data:
+            out.news = _transform_array(data["news"], SearchResultNews)
+        if "images" in data:
+            out.images = _transform_array(data["images"], SearchResultImages)
+        return out
+    except Exception as err:
+        # If the error is an HTTP error from requests, handle it
+        # (simulate isAxiosError by checking for requests' HTTPError or Response)
+        if hasattr(err, "response"):
+            handle_response_error(getattr(err, "response"), "search")
+        raise err
 
+def _transform_array(arr: List[Any], result_type: Type[T]) -> List[Union[T, 'Document']]:
+    """
+    Transforms an array of items into a list of result_type or Document.
+    If the item dict contains any of the special keys, it is treated as a Document.
+    Otherwise, it is treated as result_type.
+    If the item is not a dict, it is wrapped as result_type with url=item.
+    """
+    results: List[Union[T, 'Document']] = []
+    for item in arr:
+        if item and isinstance(item, dict):
+            if (
+                "markdown" in item or
+                "html" in item or
+                "rawHtml" in item or
+                "links" in item or
+                "screenshot" in item or
+                "changeTracking" in item or
+                "summary" in item or
+                "json" in item
+            ):
+                results.append(Document(**item))
+            else:
+                results.append(result_type(**item))
+        else:
+            # For non-dict items, assume it's a URL and wrap in result_type
+            results.append(result_type(url=item))
+    return results
 
 def _validate_search_request(request: SearchRequest) -> SearchRequest:
     """