firecrawl 3.0.3__tar.gz → 3.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl has been flagged as possibly problematic.

Files changed (87)
  1. {firecrawl-3.0.3 → firecrawl-3.2.0}/LICENSE +0 -0
  2. {firecrawl-3.0.3 → firecrawl-3.2.0}/PKG-INFO +6 -2
  3. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__init__.py +2 -2
  4. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -1
  5. firecrawl-3.2.0/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +248 -0
  6. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/test_search.py +10 -6
  7. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +47 -17
  8. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/client.py +1 -0
  9. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/types.py +6 -2
  10. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/aio/crawl.py +2 -5
  11. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/aio/scrape.py +2 -5
  12. firecrawl-3.2.0/firecrawl/v2/methods/aio/search.py +172 -0
  13. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/batch.py +2 -5
  14. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/crawl.py +2 -1
  15. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/scrape.py +2 -6
  16. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/search.py +65 -52
  17. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/types.py +98 -8
  18. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/utils/http_client_async.py +1 -0
  19. firecrawl-3.2.0/firecrawl/v2/utils/normalize.py +107 -0
  20. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/watcher.py +4 -15
  21. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/watcher_async.py +2 -5
  22. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl.egg-info/PKG-INFO +6 -2
  23. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl.egg-info/SOURCES.txt +1 -0
  24. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl.egg-info/top_level.txt +1 -0
  25. {firecrawl-3.0.3 → firecrawl-3.2.0}/tests/test_change_tracking.py +0 -0
  26. firecrawl-3.0.3/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -183
  27. firecrawl-3.0.3/firecrawl/v2/methods/aio/search.py +0 -58
  28. {firecrawl-3.0.3 → firecrawl-3.2.0}/README.md +0 -0
  29. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -0
  30. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -0
  31. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -0
  32. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -0
  33. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
  34. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
  35. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
  36. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
  37. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -0
  38. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -0
  39. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/test_extract.py +0 -0
  40. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/test_map.py +0 -0
  41. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -0
  42. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
  43. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
  44. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
  45. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -0
  46. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
  47. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -0
  48. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
  49. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -0
  50. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
  51. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
  52. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -0
  53. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
  54. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
  55. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
  56. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -0
  57. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -0
  58. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -0
  59. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
  60. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
  61. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
  62. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
  63. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/firecrawl.backup.py +0 -0
  64. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v1/__init__.py +0 -0
  65. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v1/client.py +0 -0
  66. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/__init__.py +0 -0
  67. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/client.py +0 -0
  68. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/client_async.py +0 -0
  69. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/aio/__init__.py +0 -0
  70. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/aio/batch.py +0 -0
  71. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/aio/extract.py +0 -0
  72. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/aio/map.py +0 -0
  73. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/aio/usage.py +0 -0
  74. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/extract.py +0 -0
  75. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/map.py +0 -0
  76. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/usage.py +0 -0
  77. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/utils/__init__.py +0 -0
  78. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/utils/error_handler.py +0 -0
  79. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/utils/get_version.py +0 -0
  80. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/utils/http_client.py +0 -0
  81. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/utils/validation.py +0 -0
  82. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl.egg-info/dependency_links.txt +0 -0
  83. {firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl.egg-info/requires.txt +0 -0
  84. {firecrawl-3.0.3 → firecrawl-3.2.0}/pyproject.toml +0 -0
  85. {firecrawl-3.0.3 → firecrawl-3.2.0}/setup.cfg +0 -0
  86. {firecrawl-3.0.3 → firecrawl-3.2.0}/setup.py +0 -0
  87. {firecrawl-3.0.3 → firecrawl-3.2.0}/tests/test_timeout_conversion.py +0 -0

{firecrawl-3.0.3 → firecrawl-3.2.0}/PKG-INFO (the same change appears in firecrawl.egg-info/PKG-INFO)

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: firecrawl
-Version: 3.0.3
+Version: 3.2.0
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/firecrawl/firecrawl
 Author: Mendable.ai
@@ -40,6 +40,10 @@ Requires-Dist: websockets
 Requires-Dist: nest-asyncio
 Requires-Dist: pydantic
 Requires-Dist: aiohttp
+Dynamic: author
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-python

 # Firecrawl Python SDK


{firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__init__.py

@@ -17,7 +17,7 @@ from .v1 import (
     V1ChangeTrackingOptions,
 )

-__version__ = "3.0.3"
+__version__ = "3.2.0"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -84,4 +84,4 @@ __all__ = [
     'V1JsonConfig',
     'V1ScrapeOptions',
     'V1ChangeTrackingOptions',
-]
+]
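
The bumped version is visible at runtime through the package's module-level `__version__` shown above. A trivial check, assuming the upgraded package is installed:

import firecrawl

# Reports the release of the installed SDK.
print(firecrawl.__version__)  # "3.2.0" after upgrading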

{firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py

@@ -96,7 +96,6 @@ async def test_async_get_crawl_status_shape():
     assert status.status in ("scraping", "completed", "failed")
     assert status.completed >= 0
     assert status.expires_at is not None
-    assert status.next is not None
     assert isinstance(status.data, list)



firecrawl-3.2.0/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py (new file)

@@ -0,0 +1,248 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawl
+from firecrawl.types import (
+    SearchData,
+    Document,
+    ScrapeOptions,
+    ScrapeFormats,
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
+)
+
+load_dotenv()
+
+firecrawl = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+def _collect_texts(entries):
+    texts = []
+    for r in entries or []:
+        title = getattr(r, 'title', None) if hasattr(r, 'title') else None
+        desc = getattr(r, 'description', None) if hasattr(r, 'description') else None
+        if title:
+            texts.append(str(title).lower())
+        if desc:
+            texts.append(str(desc).lower())
+    return texts
+
+def _is_document(entry) -> bool:
+    try:
+        from firecrawl.v2.types import Document
+        return isinstance(entry, Document) or \
+            hasattr(entry, 'markdown') or \
+            hasattr(entry, 'html') or \
+            hasattr(entry, 'raw_html') or \
+            hasattr(entry, 'json') or \
+            hasattr(entry, 'screenshot') or \
+            hasattr(entry, 'change_tracking') or \
+            hasattr(entry, 'summary')
+    except Exception:
+        return hasattr(entry, 'markdown') or \
+            hasattr(entry, 'html') or \
+            hasattr(entry, 'raw_html') or \
+            hasattr(entry, 'json') or \
+            hasattr(entry, 'screenshot') or \
+            hasattr(entry, 'change_tracking') or \
+            hasattr(entry, 'summary')
+
+@pytest.mark.asyncio
+async def test_async_search_minimal_request():
+    results = await firecrawl.search(
+        query="What is the capital of France?"
+    )
+    assert isinstance(results, SearchData)
+    assert hasattr(results, 'web')
+    assert results.web is not None
+    assert len(results.web) > 0
+    assert hasattr(results, 'news')
+    assert results.news is None
+    assert hasattr(results, 'images')
+    assert results.images is None
+
+    for result in results.web:
+        assert isinstance(result, SearchResultWeb)
+        assert hasattr(result, 'url')
+        assert hasattr(result, 'title')
+        assert hasattr(result, 'description')
+        assert result.url.startswith('http')
+        assert result.title is not None
+        assert result.description is not None
+
+    all_text = ' '.join(_collect_texts(results.web))
+    assert 'paris' in all_text
+
+    assert results.news is None
+    assert results.images is None
+
+@pytest.mark.asyncio
+async def test_async_search_with_sources():
+    results = await firecrawl.search(
+        query="firecrawl",
+        sources=["web", "news", "images"],
+        limit=3
+    )
+    assert isinstance(results, SearchData)
+    assert results.web is not None
+    assert len(results.web) <= 3
+    assert isinstance(results.web[0], SearchResultWeb)
+
+    if results.news is not None:
+        assert len(results.news) <= 3
+        assert isinstance(results.news[0], SearchResultNews)
+
+    if results.images is not None:
+        assert len(results.images) <= 3
+        assert isinstance(results.images[0], SearchResultImages)
+
+    web_titles = [result.title.lower() for result in results.web]
+    web_descriptions = [result.description.lower() for result in results.web]
+    all_web_text = ' '.join(web_titles + web_descriptions)
+    assert 'firecrawl' in all_web_text
+
+@pytest.mark.asyncio
+async def test_async_search_result_structure():
+    results = await firecrawl.search(
+        query="test query",
+        limit=1
+    )
+    if results.web and len(results.web) > 0:
+        result = results.web[0]
+        assert hasattr(result, 'url')
+        assert hasattr(result, 'title')
+        assert hasattr(result, 'description')
+        assert isinstance(result.url, str)
+        assert isinstance(result.title, str) or result.title is None
+        assert isinstance(result.description, str) or result.description is None
+        assert result.url.startswith('http')
+
+@pytest.mark.asyncio
+async def test_async_search_all_parameters():
+    from firecrawl.types import ScrapeOptions, Location, WaitAction
+    schema = {
+        "type": "object",
+        "properties": {
+            "title": {"type": "string"},
+            "description": {"type": "string"},
+            "url": {"type": "string"}
+        },
+        "required": ["title", "description"]
+    }
+    results = await firecrawl.search(
+        query="artificial intelligence",
+        sources=[
+            {"type": "web"},
+            {"type": "news"}
+        ],
+        limit=3,
+        tbs="qdr:m",
+        location="US",
+        ignore_invalid_urls=True,
+        timeout=60000,
+        scrape_options=ScrapeOptions(
+            formats=[
+                "markdown",
+                "html",
+                {
+                    "type": "json",
+                    "prompt": "Extract the title and description from the page",
+                    "schema": schema
+                },
+                {"type": "summary"}
+            ],
+            headers={"User-Agent": "Firecrawl-Test/1.0"},
+            include_tags=["h1", "h2", "p"],
+            exclude_tags=["nav", "footer"],
+            only_main_content=True,
+            wait_for=2000,
+            mobile=False,
+            skip_tls_verification=False,
+            remove_base64_images=True,
+            block_ads=True,
+            proxy="basic",
+            max_age=3600000,
+            store_in_cache=True,
+            location=Location(
+                country="US",
+                languages=["en"]
+            ),
+            actions=[
+                WaitAction(milliseconds=1000)
+            ]
+        )
+    )
+    assert isinstance(results, SearchData)
+    assert hasattr(results, 'web')
+    assert hasattr(results, 'news')
+    assert hasattr(results, 'images')
+    assert results.web is not None
+    assert len(results.web) <= 3
+
+    non_doc_entries = [r for r in (results.web or []) if not _is_document(r)]
+    if non_doc_entries:
+        all_web_text = ' '.join(_collect_texts(non_doc_entries))
+        ai_terms = ['artificial', 'intelligence', 'ai', 'machine', 'learning']
+        assert any(term in all_web_text for term in ai_terms)
+
+    for result in results.web:
+        assert isinstance(result, (SearchResultWeb, Document))
+        if isinstance(result, Document):
+            assert (result.markdown is not None) or (result.html is not None)
+        else:
+            assert hasattr(result, 'url')
+            assert isinstance(result.url, str) and result.url.startswith('http')
+
+    if results.news is not None:
+        assert len(results.news) <= 3
+        for result in results.news:
+            assert isinstance(result, (SearchResultNews, Document))
+            if isinstance(result, Document):
+                assert (result.markdown is not None) or (result.html is not None)
+            else:
+                assert hasattr(result, 'url')
+                assert isinstance(result.url, str) and result.url.startswith('http')
+
+    assert results.images is None
+
+@pytest.mark.asyncio
+async def test_async_search_formats_flexibility():
+    # Test with list format
+    results1 = await firecrawl.search(
+        query="python programming",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=["markdown"]
+        )
+    )
+    # Test with ScrapeFormats object
+    results2 = await firecrawl.search(
+        query="python programming",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=ScrapeFormats(markdown=True)
+        )
+    )
+    assert isinstance(results1, SearchData)
+    assert isinstance(results2, SearchData)
+    assert results1.web is not None
+    assert results2.web is not None
+
+@pytest.mark.asyncio
+async def test_async_search_with_json_format_object():
+    json_schema = {
+        "type": "object",
+        "properties": {
+            "title": {"type": "string"}
+        },
+        "required": ["title"],
+    }
+    results = await firecrawl.search(
+        query="site:docs.firecrawl.dev",
+        limit=1,
+        scrape_options=ScrapeOptions(
+            formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}]
+        ),
+    )
+    assert isinstance(results, SearchData)
+    assert results.web is not None and len(results.web) >= 0

{firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/e2e/v2/test_search.py

@@ -1,7 +1,7 @@
 from firecrawl import Firecrawl
 import os
 from dotenv import load_dotenv
-from firecrawl.types import SearchData, SearchResult, Document, ScrapeFormats, ScrapeOptions
+from firecrawl.types import SearchData, Document, ScrapeOptions, SearchResultWeb, SearchResultNews, SearchResultImages

 load_dotenv()

@@ -53,7 +53,7 @@ def test_search_minimal_request():
     assert results.images is None

     for result in results.web:
-        assert isinstance(result, SearchResult)
+        assert isinstance(result, SearchResultWeb)
         assert hasattr(result, 'url')
         assert hasattr(result, 'title')
         assert hasattr(result, 'description')
@@ -73,7 +73,7 @@ def test_search_with_sources():
     """Test search with specific sources."""
     results = firecrawl.search(
         query="firecrawl",
-        sources=["web", "news"],
+        sources=["web", "news", "images"],
         limit=3
     )

@@ -81,11 +81,15 @@ def test_search_with_sources():

     assert results.web is not None
     assert len(results.web) <= 3
+    assert isinstance(results.web[0], SearchResultWeb)

     if results.news is not None:
         assert len(results.news) <= 3
+        assert isinstance(results.news[0], SearchResultNews)

-    assert results.images is None
+    if results.images is not None:
+        assert len(results.images) <= 3
+        assert isinstance(results.images[0], SearchResultImages)

     web_titles = [result.title.lower() for result in results.web]
     web_descriptions = [result.description.lower() for result in results.web]
@@ -193,7 +197,7 @@ def test_search_all_parameters():

     # Test that each result has proper structure
     for result in results.web:
-        assert isinstance(result, (SearchResult, Document))
+        assert isinstance(result, (SearchResultWeb, Document))
         if isinstance(result, Document):
             # Document path: ensure content present
             assert (result.markdown is not None) or (result.html is not None)
@@ -206,7 +210,7 @@ def test_search_all_parameters():
     if results.news is not None:
         assert len(results.news) <= 3
         for result in results.news:
-            assert isinstance(result, (SearchResult, Document))
+            assert isinstance(result, (SearchResultNews, Document))
             if isinstance(result, Document):
                 assert (result.markdown is not None) or (result.html is not None)
             else:

{firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py

@@ -11,7 +11,7 @@ class TestSearchValidation:
         request = SearchRequest(query="")
         with pytest.raises(ValueError, match="Query cannot be empty"):
            _validate_search_request(request)
-
+
         request = SearchRequest(query=" ")
         with pytest.raises(ValueError, match="Query cannot be empty"):
             _validate_search_request(request)
@@ -22,12 +22,12 @@ class TestSearchValidation:
         request = SearchRequest(query="test", limit=0)
         with pytest.raises(ValueError, match="Limit must be positive"):
             _validate_search_request(request)
-
+
         # Negative limit
         request = SearchRequest(query="test", limit=-1)
         with pytest.raises(ValueError, match="Limit must be positive"):
             _validate_search_request(request)
-
+
         # Too high limit
         request = SearchRequest(query="test", limit=101)
         with pytest.raises(ValueError, match="Limit cannot exceed 100"):
@@ -39,12 +39,12 @@ class TestSearchValidation:
         request = SearchRequest(query="test", timeout=0)
         with pytest.raises(ValueError, match="Timeout must be positive"):
             _validate_search_request(request)
-
+
         # Negative timeout
         request = SearchRequest(query="test", timeout=-1000)
         with pytest.raises(ValueError, match="Timeout must be positive"):
             _validate_search_request(request)
-
+
         # Too high timeout
         request = SearchRequest(query="test", timeout=300001)
         with pytest.raises(ValueError, match="Timeout cannot exceed 300000ms"):
@@ -56,12 +56,12 @@ class TestSearchValidation:
         request = SearchRequest(query="test", sources=["invalid_source"])
         with pytest.raises(ValueError, match="Invalid source type"):
             _validate_search_request(request)
-
+
         # Invalid object source
         request = SearchRequest(query="test", sources=[Source(type="invalid_source")])
         with pytest.raises(ValueError, match="Invalid source type"):
             _validate_search_request(request)
-
+
         # Mixed valid/invalid sources
         request = SearchRequest(query="test", sources=["web", "invalid_source"])
         with pytest.raises(ValueError, match="Invalid source type"):
@@ -73,7 +73,7 @@ class TestSearchValidation:
         request = SearchRequest(query="test", location="")
         with pytest.raises(ValueError, match="Location must be a non-empty string"):
             _validate_search_request(request)
-
+
         # Whitespace location
         request = SearchRequest(query="test", location=" ")
         with pytest.raises(ValueError, match="Location must be a non-empty string"):
@@ -82,19 +82,49 @@ class TestSearchValidation:
     def test_validate_invalid_tbs(self):
         """Test validation of invalid tbs values."""
         invalid_tbs_values = ["invalid", "qdr:x", "yesterday", "last_week"]
-
+
         for invalid_tbs in invalid_tbs_values:
             request = SearchRequest(query="test", tbs=invalid_tbs)
             with pytest.raises(ValueError, match="Invalid tbs value"):
                 _validate_search_request(request)

+    def test_validate_custom_date_ranges(self):
+        """Test validation of custom date range formats."""
+        valid_custom_ranges = [
+            "cdr:1,cd_min:1/1/2024,cd_max:12/31/2024",
+            "cdr:1,cd_min:12/1/2024,cd_max:12/31/2024",
+            "cdr:1,cd_min:2/28/2023,cd_max:3/1/2023",
+            "cdr:1,cd_min:10/15/2023,cd_max:11/15/2023"
+        ]
+
+        for valid_range in valid_custom_ranges:
+            request = SearchRequest(query="test", tbs=valid_range)
+            validated = _validate_search_request(request)
+            assert validated == request
+
+    def test_validate_invalid_custom_date_ranges(self):
+        """Test validation of invalid custom date range formats."""
+        # Invalid custom date ranges
+        invalid_custom_ranges = [
+            "cdr:1,cd_min:2/28/2023",  # Missing cd_max
+            "cdr:1,cd_max:2/28/2023",  # Missing cd_min
+            "cdr:2,cd_min:1/1/2024,cd_max:12/31/2024",  # Wrong cdr value
+            "cdr:cd_min:1/1/2024,cd_max:12/31/2024",  # Missing :1
+            "custom:1,cd_min:1/1/2024,cd_max:12/31/2024"  # Wrong prefix
+        ]
+
+        for invalid_range in invalid_custom_ranges:
+            request = SearchRequest(query="test", tbs=invalid_range)
+            with pytest.raises(ValueError, match="Invalid"):
+                _validate_search_request(request)
+
     def test_validate_valid_requests(self):
         """Test that valid requests pass validation."""
         # Minimal valid request
         request = SearchRequest(query="test")
         validated = _validate_search_request(request)
         assert validated == request
-
+
         # Request with all optional parameters
         request = SearchRequest(
             query="test query",
@@ -107,7 +137,7 @@ class TestSearchValidation:
         )
         validated = _validate_search_request(request)
         assert validated == request
-
+
         # Request with object sources
         request = SearchRequest(
             query="test",
@@ -122,17 +152,17 @@ class TestSearchValidation:
         request = SearchRequest(query="test", limit=100)
         validated = _validate_search_request(request)
         assert validated == request
-
+
         # Maximum valid timeout
         request = SearchRequest(query="test", timeout=300000)
         validated = _validate_search_request(request)
         assert validated == request
-
+
         # Minimum valid limit
         request = SearchRequest(query="test", limit=1)
         validated = _validate_search_request(request)
         assert validated == request
-
+
         # Minimum valid timeout
         request = SearchRequest(query="test", timeout=1)
         validated = _validate_search_request(request)
@@ -191,16 +221,16 @@ class TestSearchRequestModel:
         data1 = request1.model_dump(by_alias=True)
         assert "ignore_invalid_urls" in data1  # No alias, uses snake_case
         assert data1["ignore_invalid_urls"] is None
-
+
         # Test with explicit False value
         request2 = SearchRequest(
             query="test",
             ignore_invalid_urls=False,
             scrape_options=ScrapeOptions(formats=["markdown"])
         )
-
+
         # Check that aliases are used in model_dump with by_alias=True
         data2 = request2.model_dump(by_alias=True)
         assert "ignore_invalid_urls" in data2  # No alias, uses snake_case
         assert "scrape_options" in data2  # No alias, uses snake_case
-        assert data2["ignore_invalid_urls"] is False
+        assert data2["ignore_invalid_urls"] is False
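
The new tests accept Google-style custom ranges ("cdr:1,cd_min:M/D/YYYY,cd_max:M/D/YYYY") alongside the existing qdr presets. The validator itself lives in firecrawl/v2/methods/search.py, whose body is not part of this excerpt; the sketch below is only an inferred check that is consistent with the valid and invalid examples above, not the SDK's actual implementation.

import re

# Inferred sketch (not the SDK's code): accept qdr presets plus a strict
# "cdr:1,cd_min:M/D/YYYY,cd_max:M/D/YYYY" shape, reject everything else.
_TBS_PRESETS = {"qdr:h", "qdr:d", "qdr:w", "qdr:m", "qdr:y"}
_CUSTOM_RANGE = re.compile(
    r"^cdr:1,cd_min:\d{1,2}/\d{1,2}/\d{4},cd_max:\d{1,2}/\d{1,2}/\d{4}$"
)

def _is_valid_tbs(tbs: str) -> bool:
    return tbs in _TBS_PRESETS or bool(_CUSTOM_RANGE.match(tbs))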

{firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/client.py

@@ -25,6 +25,7 @@ import logging
 from .v1 import V1FirecrawlApp, AsyncV1FirecrawlApp
 from .v2 import FirecrawlClient as V2FirecrawlClient
 from .v2.client_async import AsyncFirecrawlClient
+from .v2.types import Document

 logger = logging.getLogger("firecrawl")


{firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/types.py

@@ -48,7 +48,9 @@ from .v2.types import (
     JsonFormat,
     FormatOption,
     SearchRequest,
-    SearchResult,
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
     SearchData,
     SearchResponse,

@@ -124,7 +126,9 @@ __all__ = [
     'JsonFormat',
     'FormatOption',
     'SearchRequest',
-    'SearchResult',
+    'SearchResultWeb',
+    'SearchResultNews',
+    'SearchResultImages',
     'SearchData',
     'SearchResponse',
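
The single `SearchResult` export is replaced by source-specific result models. A minimal usage sketch of the new surface, mirroring the synchronous test code above (the API key and query here are placeholders, and which buckets are populated depends on the sources requested):

from firecrawl import Firecrawl
from firecrawl.types import SearchResultWeb, SearchResultNews, SearchResultImages

client = Firecrawl(api_key="fc-...")  # placeholder key
results = client.search(query="firecrawl", sources=["web", "news", "images"], limit=3)

# Each source now has its own result type and its own bucket on SearchData.
for web_hit in results.web or []:
    assert isinstance(web_hit, SearchResultWeb)
    print(web_hit.url, web_hit.title)

for news_hit in results.news or []:
    assert isinstance(news_hit, SearchResultNews)

for image_hit in results.images or []:
    assert isinstance(image_hit, SearchResultImages)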
 

{firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/aio/crawl.py

@@ -14,6 +14,7 @@ from ...types import (
 from ...utils.error_handler import handle_response_error
 from ...utils.validation import prepare_scrape_options
 from ...utils.http_client_async import AsyncHttpClient
+from ...utils.normalize import normalize_document_input


 def _prepare_crawl_request(request: CrawlRequest) -> dict:
@@ -76,11 +77,7 @@ async def get_crawl_status(client: AsyncHttpClient, job_id: str) -> CrawlJob:
     documents = []
     for doc_data in body.get("data", []):
         if isinstance(doc_data, dict):
-            normalized = dict(doc_data)
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc_data)
             documents.append(Document(**normalized))
     return CrawlJob(
         status=body.get("status"),

{firecrawl-3.0.3 → firecrawl-3.2.0}/firecrawl/v2/methods/aio/scrape.py

@@ -1,5 +1,6 @@
 from typing import Optional, Dict, Any
 from ...types import ScrapeOptions, Document
+from ...utils.normalize import normalize_document_input
 from ...utils.error_handler import handle_response_error
 from ...utils.validation import prepare_scrape_options, validate_scrape_options
 from ...utils.http_client_async import AsyncHttpClient
@@ -27,10 +28,6 @@ async def scrape(client: AsyncHttpClient, url: str, options: Optional[ScrapeOpti
     if not body.get("success"):
         raise Exception(body.get("error", "Unknown error occurred"))
     document_data = body.get("data", {})
-    normalized = dict(document_data)
-    if 'rawHtml' in normalized and 'raw_html' not in normalized:
-        normalized['raw_html'] = normalized.pop('rawHtml')
-    if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-        normalized['change_tracking'] = normalized.pop('changeTracking')
+    normalized = normalize_document_input(document_data)
     return Document(**normalized)

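
Both async methods now delegate camelCase-to-snake_case key normalization to the new firecrawl/v2/utils/normalize.py (107 added lines, not shown in this excerpt). Judging only from the inline logic removed above, the helper presumably covers at least the mapping sketched below; the real module likely handles more keys than this.

from typing import Any, Dict

# Sketch inferred from the inline code removed above, not the actual module.
_KEY_MAP = {
    "rawHtml": "raw_html",
    "changeTracking": "change_tracking",
}

def normalize_document_input(doc_data: Dict[str, Any]) -> Dict[str, Any]:
    normalized = dict(doc_data)
    for camel, snake in _KEY_MAP.items():
        # Only rename when the snake_case key is not already present.
        if camel in normalized and snake not in normalized:
            normalized[snake] = normalized.pop(camel)
    return normalized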