firecrawl 3.0.3__tar.gz → 3.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl has been flagged as possibly problematic.
- {firecrawl-3.0.3 → firecrawl-3.1.0}/PKG-INFO +1 -1
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__init__.py +1 -1
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -1
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/client.py +1 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/crawl.py +2 -5
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/scrape.py +2 -5
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/search.py +2 -5
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/batch.py +2 -5
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/crawl.py +2 -1
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/scrape.py +2 -6
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/search.py +2 -6
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/types.py +68 -2
- firecrawl-3.1.0/firecrawl/v2/utils/normalize.py +107 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/watcher.py +4 -15
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/watcher_async.py +2 -5
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl.egg-info/PKG-INFO +1 -1
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl.egg-info/SOURCES.txt +1 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/LICENSE +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/README.md +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_extract.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_map.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_search.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/firecrawl.backup.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/types.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v1/__init__.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v1/client.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/__init__.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/client.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/client_async.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/__init__.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/batch.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/extract.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/map.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/aio/usage.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/extract.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/map.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/methods/usage.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/__init__.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/error_handler.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/get_version.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/http_client.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/http_client_async.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl/v2/utils/validation.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl.egg-info/dependency_links.txt +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl.egg-info/requires.txt +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/firecrawl.egg-info/top_level.txt +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/pyproject.toml +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/setup.cfg +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/setup.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/tests/test_change_tracking.py +0 -0
- {firecrawl-3.0.3 → firecrawl-3.1.0}/tests/test_timeout_conversion.py +0 -0
firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py

```diff
@@ -96,7 +96,6 @@ async def test_async_get_crawl_status_shape():
     assert status.status in ("scraping", "completed", "failed")
     assert status.completed >= 0
     assert status.expires_at is not None
-    assert status.next is not None
     assert isinstance(status.data, list)
 
 
```
firecrawl/v2/methods/aio/crawl.py

```diff
@@ -14,6 +14,7 @@ from ...types (
 from ...utils.error_handler import handle_response_error
 from ...utils.validation import prepare_scrape_options
 from ...utils.http_client_async import AsyncHttpClient
+from ...utils.normalize import normalize_document_input
 
 
 def _prepare_crawl_request(request: CrawlRequest) -> dict:
@@ -76,11 +77,7 @@ async def get_crawl_status(client: AsyncHttpClient, job_id: str) -> CrawlJob:
     documents = []
     for doc_data in body.get("data", []):
         if isinstance(doc_data, dict):
-            normalized = dict(doc_data)
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc_data)
             documents.append(Document(**normalized))
     return CrawlJob(
         status=body.get("status"),
```
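The same substitution repeats in the scrape, search, batch, crawl and watcher hunks that follow: the hand-rolled rawHtml/changeTracking renaming is dropped in favor of the shared normalize_document_input helper. A minimal sketch of the top-level renaming both the old and new code agree on, assuming firecrawl 3.1.0 is installed (the payload values are made up for illustration):

```python
from firecrawl.v2.utils.normalize import normalize_document_input

# Hypothetical API-shaped document; only the key names matter here.
doc_data = {
    "markdown": "# Title",
    "rawHtml": "<html>…</html>",
    "changeTracking": {"changeStatus": "new"},
}

normalized = normalize_document_input(doc_data)
assert "raw_html" in normalized and "rawHtml" not in normalized
assert "change_tracking" in normalized and "changeTracking" not in normalized
```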
firecrawl/v2/methods/aio/scrape.py

```diff
@@ -1,5 +1,6 @@
 from typing import Optional, Dict, Any
 from ...types import ScrapeOptions, Document
+from ...utils.normalize import normalize_document_input
 from ...utils.error_handler import handle_response_error
 from ...utils.validation import prepare_scrape_options, validate_scrape_options
 from ...utils.http_client_async import AsyncHttpClient
@@ -27,10 +28,6 @@ async def scrape(client: AsyncHttpClient, url: str, options: Optional[ScrapeOpti
     if not body.get("success"):
         raise Exception(body.get("error", "Unknown error occurred"))
     document_data = body.get("data", {})
-    normalized = dict(document_data)
-    if 'rawHtml' in normalized and 'raw_html' not in normalized:
-        normalized['raw_html'] = normalized.pop('rawHtml')
-    if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-        normalized['change_tracking'] = normalized.pop('changeTracking')
+    normalized = normalize_document_input(document_data)
     return Document(**normalized)
 
```
firecrawl/v2/methods/aio/search.py

```diff
@@ -1,5 +1,6 @@
 from typing import Dict, Any
 from ...types import SearchRequest, SearchData, SearchResult, Document
+from ...utils.normalize import normalize_document_input
 from ...utils.http_client_async import AsyncHttpClient
 from ...utils.error_handler import handle_response_error
 from ...utils.validation import prepare_scrape_options, validate_scrape_options
@@ -38,11 +39,7 @@ async def search(client: AsyncHttpClient, request: SearchRequest) -> SearchData:
         if request.scrape_options is not None and any(
             key in doc_data for key in ['markdown', 'html', 'rawHtml', 'links', 'summary', 'screenshot', 'changeTracking']
         ):
-            normalized = dict(doc_data)
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc_data)
             results.append(Document(**normalized))
         else:
             results.append(SearchResult(
```
firecrawl/v2/methods/batch.py

```diff
@@ -13,6 +13,7 @@ from ..types (
     WebhookConfig,
 )
 from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+from ..utils.normalize import normalize_document_input
 from ..types import CrawlErrorsResponse
 
 
@@ -107,11 +108,7 @@ def get_batch_scrape_status(
     documents: List[Document] = []
     for doc in body.get("data", []) or []:
         if isinstance(doc, dict):
-            normalized = dict(doc)
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc)
             documents.append(Document(**normalized))
 
     return BatchScrapeJob(
```
firecrawl/v2/methods/crawl.py

```diff
@@ -11,6 +11,7 @@ from ..types (
     WebhookConfig, CrawlErrorsResponse, ActiveCrawlsResponse, ActiveCrawl
 )
 from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+from ..utils.normalize import normalize_document_input
 
 
 def _validate_crawl_request(request: CrawlRequest) -> None:
@@ -173,7 +174,7 @@ def get_crawl_status(client: HttpClient, job_id: str) -> CrawlJob:
                 # but we'll handle it gracefully
                 continue
             else:
-                documents.append(Document(**doc_data))
+                documents.append(Document(**normalize_document_input(doc_data)))
 
     # Create CrawlJob with current status and data
     return CrawlJob(
```
firecrawl/v2/methods/scrape.py

```diff
@@ -4,6 +4,7 @@ Scraping functionality for Firecrawl v2 API.
 
 from typing import Optional, Dict, Any
 from ..types import ScrapeOptions, Document
+from ..utils.normalize import normalize_document_input
 from ..utils import HttpClient, handle_response_error, prepare_scrape_options, validate_scrape_options
 
 
@@ -59,10 +60,5 @@ def scrape(client: HttpClient, url: str, options: Optional[ScrapeOptions] = None
         raise Exception(body.get("error", "Unknown error occurred"))
 
     document_data = body.get("data", {})
-
-    normalized = dict(document_data)
-    if 'rawHtml' in normalized and 'raw_html' not in normalized:
-        normalized['raw_html'] = normalized.pop('rawHtml')
-    if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-        normalized['change_tracking'] = normalized.pop('changeTracking')
+    normalized = normalize_document_input(document_data)
     return Document(**normalized)
```
firecrawl/v2/methods/search.py

```diff
@@ -4,6 +4,7 @@ Search functionality for Firecrawl v2 API.
 
 from typing import Optional, Dict, Any, Union
 from ..types import SearchRequest, SearchData, SearchResult, Document
+from ..utils.normalize import normalize_document_input
 from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
 
 
@@ -50,12 +51,7 @@ def search(
         if request.scrape_options is not None and any(
             key in doc_data for key in ['markdown', 'html', 'rawHtml', 'links', 'summary', 'screenshot', 'changeTracking']
         ):
-
-            normalized = dict(doc_data)
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc_data)
             results.append(Document(**normalized))
         else:
             # Minimal search result shape
```
firecrawl/v2/types.py

```diff
@@ -7,7 +7,8 @@ This module contains clean, modern type definitions for the v2 API.
 import warnings
 from datetime import datetime
 from typing import Any, Dict, Generic, List, Literal, Optional, TypeVar, Union
-
+import logging
+from pydantic import BaseModel, Field, field_validator, ValidationError
 
 # Suppress pydantic warnings about schema field shadowing
 # Tested using schema_field alias="schema" but it doesn't work.
@@ -19,6 +20,9 @@ warnings.filterwarnings("ignore", message="Field name \"json\" in \"Document\" s
 
 T = TypeVar('T')
 
+# Module logger
+logger = logging.getLogger("firecrawl")
+
 # Base response types
 class BaseResponse(BaseModel, Generic[T]):
     """Base response structure for all API responses."""
@@ -29,18 +33,57 @@ class BaseResponse(BaseModel, Generic[T]):
 
 # Document and content types
 class DocumentMetadata(BaseModel):
-    """Metadata for scraped documents."""
+    """Metadata for scraped documents (snake_case only; API camelCase normalized in code)."""
+    # Common metadata fields
     title: Optional[str] = None
     description: Optional[str] = None
+    url: Optional[str] = None
     language: Optional[str] = None
     keywords: Optional[Union[str, List[str]]] = None
     robots: Optional[str] = None
+
+    # OpenGraph and social metadata
     og_title: Optional[str] = None
     og_description: Optional[str] = None
     og_url: Optional[str] = None
     og_image: Optional[str] = None
+    og_audio: Optional[str] = None
+    og_determiner: Optional[str] = None
+    og_locale: Optional[str] = None
+    og_locale_alternate: Optional[List[str]] = None
+    og_site_name: Optional[str] = None
+    og_video: Optional[str] = None
+
+    # Dublin Core and other site metadata
+    favicon: Optional[str] = None
+    dc_terms_created: Optional[str] = None
+    dc_date_created: Optional[str] = None
+    dc_date: Optional[str] = None
+    dc_terms_type: Optional[str] = None
+    dc_type: Optional[str] = None
+    dc_terms_audience: Optional[str] = None
+    dc_terms_subject: Optional[str] = None
+    dc_subject: Optional[str] = None
+    dc_description: Optional[str] = None
+    dc_terms_keywords: Optional[str] = None
+
+    modified_time: Optional[str] = None
+    published_time: Optional[str] = None
+    article_tag: Optional[str] = None
+    article_section: Optional[str] = None
+
+    # Response-level metadata
     source_url: Optional[str] = None
     status_code: Optional[int] = None
+    scrape_id: Optional[str] = None
+    num_pages: Optional[int] = None
+    content_type: Optional[str] = None
+    proxy_used: Optional[Literal["basic", "stealth"]] = None
+    cache_state: Optional[Literal["hit", "miss"]] = None
+    cached_at: Optional[str] = None
+    credits_used: Optional[int] = None
+
+    # Error information
     error: Optional[str] = None
 
     @staticmethod
@@ -85,6 +128,29 @@ class Document(BaseModel):
     warning: Optional[str] = None
     change_tracking: Optional[Dict[str, Any]] = None
 
+    @property
+    def metadata_typed(self) -> DocumentMetadata:
+        """Always returns a DocumentMetadata instance for LSP-friendly access."""
+        md = self.metadata
+        if isinstance(md, DocumentMetadata):
+            return md
+        if isinstance(md, dict):
+            try:
+                return DocumentMetadata(**md)
+            except (ValidationError, TypeError) as exc:
+                logger.debug("Failed to construct DocumentMetadata from dict: %s", exc)
+        return DocumentMetadata()
+
+    @property
+    def metadata_dict(self) -> Dict[str, Any]:
+        """Returns metadata as a plain dict (exclude None)."""
+        md = self.metadata
+        if isinstance(md, DocumentMetadata):
+            return md.model_dump(exclude_none=True)
+        if isinstance(md, dict):
+            return {k: v for k, v in md.items() if v is not None}
+        return {}
+
 # Webhook types
 class WebhookConfig(BaseModel):
     """Configuration for webhooks."""
```
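The new metadata_typed and metadata_dict properties give typed access regardless of whether metadata arrives as a DocumentMetadata or a plain dict. A minimal sketch, assuming firecrawl 3.1.0 is installed and that Document's content fields remain optional as in 3.0.x (the values below are illustrative):

```python
from firecrawl.v2.types import Document

# Illustrative document; metadata supplied as a snake_case dict.
doc = Document(markdown="# Hello", metadata={"title": "Hello", "status_code": 200})

md = doc.metadata_typed          # always a DocumentMetadata instance
print(md.title, md.status_code)  # -> Hello 200

print(doc.metadata_dict)         # plain dict with None values dropped
# -> {'title': 'Hello', 'status_code': 200}
```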
firecrawl/v2/utils/normalize.py (new file)

```diff
@@ -0,0 +1,107 @@
+"""
+Normalization helpers for v2 API payloads to avoid relying on Pydantic aliases.
+"""
+
+from typing import Any, Dict, List
+from ..types import DocumentMetadata
+
+
+def _map_metadata_keys(md: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Convert API v2 camelCase metadata keys to snake_case expected by DocumentMetadata.
+    Leaves unknown keys as-is.
+    """
+    mapping = {
+        # OpenGraph
+        "ogTitle": "og_title",
+        "ogDescription": "og_description",
+        "ogUrl": "og_url",
+        "ogImage": "og_image",
+        "ogAudio": "og_audio",
+        "ogDeterminer": "og_determiner",
+        "ogLocale": "og_locale",
+        "ogLocaleAlternate": "og_locale_alternate",
+        "ogSiteName": "og_site_name",
+        "ogVideo": "og_video",
+        # Dublin Core and misc
+        "dcTermsCreated": "dc_terms_created",
+        "dcDateCreated": "dc_date_created",
+        "dcDate": "dc_date",
+        "dcTermsType": "dc_terms_type",
+        "dcType": "dc_type",
+        "dcTermsAudience": "dc_terms_audience",
+        "dcTermsSubject": "dc_terms_subject",
+        "dcSubject": "dc_subject",
+        "dcDescription": "dc_description",
+        "dcTermsKeywords": "dc_terms_keywords",
+        "modifiedTime": "modified_time",
+        "publishedTime": "published_time",
+        "articleTag": "article_tag",
+        "articleSection": "article_section",
+        # Response-level
+        "sourceURL": "source_url",
+        "statusCode": "status_code",
+        "scrapeId": "scrape_id",
+        "numPages": "num_pages",
+        "contentType": "content_type",
+        "proxyUsed": "proxy_used",
+        "cacheState": "cache_state",
+        "cachedAt": "cached_at",
+        "creditsUsed": "credits_used",
+    }
+
+    out: Dict[str, Any] = {}
+    for k, v in md.items():
+        snake = mapping.get(k, k)
+        out[snake] = v
+
+    # Light coercions where server may send strings/lists
+    if isinstance(out.get("status_code"), str):
+        try:
+            out["status_code"] = int(out["status_code"])  # type: ignore
+        except ValueError:
+            pass
+
+    # Generic rule: if a value is a list, join with ", " for string-like fields,
+    # except for explicit fields we preserve as lists.
+    preserve_list_fields: List[str] = [
+        "og_locale_alternate",
+    ]
+    for f, val in list(out.items()):
+        if isinstance(val, list) and f not in preserve_list_fields:
+            try:
+                out[f] = ", ".join(str(x) for x in val)
+            except Exception:
+                # Fallback: keep original list if join fails
+                pass
+
+    return out
+
+
+def normalize_document_input(doc: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Normalize a raw Document dict from the API into the Python SDK's expected shape:
+    - Convert top-level keys rawHtml->raw_html, changeTracking->change_tracking
+    - Convert metadata keys from camelCase to snake_case
+    """
+    normalized = dict(doc)
+
+    if "rawHtml" in normalized and "raw_html" not in normalized:
+        normalized["raw_html"] = normalized.pop("rawHtml")
+
+    if "changeTracking" in normalized and "change_tracking" not in normalized:
+        normalized["change_tracking"] = normalized.pop("changeTracking")
+
+    md = normalized.get("metadata")
+    if isinstance(md, dict):
+        mapped = _map_metadata_keys(md)
+        # Construct a concrete DocumentMetadata so downstream has a typed object
+        try:
+            normalized["metadata"] = DocumentMetadata(**mapped)
+        except Exception:
+            # Fallback to mapped dict if model construction fails for any reason
+            normalized["metadata"] = mapped
+
+    return normalized
+
+
```
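A brief usage sketch of the helper above, assuming firecrawl 3.1.0 is installed; the payload below is a made-up example of the camelCase shape the v2 API returns:

```python
from firecrawl.v2.types import DocumentMetadata
from firecrawl.v2.utils.normalize import normalize_document_input

# Illustrative camelCase payload; field values are invented for the example.
raw = {
    "markdown": "# Firecrawl",
    "rawHtml": "<html>…</html>",
    "metadata": {
        "title": "Firecrawl",
        "sourceURL": "https://example.com",
        "statusCode": "200",             # string coerced to int
        "articleTag": ["news", "tech"],  # non-preserved lists are joined
        "ogLocaleAlternate": ["en_GB"],  # explicitly preserved as a list
    },
}

doc = normalize_document_input(raw)
assert "raw_html" in doc and "rawHtml" not in doc
md = doc["metadata"]
assert isinstance(md, DocumentMetadata)
assert md.source_url == "https://example.com"
assert md.status_code == 200
assert md.article_tag == "news, tech"
assert md.og_locale_alternate == ["en_GB"]
```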
firecrawl/v2/watcher.py

```diff
@@ -15,6 +15,7 @@ from typing import Callable, List, Optional, Literal, Union, Dict, Any
 import websockets
 
 from .types import CrawlJob, BatchScrapeJob, Document
+from .utils.normalize import normalize_document_input
 
 
 JobKind = Literal["crawl", "batch"]
@@ -172,11 +173,7 @@ class Watcher:
         docs: List[Document] = []
         for doc in self.data:
             if isinstance(doc, dict):
-                d = dict(doc)
-                if "rawHtml" in d and "raw_html" not in d:
-                    d["raw_html"] = d.pop("rawHtml")
-                if "changeTracking" in d and "change_tracking" not in d:
-                    d["change_tracking"] = d.pop("changeTracking")
+                d = normalize_document_input(doc)
                 docs.append(Document(**d))
         if self._kind == "crawl":
             job = CrawlJob(
@@ -212,11 +209,7 @@ class Watcher:
             docs = []
             for doc in payload.get("data", []):
                 if isinstance(doc, dict):
-                    d = dict(doc)
-                    if "rawHtml" in d and "raw_html" not in d:
-                        d["raw_html"] = d.pop("rawHtml")
-                    if "changeTracking" in d and "change_tracking" not in d:
-                        d["change_tracking"] = d.pop("changeTracking")
+                    d = normalize_document_input(doc)
                     docs.append(Document(**d))
             job = CrawlJob(
                 status=status_str,
@@ -241,11 +234,7 @@ class Watcher:
            docs = []
             for doc in payload.get("data", []):
                 if isinstance(doc, dict):
-                    d = dict(doc)
-                    if "rawHtml" in d and "raw_html" not in d:
-                        d["raw_html"] = d.pop("rawHtml")
-                    if "changeTracking" in d and "change_tracking" not in d:
-                        d["change_tracking"] = d.pop("changeTracking")
+                    d = normalize_document_input(doc)
                     docs.append(Document(**d))
             job = BatchScrapeJob(
                 status=status_str,
```
firecrawl/v2/watcher_async.py

```diff
@@ -16,6 +16,7 @@ import websockets
 from websockets.exceptions import ConnectionClosed, ConnectionClosedOK, ConnectionClosedError
 
 from .types import BatchScrapeJob, CrawlJob, Document
+from .utils.normalize import normalize_document_input
 
 JobKind = Literal["crawl", "batch"]
 
@@ -216,11 +217,7 @@ class AsyncWatcher:
         source_docs = docs_override if docs_override is not None else payload.get("data", []) or []
         for doc in source_docs:
             if isinstance(doc, dict):
-                d = dict(doc)
-                if "rawHtml" in d and "raw_html" not in d:
-                    d["raw_html"] = d.pop("rawHtml")
-                if "changeTracking" in d and "change_tracking" not in d:
-                    d["change_tracking"] = d.pop("changeTracking")
+                d = normalize_document_input(doc)
                 docs.append(Document(**d))
 
         if self._kind == "crawl":
```
firecrawl.egg-info/SOURCES.txt

```diff
@@ -77,6 +77,7 @@ firecrawl/v2/utils/error_handler.py
 firecrawl/v2/utils/get_version.py
 firecrawl/v2/utils/http_client.py
 firecrawl/v2/utils/http_client_async.py
+firecrawl/v2/utils/normalize.py
 firecrawl/v2/utils/validation.py
 tests/test_change_tracking.py
 tests/test_timeout_conversion.py
```
All remaining files are renamed only ({firecrawl-3.0.3 → firecrawl-3.1.0}) with no content changes.