firecrawl 4.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility modules for v2 API client.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .http_client import HttpClient
|
|
6
|
+
from .error_handler import FirecrawlError, handle_response_error
|
|
7
|
+
from .validation import validate_scrape_options, prepare_scrape_options
|
|
8
|
+
|
|
9
|
+
__all__ = ['HttpClient', 'FirecrawlError', 'handle_response_error', 'validate_scrape_options', 'prepare_scrape_options']
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Error handling utilities for v2 API.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
from typing import Dict, Any, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class FirecrawlError(Exception):
    """Root of the Firecrawl API exception hierarchy.

    Besides the message, each instance records the HTTP status code and the
    response object it was built from; both are optional and default to None.
    """

    def __init__(
        self,
        message: str,
        status_code: Optional[int] = None,
        response: Optional[requests.Response] = None,
    ):
        # Record the HTTP context, then let Exception store the message.
        self.response = response
        self.status_code = status_code
        super().__init__(message)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class BadRequestError(FirecrawlError):
    """HTTP 400: the request was invalid."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class UnauthorizedError(FirecrawlError):
    """HTTP 401: the request was not authorized."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class PaymentRequiredError(FirecrawlError):
    """HTTP 402: payment is required before the request can be served."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class WebsiteNotSupportedError(FirecrawlError):
    """HTTP 403: the target website is not supported."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class RequestTimeoutError(FirecrawlError):
    """HTTP 408: the request timed out."""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class RateLimitError(FirecrawlError):
    """HTTP 429: the rate limit was exceeded."""
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class InternalServerError(FirecrawlError):
    """HTTP 500: the server reported an internal error."""
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def handle_response_error(response: requests.Response, action: str) -> None:
    """
    Translate an error response into the matching Firecrawl exception.

    The error text is taken from the ``error`` and ``details`` fields of the
    JSON body. If the body cannot be read that way, the first 500 characters
    of the raw text (or a note that the body was empty/unreadable) are used
    instead. This function always raises.

    Args:
        response: The HTTP response object
        action: Description of the action being performed

    Raises:
        BadRequestError: status 400
        UnauthorizedError: status 401
        PaymentRequiredError: status 402
        WebsiteNotSupportedError: status 403
        RequestTimeoutError: status 408
        RateLimitError: status 429
        InternalServerError: status 500
        FirecrawlError: any other status code
    """
    try:
        response_json = response.json()
        error_message = response_json.get('error', 'No error message provided.')
        error_details = response_json.get('details', 'No additional error details provided.')
    except Exception:
        # Only ordinary errors are handled here (invalid JSON, or a JSON body
        # that is not an object). KeyboardInterrupt/SystemExit must propagate,
        # which a bare ``except:`` would have prevented.
        try:
            response_text = response.text[:500]  # Limit to first 500 chars
            if response_text.strip():
                error_message = f"Server returned non-JSON response: {response_text}"
                error_details = f"Full response status: {response.status_code}"
            else:
                error_message = f"Server returned empty response with status {response.status_code}"
                error_details = "No additional details available"
        except Exception:
            error_message = f"Server returned unreadable response with status {response.status_code}"
            error_details = "No additional details available"

    status = response.status_code

    # Status code -> (exception class, message prefix).
    known_errors = {
        400: (BadRequestError, f"Bad Request: Failed to {action}."),
        401: (UnauthorizedError, f"Unauthorized: Failed to {action}."),
        402: (PaymentRequiredError, f"Payment Required: Failed to {action}."),
        403: (WebsiteNotSupportedError, f"Website Not Supported: Failed to {action}."),
        408: (RequestTimeoutError, f"Request Timeout: Failed to {action} as the request timed out."),
        429: (RateLimitError, f"Rate Limit Exceeded: Failed to {action}."),
        500: (InternalServerError, f"Internal Server Error: Failed to {action}."),
    }
    fallback = (FirecrawlError, f"Unexpected error during {action}: Status code {status}.")
    error_cls, prefix = known_errors.get(status, fallback)

    raise error_cls(f"{prefix} {error_message} - {error_details}", status, response)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
def get_version():
    """Return the SDK version string declared in the package ``__init__.py``.

    The file read is ``Path(__file__).parents[2] / "__init__.py"``; it is
    searched for a line of the form ``__version__ = "..."``.

    Returns:
        The declared version with surrounding whitespace stripped, or the
        placeholder ``"3.x.x"`` when no such line exists or when reading the
        file fails (the failure is printed, never raised).
    """
    try:
        package_path = Path(__file__).parents[2]
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        version_file = (package_path / "__init__.py").read_text(encoding="utf-8")
        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
        if version_match:
            return version_match.group(1).strip()
        return "3.x.x"
    except Exception as e:
        # Best effort: version discovery must never break importing the client.
        print(f"Failed to get version from __init__.py: {e}")
        return "3.x.x"
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HTTP client utilities for v2 API.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from typing import Dict, Any, Optional
|
|
7
|
+
from urllib.parse import urlparse, urlunparse, urljoin
|
|
8
|
+
import requests
|
|
9
|
+
from .get_version import get_version
|
|
10
|
+
|
|
11
|
+
version = get_version()
|
|
12
|
+
|
|
13
|
+
class HttpClient:
    """HTTP client with retry logic and error handling."""

    def __init__(self, api_key: Optional[str], api_url: str):
        self.api_key = api_key
        self.api_url = api_url

    def _build_url(self, endpoint: str) -> str:
        """Resolve *endpoint* into a full URL that always targets ``api_url``'s host.

        An absolute or protocol-relative endpoint keeps only its path and
        query; scheme and host are taken from ``api_url`` so the bearer token
        is never sent to another host. A relative endpoint is joined onto
        ``api_url``.
        """
        base = urlparse(self.api_url)
        ep = urlparse(endpoint)

        # Absolute or protocol-relative (has netloc): same or different host,
        # the result is identical - force the base scheme/host.
        if ep.netloc:
            return urlunparse((base.scheme or "https", base.netloc, ep.path or "/", "", ep.query, ""))

        # Guard protocol-relative forms that urlparse reported without a
        # netloc but that still start with "//".
        if endpoint.startswith("//"):
            ep2 = urlparse(f"https:{endpoint}")
            return urlunparse((base.scheme or "https", base.netloc, ep2.path or "/", "", ep2.query, ""))

        # Relative (with or without a leading slash).
        base_str = self.api_url if self.api_url.endswith("/") else f"{self.api_url}/"
        return urljoin(base_str, endpoint)

    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """Prepare headers for API requests.

        ``Authorization`` is added only when an API key is configured and
        ``x-idempotency-key`` only when a key is passed.
        """
        headers = {
            'Content-Type': 'application/json',
        }

        if self.api_key:
            headers['Authorization'] = f'Bearer {self.api_key}'

        if idempotency_key:
            headers['x-idempotency-key'] = idempotency_key

        return headers

    def _send_with_retries(
        self,
        send,
        method_name: str,
        url: str,
        retries: int,
        backoff_factor: float,
        **request_kwargs: Any
    ) -> "requests.Response":
        """Call ``send(url, **request_kwargs)`` up to *retries* times.

        A 502 response or a ``requests.RequestException`` triggers another
        attempt after sleeping ``backoff_factor * 2 ** attempt`` seconds. On
        the final attempt a 502 response is returned as-is and an exception
        is re-raised.
        """
        for attempt in range(retries):
            is_last_attempt = attempt == retries - 1
            try:
                response = send(url, **request_kwargs)
            except requests.RequestException:
                if is_last_attempt:
                    raise
                time.sleep(backoff_factor * (2 ** attempt))
                continue

            if response.status_code == 502 and not is_last_attempt:
                time.sleep(backoff_factor * (2 ** attempt))
                continue

            return response

        # Only reachable when retries <= 0, i.e. no attempt was made.
        raise Exception(f"Unexpected error in {method_name} request")

    def post(
        self,
        endpoint: str,
        data: Dict[str, Any],
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
        retries: int = 3,
        backoff_factor: float = 0.5
    ) -> "requests.Response":
        """Make a POST request with retry logic.

        The JSON body is a copy of *data* with an ``origin`` field identifying
        this SDK version added; the caller's dict is left unmodified.
        """
        if headers is None:
            headers = self._prepare_headers()

        # Copy before tagging so the caller's dict is not mutated.
        payload = dict(data)
        payload['origin'] = f'python-sdk@{version}'

        return self._send_with_retries(
            requests.post,
            "POST",
            self._build_url(endpoint),
            retries,
            backoff_factor,
            headers=headers,
            json=payload,
            timeout=timeout,
        )

    def get(
        self,
        endpoint: str,
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
        retries: int = 3,
        backoff_factor: float = 0.5
    ) -> "requests.Response":
        """Make a GET request with retry logic."""
        if headers is None:
            headers = self._prepare_headers()

        return self._send_with_retries(
            requests.get,
            "GET",
            self._build_url(endpoint),
            retries,
            backoff_factor,
            headers=headers,
            timeout=timeout,
        )

    def delete(
        self,
        endpoint: str,
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
        retries: int = 3,
        backoff_factor: float = 0.5
    ) -> "requests.Response":
        """Make a DELETE request with retry logic."""
        if headers is None:
            headers = self._prepare_headers()

        return self._send_with_retries(
            requests.delete,
            "DELETE",
            self._build_url(endpoint),
            retries,
            backoff_factor,
            headers=headers,
            timeout=timeout,
        )
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import httpx
|
|
2
|
+
from typing import Optional, Dict, Any
|
|
3
|
+
from .get_version import get_version
|
|
4
|
+
|
|
5
|
+
version = get_version()
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AsyncHttpClient:
    """Async HTTP client that wraps a single ``httpx.AsyncClient``.

    The wrapped client is bound to ``api_url`` and carries the JSON content
    type plus, when an API key is given, a bearer ``Authorization`` header.
    """

    def __init__(self, api_key: Optional[str], api_url: str):
        self.api_key = api_key
        self.api_url = api_url

        default_headers = {"Content-Type": "application/json"}
        if api_key:
            default_headers["Authorization"] = f"Bearer {api_key}"

        # max_keepalive_connections=0: no keep-alive connections are retained.
        connection_limits = httpx.Limits(max_keepalive_connections=0)
        self._client = httpx.AsyncClient(
            base_url=api_url,
            headers=default_headers,
            limits=connection_limits,
        )

    async def close(self) -> None:
        """Close the underlying ``httpx.AsyncClient``."""
        await self._client.aclose()

    def _headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """Return per-request headers: the idempotency key if one is given, else none."""
        if not idempotency_key:
            return {}
        return {"x-idempotency-key": idempotency_key}

    async def post(
        self,
        endpoint: str,
        data: Dict[str, Any],
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
    ) -> httpx.Response:
        """POST a copy of *data*, tagged with the SDK ``origin``, as JSON."""
        body = {**data, "origin": f"python-sdk@{version}"}
        request_headers = {**self._headers(), **(headers or {})}
        return await self._client.post(
            endpoint,
            json=body,
            headers=request_headers,
            timeout=timeout,
        )

    async def get(
        self,
        endpoint: str,
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
    ) -> httpx.Response:
        """Send a GET request to *endpoint*."""
        request_headers = {**self._headers(), **(headers or {})}
        return await self._client.get(endpoint, headers=request_headers, timeout=timeout)

    async def delete(
        self,
        endpoint: str,
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
    ) -> httpx.Response:
        """Send a DELETE request to *endpoint*."""
        request_headers = {**self._headers(), **(headers or {})}
        return await self._client.delete(endpoint, headers=request_headers, timeout=timeout)
|
|
69
|
+
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalization helpers for v2 API payloads to avoid relying on Pydantic aliases.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, List
|
|
6
|
+
from ..types import DocumentMetadata
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _map_metadata_keys(md: Dict[str, Any]) -> Dict[str, Any]:
|
|
10
|
+
"""
|
|
11
|
+
Convert API v2 camelCase metadata keys to snake_case expected by DocumentMetadata.
|
|
12
|
+
Leaves unknown keys as-is.
|
|
13
|
+
"""
|
|
14
|
+
mapping = {
|
|
15
|
+
# OpenGraph
|
|
16
|
+
"ogTitle": "og_title",
|
|
17
|
+
"ogDescription": "og_description",
|
|
18
|
+
"ogUrl": "og_url",
|
|
19
|
+
"ogImage": "og_image",
|
|
20
|
+
"ogAudio": "og_audio",
|
|
21
|
+
"ogDeterminer": "og_determiner",
|
|
22
|
+
"ogLocale": "og_locale",
|
|
23
|
+
"ogLocaleAlternate": "og_locale_alternate",
|
|
24
|
+
"ogSiteName": "og_site_name",
|
|
25
|
+
"ogVideo": "og_video",
|
|
26
|
+
# Dublin Core and misc
|
|
27
|
+
"dcTermsCreated": "dc_terms_created",
|
|
28
|
+
"dcDateCreated": "dc_date_created",
|
|
29
|
+
"dcDate": "dc_date",
|
|
30
|
+
"dcTermsType": "dc_terms_type",
|
|
31
|
+
"dcType": "dc_type",
|
|
32
|
+
"dcTermsAudience": "dc_terms_audience",
|
|
33
|
+
"dcTermsSubject": "dc_terms_subject",
|
|
34
|
+
"dcSubject": "dc_subject",
|
|
35
|
+
"dcDescription": "dc_description",
|
|
36
|
+
"dcTermsKeywords": "dc_terms_keywords",
|
|
37
|
+
"modifiedTime": "modified_time",
|
|
38
|
+
"publishedTime": "published_time",
|
|
39
|
+
"articleTag": "article_tag",
|
|
40
|
+
"articleSection": "article_section",
|
|
41
|
+
# Response-level
|
|
42
|
+
"sourceURL": "source_url",
|
|
43
|
+
"statusCode": "status_code",
|
|
44
|
+
"scrapeId": "scrape_id",
|
|
45
|
+
"numPages": "num_pages",
|
|
46
|
+
"contentType": "content_type",
|
|
47
|
+
"proxyUsed": "proxy_used",
|
|
48
|
+
"cacheState": "cache_state",
|
|
49
|
+
"cachedAt": "cached_at",
|
|
50
|
+
"creditsUsed": "credits_used",
|
|
51
|
+
"concurrencyLimited": "concurrency_limited",
|
|
52
|
+
"concurrencyQueueDurationMs": "concurrency_queue_duration_ms",
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
out: Dict[str, Any] = {}
|
|
56
|
+
for k, v in md.items():
|
|
57
|
+
snake = mapping.get(k, k)
|
|
58
|
+
out[snake] = v
|
|
59
|
+
|
|
60
|
+
# Light coercions where server may send strings/lists
|
|
61
|
+
if isinstance(out.get("status_code"), str):
|
|
62
|
+
try:
|
|
63
|
+
out["status_code"] = int(out["status_code"]) # type: ignore
|
|
64
|
+
except ValueError:
|
|
65
|
+
pass
|
|
66
|
+
|
|
67
|
+
# Preserve list values for unknown keys; only lightweight coercions above
|
|
68
|
+
return out
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def normalize_document_input(doc: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalize a raw Document dict from the API into the Python SDK's expected shape:
    - Convert top-level keys rawHtml->raw_html, changeTracking->change_tracking
    - Convert metadata keys from camelCase to snake_case
    - Convert branding.colorScheme to branding.color_scheme

    A camelCase key is only renamed when its snake_case counterpart is not
    already present. Neither ``doc`` nor its nested ``branding`` dict is
    modified; a new dict is returned.
    """
    normalized = dict(doc)

    if "rawHtml" in normalized and "raw_html" not in normalized:
        normalized["raw_html"] = normalized.pop("rawHtml")

    if "changeTracking" in normalized and "change_tracking" not in normalized:
        normalized["change_tracking"] = normalized.pop("changeTracking")

    md = normalized.get("metadata")
    if isinstance(md, dict):
        mapped = _map_metadata_keys(md)
        # Prefer a typed DocumentMetadata; if validation raises, keep the
        # plain mapped dict rather than failing the whole document.
        try:
            normalized["metadata"] = DocumentMetadata.model_validate(mapped)
        except Exception:
            normalized["metadata"] = mapped

    # Normalize branding top-level camelCase keys
    branding = normalized.get("branding")
    if isinstance(branding, dict):
        if "colorScheme" in branding and "color_scheme" not in branding:
            # dict(doc) above is a shallow copy, so rename on a copy of the
            # nested dict to avoid mutating the caller's data.
            branding = dict(branding)
            branding["color_scheme"] = branding.pop("colorScheme")
            normalized["branding"] = branding

    return normalized
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _map_search_result_keys(result: Dict[str, Any], result_type: str) -> Dict[str, Any]:
|
|
105
|
+
if result_type == "images":
|
|
106
|
+
mapping = {
|
|
107
|
+
"imageUrl": "image_url",
|
|
108
|
+
"imageWidth": "image_width",
|
|
109
|
+
"imageHeight": "image_height",
|
|
110
|
+
}
|
|
111
|
+
elif result_type == "news":
|
|
112
|
+
mapping = {
|
|
113
|
+
"imageUrl": "image_url",
|
|
114
|
+
}
|
|
115
|
+
elif result_type == "web":
|
|
116
|
+
mapping = {}
|
|
117
|
+
else:
|
|
118
|
+
mapping = {}
|
|
119
|
+
|
|
120
|
+
out: Dict[str, Any] = {}
|
|
121
|
+
for k, v in result.items():
|
|
122
|
+
snake = mapping.get(k, k)
|
|
123
|
+
out[snake] = v
|
|
124
|
+
|
|
125
|
+
return out
|