firecrawl 4.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
firecrawl/v2/methods/extract.py
@@ -0,0 +1,161 @@
from typing import Any, Dict, List, Optional
import time

from ..types import ExtractResponse, ScrapeOptions
from ..types import AgentOptions
from ..utils.http_client import HttpClient
from ..utils.validation import prepare_scrape_options
from ..utils.error_handler import handle_response_error


def _prepare_extract_request(
    urls: Optional[List[str]],
    *,
    prompt: Optional[str] = None,
    schema: Optional[Dict[str, Any]] = None,
    system_prompt: Optional[str] = None,
    allow_external_links: Optional[bool] = None,
    enable_web_search: Optional[bool] = None,
    show_sources: Optional[bool] = None,
    scrape_options: Optional[ScrapeOptions] = None,
    ignore_invalid_urls: Optional[bool] = None,
    integration: Optional[str] = None,
    agent: Optional[AgentOptions] = None,
) -> Dict[str, Any]:
    body: Dict[str, Any] = {}
    if urls is not None:
        body["urls"] = urls
    if prompt is not None:
        body["prompt"] = prompt
    if schema is not None:
        body["schema"] = schema
    if system_prompt is not None:
        body["systemPrompt"] = system_prompt
    if allow_external_links is not None:
        body["allowExternalLinks"] = allow_external_links
    if enable_web_search is not None:
        body["enableWebSearch"] = enable_web_search
    if show_sources is not None:
        body["showSources"] = show_sources
    if ignore_invalid_urls is not None:
        body["ignoreInvalidURLs"] = ignore_invalid_urls
    if scrape_options is not None:
        prepared = prepare_scrape_options(scrape_options)
        if prepared:
            body["scrapeOptions"] = prepared
    if integration is not None and str(integration).strip():
        body["integration"] = str(integration).strip()
    if agent is not None:
        try:
            body["agent"] = agent.model_dump(exclude_none=True)  # type: ignore[attr-defined]
        except AttributeError:
            body["agent"] = agent  # fallback
    return body


def _normalize_extract_response_payload(payload: Dict[str, Any]) -> Dict[str, Any]:
    out = dict(payload)
    if "expiresAt" in out and "expires_at" not in out:
        out["expires_at"] = out["expiresAt"]
    if "creditsUsed" in out and "credits_used" not in out:
        out["credits_used"] = out["creditsUsed"]
    if "tokensUsed" in out and "tokens_used" not in out:
        out["tokens_used"] = out["tokensUsed"]
    return out


def start_extract(
    client: HttpClient,
    urls: Optional[List[str]],
    *,
    prompt: Optional[str] = None,
    schema: Optional[Dict[str, Any]] = None,
    system_prompt: Optional[str] = None,
    allow_external_links: Optional[bool] = None,
    enable_web_search: Optional[bool] = None,
    show_sources: Optional[bool] = None,
    scrape_options: Optional[ScrapeOptions] = None,
    ignore_invalid_urls: Optional[bool] = None,
    integration: Optional[str] = None,
    agent: Optional[AgentOptions] = None,
) -> ExtractResponse:
    body = _prepare_extract_request(
        urls,
        prompt=prompt,
        schema=schema,
        system_prompt=system_prompt,
        allow_external_links=allow_external_links,
        enable_web_search=enable_web_search,
        show_sources=show_sources,
        scrape_options=scrape_options,
        ignore_invalid_urls=ignore_invalid_urls,
        integration=integration,
        agent=agent,
    )
    resp = client.post("/v2/extract", body)
    if not resp.ok:
        handle_response_error(resp, "extract")
    payload = _normalize_extract_response_payload(resp.json())
    return ExtractResponse(**payload)


def get_extract_status(client: HttpClient, job_id: str) -> ExtractResponse:
    resp = client.get(f"/v2/extract/{job_id}")
    if not resp.ok:
        handle_response_error(resp, "extract-status")
    payload = _normalize_extract_response_payload(resp.json())
    return ExtractResponse(**payload)


def wait_extract(
    client: HttpClient,
    job_id: str,
    *,
    poll_interval: int = 2,
    timeout: Optional[int] = None,
) -> ExtractResponse:
    start_ts = time.time()
    while True:
        status = get_extract_status(client, job_id)
        if status.status in ("completed", "failed", "cancelled"):
            return status
        if timeout is not None and (time.time() - start_ts) > timeout:
            return status
        time.sleep(max(1, poll_interval))


def extract(
    client: HttpClient,
    urls: Optional[List[str]],
    *,
    prompt: Optional[str] = None,
    schema: Optional[Dict[str, Any]] = None,
    system_prompt: Optional[str] = None,
    allow_external_links: Optional[bool] = None,
    enable_web_search: Optional[bool] = None,
    show_sources: Optional[bool] = None,
    scrape_options: Optional[ScrapeOptions] = None,
    ignore_invalid_urls: Optional[bool] = None,
    poll_interval: int = 2,
    timeout: Optional[int] = None,
    integration: Optional[str] = None,
    agent: Optional[AgentOptions] = None,
) -> ExtractResponse:
    started = start_extract(
        client,
        urls,
        prompt=prompt,
        schema=schema,
        system_prompt=system_prompt,
        allow_external_links=allow_external_links,
        enable_web_search=enable_web_search,
        show_sources=show_sources,
        scrape_options=scrape_options,
        ignore_invalid_urls=ignore_invalid_urls,
        integration=integration,
        agent=agent,
    )
    job_id = getattr(started, "id", None)
    if not job_id:
        return started
    return wait_extract(client, job_id, poll_interval=poll_interval, timeout=timeout)
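For orientation, here is a minimal sketch of how the blocking `extract` helper above composes `start_extract`, `get_extract_status`, and `wait_extract`. It assumes `client` is an `HttpClient` already configured with the Firecrawl API base URL and key (its construction is not shown in this hunk), and the schema is a hypothetical example; the import path simply follows the file layout in the listing above.

```python
# Sketch only: `client` is assumed to be a pre-configured HttpClient instance.
from firecrawl.v2.methods.extract import extract

# Hypothetical JSON Schema describing the fields we want back.
price_schema = {
    "type": "object",
    "properties": {"product": {"type": "string"}, "price": {"type": "number"}},
}

result = extract(
    client,
    ["https://example.com/product"],
    prompt="Extract the product name and price.",
    schema=price_schema,
    poll_interval=2,  # seconds between get_extract_status() calls
    timeout=120,      # stop waiting (and return the last status) after ~2 minutes
)
print(result.status)
print(getattr(result, "data", None))  # extracted fields, if the job completed
```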
firecrawl/v2/methods/map.py
@@ -0,0 +1,83 @@
"""
Mapping functionality for Firecrawl v2 API.
"""

from typing import Optional, Dict, Any
from ..types import MapOptions, MapData, LinkResult
from ..utils import HttpClient, handle_response_error


def _prepare_map_request(url: str, options: Optional[MapOptions] = None) -> Dict[str, Any]:
    if not url or not url.strip():
        raise ValueError("URL cannot be empty")

    payload: Dict[str, Any] = {"url": url.strip()}

    if options is not None:
        # Unified sitemap parameter already provided in options
        data: Dict[str, Any] = {}
        if getattr(options, "sitemap", None) is not None:
            data["sitemap"] = options.sitemap

        if options.search is not None:
            data["search"] = options.search
        if options.include_subdomains is not None:
            data["includeSubdomains"] = options.include_subdomains
        if options.ignore_query_parameters is not None:
            data["ignoreQueryParameters"] = options.ignore_query_parameters
        if options.limit is not None:
            data["limit"] = options.limit
        if options.timeout is not None:
            data["timeout"] = options.timeout
        if options.integration is not None and options.integration.strip():
            data["integration"] = options.integration.strip()
        if options.location is not None:
            data["location"] = options.location.model_dump(exclude_none=True)
        payload.update(data)

    return payload


def map(client: HttpClient, url: str, options: Optional[MapOptions] = None) -> MapData:
    """
    Map a URL and return MapData (links list with optional titles/descriptions).
    """
    request_data = _prepare_map_request(url, options)
    response = client.post("/v2/map", request_data)
    if not response.ok:
        handle_response_error(response, "map")

    body = response.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error occurred"))

    # shouldnt return inside data?
    # data = body.get("data", {})
    # result_links: list[LinkResult] = []
    # for item in data.get("links", []):
    #     if isinstance(item, dict):
    #         result_links.append(
    #             LinkResult(
    #                 url=item.get("url", ""),
    #                 title=item.get("title"),
    #                 description=item.get("description"),
    #             )
    #         )
    #     elif isinstance(item, str):
    #         result_links.append(LinkResult(url=item))

    result_links: list[LinkResult] = []
    for item in body.get("links", []):
        if isinstance(item, dict):
            result_links.append(
                LinkResult(
                    url=item.get("url", ""),
                    title=item.get("title"),
                    description=item.get("description"),
                )
            )
        elif isinstance(item, str):
            result_links.append(LinkResult(url=item))

    return MapData(links=result_links)
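A minimal sketch of calling the `map` function above. `client` is again assumed to be a configured `HttpClient`, and since `MapOptions` lives in `firecrawl/v2/types.py` (not shown in this hunk), the keyword arguments below are inferred from the attributes `_prepare_map_request` reads; the module-level `map` is imported under an alias to avoid shadowing the builtin.

```python
# Sketch only: `client` is assumed to be a pre-configured HttpClient instance;
# MapOptions fields are inferred from the attributes read in _prepare_map_request.
from firecrawl.v2.methods.map import map as map_url
from firecrawl.v2.types import MapOptions

options = MapOptions(search="docs", include_subdomains=False, limit=50)
result = map_url(client, "https://example.com", options)
for link in result.links:
    print(link.url, link.title)
```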
firecrawl/v2/methods/scrape.py
@@ -0,0 +1,64 @@
"""
Scraping functionality for Firecrawl v2 API.
"""

from typing import Optional, Dict, Any
from ..types import ScrapeOptions, Document
from ..utils.normalize import normalize_document_input
from ..utils import HttpClient, handle_response_error, prepare_scrape_options, validate_scrape_options


def _prepare_scrape_request(url: str, options: Optional[ScrapeOptions] = None) -> Dict[str, Any]:
    """
    Prepare a scrape request payload for v2 API.

    Args:
        url: URL to scrape
        options: ScrapeOptions (snake_case) to convert and include

    Returns:
        Request payload dictionary with camelCase fields
    """
    if not url or not url.strip():
        raise ValueError("URL cannot be empty")

    request_data: Dict[str, Any] = {"url": url.strip()}

    if options is not None:
        validated = validate_scrape_options(options)
        if validated is not None:
            opts = prepare_scrape_options(validated)
            if opts:
                request_data.update(opts)

    return request_data

def scrape(client: HttpClient, url: str, options: Optional[ScrapeOptions] = None) -> Document:
    """
    Scrape a single URL and return the document.

    The v2 API returns: { success: boolean, data: Document }
    We surface just the Document to callers.

    Args:
        client: HTTP client instance
        url: URL to scrape
        options: Scraping options (snake_case)

    Returns:
        Document
    """
    payload = _prepare_scrape_request(url, options)

    response = client.post("/v2/scrape", payload)

    if not response.ok:
        handle_response_error(response, "scrape")

    body = response.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error occurred"))

    document_data = body.get("data", {})
    normalized = normalize_document_input(document_data)
    return Document(**normalized)
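A minimal sketch of using the `scrape` function above. It assumes `client` is a configured `HttpClient`; `ScrapeOptions` is defined in `firecrawl/v2/types.py` (not shown in this hunk), so the `formats` field and the `markdown` attribute on the returned `Document` are assumptions based on the snake_case-to-camelCase conversion this module delegates to `prepare_scrape_options`.

```python
# Sketch only: `client` is assumed to be a pre-configured HttpClient instance;
# the ScrapeOptions field and Document attribute used here are assumptions.
from firecrawl.v2.methods.scrape import scrape
from firecrawl.v2.types import ScrapeOptions

doc = scrape(client, "https://example.com", ScrapeOptions(formats=["markdown"]))
print((getattr(doc, "markdown", "") or "")[:200])  # first 200 chars of markdown, if present
```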
firecrawl/v2/methods/search.py
@@ -0,0 +1,215 @@
"""
Search functionality for Firecrawl v2 API.
"""

import re
from typing import Dict, Any, Union, List, TypeVar, Type
from ..types import SearchRequest, SearchData, Document, SearchResultWeb, SearchResultNews, SearchResultImages
from ..utils.normalize import normalize_document_input, _map_search_result_keys
from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options

T = TypeVar("T")

def search(
    client: HttpClient,
    request: SearchRequest
) -> SearchData:
    """
    Search for documents.

    Args:
        client: HTTP client instance
        request: Search request

    Returns:
        SearchData with search results grouped by source type

    Raises:
        FirecrawlError: If the search operation fails
    """
    request_data = _prepare_search_request(request)
    try:
        response = client.post("/v2/search", request_data)
        if response.status_code != 200:
            handle_response_error(response, "search")
        response_data = response.json()
        if not response_data.get("success"):
            handle_response_error(response, "search")
        data = response_data.get("data", {}) or {}
        out = SearchData()
        if "web" in data:
            out.web = _transform_array(data["web"], SearchResultWeb)
        if "news" in data:
            out.news = _transform_array(data["news"], SearchResultNews)
        if "images" in data:
            out.images = _transform_array(data["images"], SearchResultImages)
        return out
    except Exception as err:
        # If the error is an HTTP error from requests, handle it
        # (simulate isAxiosError by checking for requests' HTTPError or Response)
        if hasattr(err, "response"):
            handle_response_error(getattr(err, "response"), "search")
        raise err

def _transform_array(arr: List[Any], result_type: Type[T]) -> List[Union[T, 'Document']]:
    """
    Transforms an array of items into a list of result_type or Document.
    If the item dict contains any of the special keys, it is treated as a Document.
    Otherwise, it is treated as result_type.
    If the item is not a dict, it is wrapped as result_type with url=item.
    """
    results: List[Union[T, 'Document']] = []
    for item in arr:
        if item and isinstance(item, dict):
            if (
                "markdown" in item or
                "html" in item or
                "rawHtml" in item or
                "links" in item or
                "screenshot" in item or
                "changeTracking" in item or
                "summary" in item or
                "json" in item
            ):
                results.append(Document(**normalize_document_input(item)))
            else:
                result_type_name = None
                if result_type == SearchResultImages:
                    result_type_name = "images"
                elif result_type == SearchResultNews:
                    result_type_name = "news"
                elif result_type == SearchResultWeb:
                    result_type_name = "web"

                if result_type_name:
                    normalized_item = _map_search_result_keys(item, result_type_name)
                    results.append(result_type(**normalized_item))
                else:
                    results.append(result_type(**item))
        else:
            results.append(result_type(url=item))
    return results

def _validate_search_request(request: SearchRequest) -> SearchRequest:
    """
    Validate and normalize search request.

    Args:
        request: Search request to validate

    Returns:
        Validated request

    Raises:
        ValueError: If request is invalid
    """
    # Validate query
    if not request.query or not request.query.strip():
        raise ValueError("Query cannot be empty")

    # Validate limit
    if request.limit is not None:
        if request.limit <= 0:
            raise ValueError("Limit must be positive")
        if request.limit > 100:
            raise ValueError("Limit cannot exceed 100")

    # Validate timeout
    if request.timeout is not None:
        if request.timeout <= 0:
            raise ValueError("Timeout must be positive")
        if request.timeout > 300000:  # 5 minutes max
            raise ValueError("Timeout cannot exceed 300000ms (5 minutes)")

    # Validate sources (if provided)
    if request.sources is not None:
        valid_sources = {"web", "news", "images"}
        for source in request.sources:
            if isinstance(source, str):
                if source not in valid_sources:
                    raise ValueError(f"Invalid source type: {source}. Valid types: {valid_sources}")
            elif hasattr(source, 'type'):
                if source.type not in valid_sources:
                    raise ValueError(f"Invalid source type: {source.type}. Valid types: {valid_sources}")

    # Validate categories (if provided)
    if request.categories is not None:
        valid_categories = {"github", "research", "pdf"}
        for category in request.categories:
            if isinstance(category, str):
                if category not in valid_categories:
                    raise ValueError(f"Invalid category type: {category}. Valid types: {valid_categories}")
            elif hasattr(category, 'type'):
                if category.type not in valid_categories:
                    raise ValueError(f"Invalid category type: {category.type}. Valid types: {valid_categories}")

    # Validate location (if provided)
    if request.location is not None:
        if not isinstance(request.location, str) or len(request.location.strip()) == 0:
            raise ValueError("Location must be a non-empty string")

    # Validate tbs (time-based search, if provided)
    if request.tbs is not None:
        valid_tbs_values = {
            "qdr:h", "qdr:d", "qdr:w", "qdr:m", "qdr:y",  # Google time filters
            "d", "w", "m", "y"  # Short forms
        }

        if request.tbs in valid_tbs_values:
            pass  # Valid predefined value
        elif request.tbs.startswith("cdr:"):
            custom_date_pattern = r"^cdr:1,cd_min:\d{1,2}/\d{1,2}/\d{4},cd_max:\d{1,2}/\d{1,2}/\d{4}$"
            if not re.match(custom_date_pattern, request.tbs):
                raise ValueError(f"Invalid custom date range format: {request.tbs}. Expected format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
        else:
            raise ValueError(f"Invalid tbs value: {request.tbs}. Valid values: {valid_tbs_values} or custom date range format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")

    # Validate scrape_options (if provided)
    if request.scrape_options is not None:
        validate_scrape_options(request.scrape_options)

    return request


def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
    """
    Prepare a search request payload.

    Args:
        request: Search request

    Returns:
        Request payload dictionary
    """
    validated_request = _validate_search_request(request)
    data = validated_request.model_dump(exclude_none=True, by_alias=True)

    # Ensure default values are included only if not explicitly set to None
    if "limit" not in data and validated_request.limit is not None:
        data["limit"] = validated_request.limit
    if "timeout" not in data and validated_request.timeout is not None:
        data["timeout"] = validated_request.timeout

    # Handle snake_case to camelCase conversions manually
    # (Pydantic Field() aliases interfere with value assignment)

    # ignore_invalid_urls → ignoreInvalidURLs
    if validated_request.ignore_invalid_urls is not None:
        data["ignoreInvalidURLs"] = validated_request.ignore_invalid_urls
        data.pop("ignore_invalid_urls", None)

    # scrape_options → scrapeOptions
    if validated_request.scrape_options is not None:
        scrape_data = prepare_scrape_options(validated_request.scrape_options)
        if scrape_data:
            data["scrapeOptions"] = scrape_data
        data.pop("scrape_options", None)

    # Only include integration if it was explicitly provided and non-empty
    integration_value = getattr(validated_request, "integration", None)
    if integration_value is not None:
        integration_str = str(integration_value).strip()
        if integration_str:
            data["integration"] = integration_str

    return data
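A minimal sketch of a request that exercises the validation rules above (sources subset, limit bounds, and the `cdr:` custom date range form of `tbs`). `client` is assumed to be a configured `HttpClient`; `SearchRequest` is defined in `firecrawl/v2/types.py` (not shown), so the field names below are inferred from the attributes `_validate_search_request` reads.

```python
# Sketch only: `client` is assumed to be a pre-configured HttpClient instance;
# SearchRequest field names are inferred from the validator in this module.
from firecrawl.v2.methods.search import search
from firecrawl.v2.types import SearchRequest

req = SearchRequest(
    query="firecrawl sdk",
    sources=["web", "news"],  # must be a subset of {"web", "news", "images"}
    limit=10,                 # validator rejects values <= 0 or > 100
    tbs="cdr:1,cd_min:01/01/2024,cd_max:06/30/2024",  # custom date range form
)
results = search(client, req)
for item in (results.web or []):
    print(getattr(item, "url", item))
```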
firecrawl/v2/methods/usage.py
@@ -0,0 +1,84 @@
from ..utils import HttpClient, handle_response_error
from ..types import ConcurrencyCheck, CreditUsage, QueueStatusResponse, TokenUsage, CreditUsageHistoricalResponse, TokenUsageHistoricalResponse


def get_concurrency(client: HttpClient) -> ConcurrencyCheck:
    resp = client.get("/v2/concurrency-check")
    if not resp.ok:
        handle_response_error(resp, "get concurrency")
    body = resp.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error"))
    data = body.get("data", body)
    return ConcurrencyCheck(
        concurrency=data.get("concurrency"),
        max_concurrency=data.get("maxConcurrency", data.get("max_concurrency")),
    )


def get_credit_usage(client: HttpClient) -> CreditUsage:
    resp = client.get("/v2/team/credit-usage")
    if not resp.ok:
        handle_response_error(resp, "get credit usage")
    body = resp.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error"))
    data = body.get("data", body)
    return CreditUsage(
        remaining_credits=data.get("remainingCredits", data.get("remaining_credits", 0)),
        plan_credits=data.get("planCredits", data.get("plan_credits")),
        billing_period_start=data.get("billingPeriodStart", data.get("billing_period_start")),
        billing_period_end=data.get("billingPeriodEnd", data.get("billing_period_end")),
    )


def get_token_usage(client: HttpClient) -> TokenUsage:
    resp = client.get("/v2/team/token-usage")
    if not resp.ok:
        handle_response_error(resp, "get token usage")
    body = resp.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error"))
    data = body.get("data", body)
    return TokenUsage(
        remaining_tokens=data.get("remainingTokens", data.get("remaining_tokens", 0)),
        plan_tokens=data.get("planTokens", data.get("plan_tokens")),
        billing_period_start=data.get("billingPeriodStart", data.get("billing_period_start")),
        billing_period_end=data.get("billingPeriodEnd", data.get("billing_period_end")),
    )

def get_queue_status(client: HttpClient) -> QueueStatusResponse:
    resp = client.get("/v2/team/queue-status")
    if not resp.ok:
        handle_response_error(resp, "get queue status")
    body = resp.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error"))
    data = body.get("data", body)
    return QueueStatusResponse(
        jobs_in_queue=data.get("jobsInQueue", 0),
        active_jobs_in_queue=data.get("activeJobsInQueue", 0),
        waiting_jobs_in_queue=data.get("waitingJobsInQueue", 0),
        max_concurrency=data.get("maxConcurrency", 0),
        most_recent_success=data.get("mostRecentSuccess", None),
    )


def get_credit_usage_historical(client: HttpClient, by_api_key: bool = False) -> CreditUsageHistoricalResponse:
    resp = client.get(f"/v2/team/credit-usage/historical{'?byApiKey=true' if by_api_key else ''}")
    if not resp.ok:
        handle_response_error(resp, "get credit usage historical")
    body = resp.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error"))
    return CreditUsageHistoricalResponse(**body)


def get_token_usage_historical(client: HttpClient, by_api_key: bool = False) -> TokenUsageHistoricalResponse:
    resp = client.get(f"/v2/team/token-usage/historical{'?byApiKey=true' if by_api_key else ''}")
    if not resp.ok:
        handle_response_error(resp, "get token usage historical")
    body = resp.json()
    if not body.get("success"):
        raise Exception(body.get("error", "Unknown error"))
    return TokenUsageHistoricalResponse(**body)
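A minimal sketch of calling the usage helpers above, again assuming `client` is a configured `HttpClient`; the attribute names come from the keyword arguments used to build the response models in this module.

```python
# Sketch only: `client` is assumed to be a pre-configured HttpClient instance.
from firecrawl.v2.methods.usage import (
    get_credit_usage,
    get_credit_usage_historical,
    get_queue_status,
)

credits = get_credit_usage(client)
queue = get_queue_status(client)
print(f"remaining credits: {credits.remaining_credits}")
print(f"jobs in queue: {queue.jobs_in_queue} (max concurrency {queue.max_concurrency})")

# The historical endpoints accept an optional byApiKey query flag:
history = get_credit_usage_historical(client, by_api_key=True)
```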