webscout 2025.10.14.1__py3-none-any.whl → 2025.10.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/Extra/YTToolkit/README.md +1 -1
- webscout/Extra/tempmail/README.md +3 -3
- webscout/Provider/OPENAI/README.md +1 -1
- webscout/Provider/TTI/bing.py +4 -4
- webscout/__init__.py +1 -1
- webscout/cli.py +0 -147
- webscout/client.py +4 -5
- webscout/litprinter/__init__.py +0 -42
- webscout/scout/README.md +59 -8
- webscout/scout/core/scout.py +62 -0
- webscout/scout/element.py +251 -45
- webscout/search/__init__.py +5 -8
- webscout/search/bing_main.py +42 -0
- webscout/search/engines/bing/__init__.py +1 -0
- webscout/search/engines/bing/base.py +33 -0
- webscout/search/engines/bing/images.py +108 -0
- webscout/search/engines/bing/news.py +91 -0
- webscout/search/engines/bing/suggestions.py +34 -0
- webscout/search/engines/bing/text.py +106 -0
- webscout/search/engines/duckduckgo/maps.py +13 -0
- webscout/search/engines/yahoo/__init__.py +41 -0
- webscout/search/engines/yahoo/answers.py +16 -0
- webscout/search/engines/yahoo/base.py +34 -0
- webscout/search/engines/yahoo/images.py +324 -0
- webscout/search/engines/yahoo/maps.py +16 -0
- webscout/search/engines/yahoo/news.py +258 -0
- webscout/search/engines/yahoo/suggestions.py +140 -0
- webscout/search/engines/yahoo/text.py +273 -0
- webscout/search/engines/yahoo/translate.py +16 -0
- webscout/search/engines/yahoo/videos.py +302 -0
- webscout/search/engines/yahoo/weather.py +220 -0
- webscout/search/http_client.py +1 -1
- webscout/search/yahoo_main.py +54 -0
- webscout/{auth → server}/__init__.py +2 -23
- webscout/server/config.py +84 -0
- webscout/{auth → server}/request_processing.py +3 -28
- webscout/{auth → server}/routes.py +14 -170
- webscout/server/schemas.py +23 -0
- webscout/{auth → server}/server.py +11 -43
- webscout/server/simple_logger.py +84 -0
- webscout/version.py +1 -1
- webscout/version.py.bak +1 -1
- webscout/zeroart/README.md +17 -9
- webscout/zeroart/__init__.py +78 -6
- webscout/zeroart/effects.py +51 -1
- webscout/zeroart/fonts.py +559 -1
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/METADATA +15 -332
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/RECORD +55 -48
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/entry_points.txt +1 -1
- webscout/Bing_search.py +0 -417
- webscout/DWEBS.py +0 -529
- webscout/auth/api_key_manager.py +0 -189
- webscout/auth/auth_system.py +0 -85
- webscout/auth/config.py +0 -175
- webscout/auth/database.py +0 -755
- webscout/auth/middleware.py +0 -248
- webscout/auth/models.py +0 -185
- webscout/auth/rate_limiter.py +0 -254
- webscout/auth/schemas.py +0 -103
- webscout/auth/simple_logger.py +0 -236
- webscout/search/engines/bing.py +0 -84
- webscout/search/engines/bing_news.py +0 -52
- webscout/search/engines/yahoo.py +0 -65
- webscout/search/engines/yahoo_news.py +0 -64
- /webscout/{auth → server}/exceptions.py +0 -0
- /webscout/{auth → server}/providers.py +0 -0
- /webscout/{auth → server}/request_models.py +0 -0
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/WHEEL +0 -0
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-2025.10.14.1.dist-info → webscout-2025.10.16.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""Yahoo news search engine with comprehensive features."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from secrets import token_urlsafe
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from .base import YahooSearchEngine
|
|
10
|
+
from ...results import NewsResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def extract_image(u: str) -> str:
    """Sanitize an image URL, rejecting inline data URIs.

    Args:
        u: Raw image URL scraped from the results page (may be empty).

    Returns:
        The URL unchanged, or an empty string when it is missing or is
        an inline ``data:image`` URI (not a usable remote location).
    """
    # A missing value and an embedded data URI are equally unusable,
    # so both collapse to the empty string.
    if not u or u.startswith("data:image"):
        return ""
    return u
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_source(s: str) -> str:
    """Strip Yahoo attribution suffixes from a news source name.

    Args:
        s: Raw source string as scraped from the results page.

    Returns:
        The source with any " via Yahoo" / " - Yahoo" markers removed
        and surrounding whitespace trimmed; falsy input is returned as-is.
    """
    if not s:
        return s

    cleaned = s
    for marker in (" via Yahoo", " - Yahoo"):
        cleaned = cleaned.replace(marker, "")
    return cleaned.strip()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class YahooNews(YahooSearchEngine[NewsResult]):
    """Yahoo news search engine with advanced filtering.

    Features:
    - Time-based filtering ("d", "w", "m")
    - Category and sort filtering via keyword arguments
    - Pagination support (Yahoo 'b' offset parameter)
    - Rich metadata extraction (date, source, image)
    """

    name = "yahoo"
    category = "news"

    search_url = "https://news.search.yahoo.com/search"
    search_method = "GET"

    # XPath selectors for news articles on the results page.
    items_xpath = "//div[contains(@class, 'NewsArticle') or contains(@class, 'dd') and contains(@class, 'algo')]"
    elements_xpath: Mapping[str, str] = {
        "date": ".//span[contains(@class, 'fc-2nd') or contains(@class, 'age') or contains(@class, 's-time')]//text()",
        "title": ".//h4//a//text() | .//h3//a//text()",
        "url": ".//h4//a/@href | .//h3//a/@href",
        "body": ".//p//text() | .//div[contains(@class, 'compText')]//text()",
        "image": ".//img/@src",
        "source": ".//span[contains(@class, 's-source') or contains(@class, 'source')]//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Build news search payload.

        Args:
            query: Search query
            region: Region code (accepted for interface parity; not
                encoded into the payload -- TODO confirm intended)
            safesearch: Safe search level (not encoded into the payload)
            timelimit: Time filter (d, w, m)
            page: Page number (1-indexed)
            **kwargs: Additional parameters ("category", "sort")

        Returns:
            Query parameters dictionary
        """
        # Refresh the search URL with random _ylt/_ylu tracking tokens so
        # each request carries fresh-looking path segments.
        self.search_url = (
            f"https://news.search.yahoo.com/search"
            f";_ylt={token_urlsafe(24 * 3 // 4)}"
            f";_ylu={token_urlsafe(47 * 3 // 4)}"
        )

        payload: dict[str, Any] = {
            "p": query,
            "ei": "UTF-8",
        }

        # Pagination - Yahoo news uses the 'b' (begin offset) parameter;
        # each page shows approximately 10 articles.
        if page > 1:
            payload["b"] = f"{(page - 1) * 10 + 1}"

        # Time filter: map the short codes onto Yahoo's 'btf' values.
        if timelimit:
            time_map = {
                "d": "1d",  # Past 24 hours
                "w": "1w",  # Past week
                "m": "1m",  # Past month
            }
            if timelimit in time_map:
                payload["btf"] = time_map[timelimit]

        # Optional pass-through filters.
        if "category" in kwargs:
            payload["category"] = kwargs["category"]

        if "sort" in kwargs:
            # Sort by relevance or date.
            payload["sort"] = kwargs["sort"]

        return payload

    @staticmethod
    def _decode_redirect_url(url: str) -> str:
        """Decode a Yahoo '/RU=' redirect link into its target URL.

        Yahoo wraps outbound links as '.../RU=<quoted-url>/RK=...'; the
        destination is percent-decoded from between those markers.

        Args:
            url: Possibly-wrapped result URL.

        Returns:
            The decoded target URL, or the input unchanged when it is
            empty or not a redirect.
        """
        from urllib.parse import unquote

        if url and "/RU=" in url:
            start = url.find("/RU=") + 4
            end = url.find("/RK=", start)
            if end == -1:
                end = len(url)
            return unquote(url[start:end])
        return url

    def post_extract_results(self, results: list[NewsResult]) -> list[NewsResult]:
        """Post-process news results.

        Cleans image/source fields, resolves redirect URLs, and drops
        entries missing a title or URL.

        Args:
            results: Raw extracted results

        Returns:
            Cleaned news results
        """
        cleaned_results = []

        for result in results:
            # Clean image URL and source name.
            result.image = extract_image(result.image)
            result.source = extract_source(result.source)

            # Resolve the real article URL from Yahoo's redirect wrapper.
            # (Previously the urllib import and decode logic ran inline
            # inside this loop on every iteration.)
            result.url = self._decode_redirect_url(result.url)

            # Filter out results without essential fields.
            if result.title and result.url:
                cleaned_results.append(result)

        return cleaned_results

    def search(
        self,
        query: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        page: int = 1,
        max_results: int | None = None,
        **kwargs: Any,
    ) -> list[NewsResult] | None:
        """Search Yahoo News with pagination.

        Args:
            query: News search query
            region: Region code
            safesearch: Safe search level
            timelimit: Time filter (d, w, m)
            page: Starting page
            max_results: Maximum results to return
            **kwargs: Additional parameters (category, sort, max_pages)

        Returns:
            List of NewsResult objects, or None when nothing was found
        """
        results: list[NewsResult] = []
        current_page = page
        max_pages = kwargs.get("max_pages", 10)  # guard against endless paging

        while current_page <= max_pages:
            payload = self.build_payload(
                query=query,
                region=region,
                safesearch=safesearch,
                timelimit=timelimit,
                page=current_page,
                **kwargs
            )

            html_text = self.request(self.search_method, self.search_url, params=payload)
            if not html_text:
                break

            html_text = self.pre_process_html(html_text)
            page_results = self.extract_results(html_text)

            # An empty page means Yahoo ran out of results.
            if not page_results:
                break

            results.extend(page_results)

            # Stop fetching once enough raw results were gathered; the
            # final trim to max_results happens after post-processing.
            if max_results and len(results) >= max_results:
                break

            current_page += 1

        results = self.post_extract_results(results)

        if max_results:
            results = results[:max_results]

        return results if results else None

    def run(
        self,
        keywords: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        max_results: int | None = None,
    ) -> list[dict[str, str]]:
        """Run news search and return results as dictionaries.

        Args:
            keywords: Search query.
            region: Region code.
            safesearch: Safe search level.
            timelimit: Time filter.
            max_results: Maximum number of results.

        Returns:
            List of news result dictionaries (empty when nothing found).
        """
        results = self.search(
            query=keywords,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
        )
        if results is None:
            return []
        return [result.to_dict() for result in results]
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Yahoo search suggestions engine."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from .base import YahooSearchEngine
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class YahooSuggestions(YahooSearchEngine[str]):
    """Yahoo autocomplete suggestions engine.

    Queries Yahoo's "gossip" endpoint for search-as-you-type
    completions of a partial query.
    """

    name = "yahoo"
    category = "suggestions"

    search_url = "https://search.yahoo.com/sugg/gossip/gossip-us-ura"
    search_method = "GET"

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Assemble query parameters for the suggestions endpoint.

        Args:
            query: Partial search query
            region: Region code (unused by this endpoint)
            safesearch: Safe search level (unused)
            timelimit: Time limit (unused)
            page: Page number (unused)
            **kwargs: May carry "max_suggestions" to cap the count

        Returns:
            Query parameter mapping
        """
        return {
            "command": query,
            "output": "sd1",
            "nresults": kwargs.get("max_suggestions", 10),
        }

    def extract_results(self, html_text: str) -> list[str]:
        """Parse suggestion strings out of the JSON response.

        Args:
            html_text: Raw JSON response body

        Returns:
            Suggestion strings; empty list on malformed input
        """
        try:
            data = json.loads(html_text)

            # Yahoo places suggestions under the 'r' key; entries are
            # either {"k": <text>} objects or bare strings.
            entries = data["r"] if "r" in data else None
            if not isinstance(entries, list):
                return []

            collected: list[str] = []
            for entry in entries:
                if isinstance(entry, dict) and "k" in entry:
                    collected.append(entry["k"])
                elif isinstance(entry, str):
                    collected.append(entry)
            return collected
        except (json.JSONDecodeError, KeyError, TypeError):
            return []

    def search(
        self,
        query: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        page: int = 1,
        max_results: int | None = None,
        **kwargs: Any,
    ) -> list[str] | None:
        """Fetch autocomplete suggestions for a partial query.

        Args:
            query: Partial search query
            region: Region code
            safesearch: Safe search level
            timelimit: Time limit
            page: Page number
            max_results: Maximum suggestions to return
            **kwargs: Additional parameters

        Returns:
            Suggestion strings, or None when none were returned
        """
        if max_results:
            kwargs["max_suggestions"] = max_results

        params = self.build_payload(
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            page=page,
            **kwargs
        )

        body = self.request(self.search_method, self.search_url, params=params)
        if not body:
            return None

        found = self.extract_results(body)
        if max_results:
            found = found[:max_results]

        return found or None

    def run(self, keywords: str, region: str = "us-en") -> list[str]:
        """Return suggestions for *keywords* as a plain list.

        Args:
            keywords: Search query.
            region: Region code.

        Returns:
            Suggestion strings; empty list when the lookup fails.
        """
        found = self.search(
            query=keywords,
            region=region,
        )
        return found if found else []
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
"""Yahoo text search engine with pagination support."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from typing import Any
|
|
7
|
+
from urllib.parse import unquote_plus, urljoin
|
|
8
|
+
|
|
9
|
+
from .base import YahooSearchEngine
|
|
10
|
+
from ...results import TextResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def extract_url(u: str) -> str:
    """Resolve a Yahoo '/RU=' redirect link to its real target.

    Yahoo result links wrap the destination as
    '.../RU=<percent-encoded-url>/RK=...'; the target is decoded from
    between those two markers. Anything else passes through untouched.

    Args:
        u: Raw (possibly wrapped) result URL.

    Returns:
        The decoded target URL, or the input unchanged.
    """
    if not u:
        return u

    marker = u.find("/RU=")
    if marker == -1:
        # Not a redirect wrapper; return as-is.
        return u

    start = marker + 4
    end = u.find("/RK=", start)
    encoded = u[start:end] if end != -1 else u[start:]
    return unquote_plus(encoded)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class YahooText(YahooSearchEngine[TextResult]):
    """Yahoo text search engine with full pagination support.

    Features:
    - Multi-page navigation like a human
    - Automatic next page detection
    - Clean result extraction
    - Time filter support
    - Region support
    """

    name = "yahoo"
    category = "text"

    search_url = "https://search.yahoo.com/search"
    search_method = "GET"

    # XPath selectors for result extraction
    items_xpath = "//div[contains(@class, 'compTitle')]"
    elements_xpath: Mapping[str, str] = {
        "title": ".//h3//span//text()",
        "href": ".//a/@href",
        # Snippet text lives in the sibling 'compText' div, outside the
        # 'compTitle' item node itself.
        "body": "./following-sibling::div[contains(@class, 'compText')]//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: Any
    ) -> dict[str, Any]:
        """Build search payload for Yahoo.

        Args:
            query: Search query string
            region: Region code (e.g., 'us-en')
            safesearch: Safe search level
            timelimit: Time limit filter (d=day, w=week, m=month)
            page: Page number (1-indexed)
            **kwargs: Additional parameters

        Returns:
            Dictionary of query parameters
        """
        # NOTE(review): region and safesearch are accepted for interface
        # parity but never encoded into the payload -- confirm intended.
        payload = {
            "p": query,
            "ei": "UTF-8",
        }

        # Pagination: Yahoo uses 'b' parameter for offset
        # Page 1: no b parameter or b=1
        # Page 2: b=8 (shows results 8-14)
        # Page 3: b=15, etc.
        if page > 1:
            payload["b"] = f"{(page - 1) * 7 + 1}"

        # Time filter
        if timelimit:
            # Passed straight through as Yahoo's 'btf' value (no d/w/m
            # -> 1d/1w/1m mapping here, unlike the news engine).
            payload["btf"] = timelimit

        return payload

    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
        """Post-process and clean extracted results.

        Decodes Yahoo '/RU=' redirect hrefs via :func:`extract_url` and
        drops entries missing a title or href.

        Args:
            results: Raw extracted results

        Returns:
            Cleaned and filtered results
        """
        cleaned_results = []

        for result in results:
            # Extract real URL from redirect
            if result.href:
                result.href = extract_url(result.href)

            # Filter out empty results
            if result.title and result.href:
                cleaned_results.append(result)

        return cleaned_results

    def search(
        self,
        query: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        page: int = 1,
        max_results: int | None = None,
        **kwargs: Any,
    ) -> list[TextResult] | None:
        """Search Yahoo with automatic pagination like a human browser.

        This method automatically follows pagination links to gather results
        across multiple pages, similar to how a human would browse search results.

        Args:
            query: Search query string
            region: Region code
            safesearch: Safe search level
            timelimit: Time filter (d=day, w=week, m=month, y=year)
            page: Starting page number
            max_results: Maximum number of results to return
            **kwargs: Additional search parameters (incl. "max_pages",
                default 10)

        Returns:
            List of TextResult objects, or None if search fails
        """
        results = []
        current_page = page
        max_pages = kwargs.get("max_pages", 10)  # Limit to prevent infinite loops

        while current_page <= max_pages:
            # Build payload for current page
            payload = self.build_payload(
                query=query,
                region=region,
                safesearch=safesearch,
                timelimit=timelimit,
                page=current_page,
                **kwargs
            )

            # Make request
            html_text = self.request(self.search_method, self.search_url, params=payload)
            if not html_text:
                break

            # Pre-process HTML
            html_text = self.pre_process_html(html_text)

            # Extract results from current page
            page_results = self.extract_results(html_text)
            if not page_results:
                break

            results.extend(page_results)

            # Check if we have enough results
            if max_results and len(results) >= max_results:
                break

            # Look for next page link. The link targets themselves are
            # never followed; their mere presence signals another page
            # exists, and the next request recomputes the 'b' offset
            # from the page number in build_payload.
            tree = self.extract_tree(html_text)
            next_links = tree.xpath("//a[contains(text(), 'Next') or contains(@class, 'next')]/@href")

            if not next_links:
                # Try to find numbered page links
                page_links = tree.xpath(f"//a[contains(text(), '{current_page + 1}')]/@href")
                if not page_links:
                    break

            current_page += 1

        # Post-process all results
        results = self.post_extract_results(results)

        # Trim to max_results if specified
        if max_results:
            results = results[:max_results]

        return results if results else None

    def search_page(
        self,
        query: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        page: int = 1,
        **kwargs: Any,
    ) -> list[TextResult] | None:
        """Search a single page (for compatibility).

        Unlike :meth:`search`, no pagination is attempted: exactly one
        request is issued for the given page.

        Args:
            query: Search query
            region: Region code
            safesearch: Safe search level
            timelimit: Time filter
            page: Page number
            **kwargs: Additional parameters

        Returns:
            List of results from the specified page, or None on failure
        """
        payload = self.build_payload(
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            page=page,
            **kwargs
        )

        html_text = self.request(self.search_method, self.search_url, params=payload)
        if not html_text:
            return None

        html_text = self.pre_process_html(html_text)
        results = self.extract_results(html_text)

        return self.post_extract_results(results) if results else None

    def run(
        self,
        keywords: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        backend: str = "auto",
        max_results: int | None = None,
    ) -> list[dict[str, str]]:
        """Run text search and return results as dictionaries.

        Args:
            keywords: Search query.
            region: Region code.
            safesearch: Safe search level.
            timelimit: Time filter.
            backend: Backend type (ignored for Yahoo).
            max_results: Maximum number of results.

        Returns:
            List of search result dictionaries (empty when nothing found).
        """
        results = self.search(
            query=keywords,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
        )
        if results is None:
            return []
        return [result.to_dict() for result in results]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Yahoo translate search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import YahooSearchEngine
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class YahooTranslate(YahooSearchEngine):
    """Placeholder engine: Yahoo offers no translation service."""

    def run(self, *args, **kwargs) -> list[dict[str, str]]:
        """Always raise, since translation is unavailable on Yahoo.

        Raises:
            NotImplementedError: On every call.
        """
        raise NotImplementedError("Yahoo does not support translation")
|