webscout 2025.10.15__py3-none-any.whl → 2025.10.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/Extra/YTToolkit/README.md +1 -1
- webscout/Extra/tempmail/README.md +3 -3
- webscout/Provider/ClaudeOnline.py +350 -0
- webscout/Provider/OPENAI/README.md +1 -1
- webscout/Provider/TTI/bing.py +4 -4
- webscout/Provider/TTI/claudeonline.py +315 -0
- webscout/__init__.py +1 -1
- webscout/client.py +4 -5
- webscout/litprinter/__init__.py +0 -42
- webscout/scout/README.md +59 -8
- webscout/scout/core/scout.py +62 -0
- webscout/scout/element.py +251 -45
- webscout/search/__init__.py +3 -4
- webscout/search/engines/bing/images.py +5 -2
- webscout/search/engines/bing/news.py +6 -4
- webscout/search/engines/bing/text.py +5 -2
- webscout/search/engines/yahoo/__init__.py +41 -0
- webscout/search/engines/yahoo/answers.py +16 -0
- webscout/search/engines/yahoo/base.py +34 -0
- webscout/search/engines/yahoo/images.py +324 -0
- webscout/search/engines/yahoo/maps.py +16 -0
- webscout/search/engines/yahoo/news.py +258 -0
- webscout/search/engines/yahoo/suggestions.py +140 -0
- webscout/search/engines/yahoo/text.py +273 -0
- webscout/search/engines/yahoo/translate.py +16 -0
- webscout/search/engines/yahoo/videos.py +302 -0
- webscout/search/engines/yahoo/weather.py +220 -0
- webscout/search/http_client.py +1 -1
- webscout/search/yahoo_main.py +54 -0
- webscout/{auth → server}/__init__.py +2 -23
- webscout/server/config.py +84 -0
- webscout/{auth → server}/request_processing.py +3 -28
- webscout/{auth → server}/routes.py +6 -148
- webscout/server/schemas.py +23 -0
- webscout/{auth → server}/server.py +11 -43
- webscout/server/simple_logger.py +84 -0
- webscout/version.py +1 -1
- webscout/version.py.bak +1 -1
- webscout/zeroart/README.md +17 -9
- webscout/zeroart/__init__.py +78 -6
- webscout/zeroart/effects.py +51 -1
- webscout/zeroart/fonts.py +559 -1
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/METADATA +11 -54
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/RECORD +51 -46
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/entry_points.txt +1 -1
- webscout/Extra/weather.md +0 -281
- webscout/auth/api_key_manager.py +0 -189
- webscout/auth/auth_system.py +0 -85
- webscout/auth/config.py +0 -175
- webscout/auth/database.py +0 -755
- webscout/auth/middleware.py +0 -248
- webscout/auth/models.py +0 -185
- webscout/auth/rate_limiter.py +0 -254
- webscout/auth/schemas.py +0 -103
- webscout/auth/simple_logger.py +0 -236
- webscout/search/engines/yahoo.py +0 -65
- webscout/search/engines/yahoo_news.py +0 -64
- /webscout/{auth → server}/exceptions.py +0 -0
- /webscout/{auth → server}/providers.py +0 -0
- /webscout/{auth → server}/request_models.py +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/WHEEL +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-2025.10.15.dist-info → webscout-2025.10.17.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
"""Yahoo image search engine with advanced filters."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from typing import Any
|
|
7
|
+
from urllib.parse import urljoin
|
|
8
|
+
|
|
9
|
+
from .base import YahooSearchEngine
|
|
10
|
+
from ...results import ImagesResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class YahooImages(YahooSearchEngine[ImagesResult]):
    """Yahoo image search engine with filter support.

    Features:
    - Size filters (small, medium, large, wallpaper)
    - Color filters (color, bw, red, orange, yellow, etc.)
    - Type filters (photo, clipart, lineart, transparent, gif)
    - Layout filters (square, wide, tall)
    - Time filters
    - Pagination support

    Note: Yahoo does not support reverse image search (searching by image upload/URL).
    For reverse image search functionality, use Google Images or Bing Images instead.
    """

    name = "yahoo"
    category = "images"

    search_url = "https://images.search.yahoo.com/search/images"
    search_method = "GET"
    search_headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1"
    }

    # XPath selectors.
    items_xpath = "//li[contains(@class, 'ld')]"
    # Every field is first filled with the item's raw ``data`` attribute;
    # post_extract_results() decodes that JSON string into the real values.
    elements_xpath: Mapping[str, str] = {
        "title": "@data",
        "image": "@data",
        "thumbnail": "@data",
        "url": "@data",
        "source": "@data",
        "width": "@data",
        "height": "@data",
    }

    # Filter mappings: public filter name -> value sent to Yahoo.
    # An empty value means "no restriction" and is never sent.
    SIZE_FILTERS = {
        "small": "small",
        "medium": "medium",
        "large": "large",
        "wallpaper": "wallpaper",
        "all": "",
    }

    COLOR_FILTERS = {
        "color": "color",
        "bw": "bw",
        "black": "black",
        "white": "white",
        "red": "red",
        "orange": "orange",
        "yellow": "yellow",
        "green": "green",
        "teal": "teal",
        "blue": "blue",
        "purple": "purple",
        "pink": "pink",
        "brown": "brown",
        "gray": "gray",
        "all": "",
    }

    TYPE_FILTERS = {
        "photo": "photo",
        "clipart": "clipart",
        "lineart": "linedrawing",
        "transparent": "transparent",
        "gif": "animatedgif",
        "all": "",
    }

    LAYOUT_FILTERS = {
        "square": "square",
        "wide": "wide",
        "tall": "tall",
        "all": "",
    }

    # Short time-limit code -> value of the ``age`` query parameter.
    _TIME_FILTERS = {
        "d": "1d",  # Past 24 hours
        "w": "1w",  # Past week
        "m": "1m",  # Past month
    }

    # (keyword argument, name of the filter table on the class, query parameter).
    # Tables are looked up by name so subclasses can still override them.
    _FILTER_PARAMS = (
        ("size", "SIZE_FILTERS", "imgsz"),
        ("color", "COLOR_FILTERS", "imgc"),
        ("type", "TYPE_FILTERS", "imgt"),
        ("layout", "LAYOUT_FILTERS", "imgsp"),
    )

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Build image search payload with filters.

        Args:
            query: Search query
            region: Region code (accepted for interface parity; not sent)
            safesearch: Safe search level (on/moderate/off); "moderate"
                adds no parameter
            timelimit: Time filter (d, w, m); other values are ignored
            page: Page number (1-based)
            **kwargs: Additional filters including:
                - size: Image size filter
                - color: Color filter
                - type: Image type filter (``type_image`` is accepted as
                  an alias)
                - layout: Layout/aspect ratio filter
                Unknown keys, unknown filter values and ``None`` are ignored.

        Returns:
            Query parameters dictionary
        """
        payload: dict[str, Any] = {"p": query}

        # Pagination - Yahoo images use the 'b' (1-based offset) parameter,
        # assuming approximately 40 images per page.
        if page > 1:
            payload["b"] = f"{(page - 1) * 40 + 1}"

        # Safe search
        if safesearch == "on":
            payload["safe"] = "active"
        elif safesearch == "off":
            payload["safe"] = "off"

        # Time filter
        if timelimit and timelimit in self._TIME_FILTERS:
            payload["age"] = self._TIME_FILTERS[timelimit]

        # run() historically spelled the type filter ``type_image``; honour
        # that spelling so the filter is not silently dropped.
        requested = dict(kwargs)
        if requested.get("type") is None and requested.get("type_image") is not None:
            requested["type"] = requested["type_image"]

        # Size / color / type / layout filters
        for kwarg_name, table_name, param in self._FILTER_PARAMS:
            choice = requested.get(kwarg_name)
            if not isinstance(choice, str):
                continue
            value = getattr(self, table_name).get(choice, "")
            if value:
                payload[param] = value

        return payload

    @staticmethod
    def _to_pixels(value: Any) -> int:
        """Convert a pixel dimension to ``int``; 0 when missing or malformed."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    @staticmethod
    def _page_host(url: Any) -> str:
        """Return the network location of *url*, or "" if it cannot be parsed."""
        from urllib.parse import urlsplit

        if not isinstance(url, str):
            return ""
        try:
            return urlsplit(url).netloc
        except ValueError:
            return ""

    def post_extract_results(self, results: list[ImagesResult]) -> list[ImagesResult]:
        """Post-process image results to parse JSON data.

        Each raw result carries the item's JSON ``data`` attribute in every
        field. When that JSON decodes, the real title, URLs and dimensions
        are written back; otherwise the record is left as extracted.

        Args:
            results: Raw extracted results

        Returns:
            Cleaned results with proper URLs and metadata
        """
        import json
        from urllib.parse import unquote

        cleaned_results = []

        for result in results:
            raw = result.title
            data = None

            # Parse JSON data from the data attribute
            if isinstance(raw, str) and raw.startswith("{"):
                try:
                    data = json.loads(raw)
                except json.JSONDecodeError:
                    data = None  # Not JSON after all: keep original data

            if isinstance(data, dict):
                # Extract title
                result.title = data.get("desc", "") or data.get("tit", "")

                # Extract URLs
                result.url = data.get("rurl", "")
                result.thumbnail = data.get("turl", "")
                result.image = data.get("turlL", "") or data.get("turl", "")

                # Extract dimensions; a null or malformed size must not
                # abort the whole search, so it degrades to 0.
                result.width = self._to_pixels(data.get("imgW", 0))
                result.height = self._to_pixels(data.get("imgH", 0))

            # Clean URLs if they exist
            for field in ("url", "image", "thumbnail"):
                value = getattr(result, field)
                if value and isinstance(value, str):
                    setattr(result, field, unquote(value))

            # ``source`` was extracted from the same attribute and would
            # otherwise still hold the raw JSON blob.
            # NOTE(review): the host of the referring page is used as the
            # source; confirm whether the payload has a dedicated key.
            if isinstance(data, dict) and getattr(result, "source", None) == raw:
                result.source = self._page_host(result.url)

            cleaned_results.append(result)

        return cleaned_results

    def search(
        self,
        query: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        page: int = 1,
        max_results: int | None = None,
        **kwargs: Any,
    ) -> list[ImagesResult] | None:
        """Search Yahoo Images with pagination.

        Args:
            query: Image search query
            region: Region code
            safesearch: Safe search level
            timelimit: Time filter
            page: Starting page
            max_results: Maximum results to return
            **kwargs: Additional filters (size, color, type, layout) plus
                ``max_pages``, the highest page number to fetch (default 5).
                The starting page is always fetched, even beyond ``max_pages``.

        Returns:
            List of ImagesResult objects, or None when nothing was found
        """
        filters = dict(kwargs)
        # ``max_pages`` steers the loop only; it is not a search filter.
        max_pages = filters.pop("max_pages", 5)
        last_page = max(page, max_pages)

        results: list[ImagesResult] = []
        for current_page in range(page, last_page + 1):
            payload = self.build_payload(
                query=query,
                region=region,
                safesearch=safesearch,
                timelimit=timelimit,
                page=current_page,
                **filters,
            )

            html_text = self.request(self.search_method, self.search_url, params=payload)
            if not html_text:
                break

            page_results = self.extract_results(self.pre_process_html(html_text))
            if not page_results:
                break

            results.extend(page_results)

            if max_results and len(results) >= max_results:
                break

        results = self.post_extract_results(results)

        if max_results:
            results = results[:max_results]

        return results if results else None

    def run(
        self,
        keywords: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        size: str | None = None,
        color: str | None = None,
        type_image: str | None = None,
        layout: str | None = None,
        license_image: str | None = None,
        max_results: int | None = None,
    ) -> list[dict[str, str]]:
        """Run image search and return results as dictionaries.

        Args:
            keywords: Search query.
            region: Region code.
            safesearch: Safe search level.
            timelimit: Time filter.
            size: Image size filter.
            color: Color filter.
            type_image: Image type filter.
            layout: Layout filter.
            license_image: License filter. Forwarded as ``license``;
                build_payload() does not map it to a query parameter yet.
            max_results: Maximum number of results.

        Returns:
            List of image result dictionaries (empty when nothing was found).
        """
        # Use the keyword names build_payload() actually reads ("type", not
        # "type_image") and drop filters the caller left unset.
        filters = {
            "size": size,
            "color": color,
            "type": type_image,
            "layout": layout,
            "license": license_image,
        }
        active_filters = {key: value for key, value in filters.items() if value is not None}

        results = self.search(
            query=keywords,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            **active_filters,
        )
        if results is None:
            return []
        return [result.to_dict() for result in results]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Yahoo maps search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import YahooSearchEngine
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class YahooMaps(YahooSearchEngine):
    """Maps search stub for Yahoo, which does not support maps search."""

    def run(self, *args, **kwargs) -> list[dict[str, str]]:
        """Reject the request: maps search is not supported.

        Any positional or keyword arguments are accepted and ignored.

        Raises:
            NotImplementedError: Always.
        """
        message = "Yahoo does not support maps search"
        raise NotImplementedError(message)
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""Yahoo news search engine with comprehensive features."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from secrets import token_urlsafe
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from .base import YahooSearchEngine
|
|
10
|
+
from ...results import NewsResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def extract_image(u: str) -> str:
    """Return a usable image URL, or "" when there is none.

    Args:
        u: Image URL as extracted from the page

    Returns:
        ``u`` unchanged, or an empty string when ``u`` is empty/falsy or
        is an inline ``data:image`` URI
    """
    # Inline data URIs are placeholders rather than fetchable image links.
    is_usable = bool(u) and not u.startswith("data:image")
    return u if is_usable else ""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_source(s: str) -> str:
    """Strip Yahoo attribution text from a news source label.

    Args:
        s: Source string

    Returns:
        The label with every " via Yahoo" and " - Yahoo" occurrence removed
        and surrounding whitespace trimmed; a falsy input is returned as is
    """
    if s:
        cleaned = s
        # Removed in this order, one marker after the other.
        for marker in (" via Yahoo", " - Yahoo"):
            cleaned = cleaned.replace(marker, "")
        return cleaned.strip()
    return s
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class YahooNews(YahooSearchEngine[NewsResult]):
    """Yahoo news search engine with advanced filtering.

    Features:
    - Time-based filtering
    - Category filtering
    - Source filtering
    - Pagination support
    - Rich metadata extraction
    """

    name = "yahoo"
    category = "news"

    search_url = "https://news.search.yahoo.com/search"
    search_method = "GET"

    # XPath selectors for news articles
    items_xpath = "//div[contains(@class, 'NewsArticle') or contains(@class, 'dd') and contains(@class, 'algo')]"
    elements_xpath: Mapping[str, str] = {
        "date": ".//span[contains(@class, 'fc-2nd') or contains(@class, 'age') or contains(@class, 's-time')]//text()",
        "title": ".//h4//a//text() | .//h3//a//text()",
        "url": ".//h4//a/@href | .//h3//a/@href",
        "body": ".//p//text() | .//div[contains(@class, 'compText')]//text()",
        "image": ".//img/@src",
        "source": ".//span[contains(@class, 's-source') or contains(@class, 'source')]//text()",
    }

    # Short time-limit code -> value of the ``btf`` query parameter.
    _TIME_FILTERS = {
        "d": "1d",  # Past 24 hours
        "w": "1w",  # Past week
        "m": "1m",  # Past month
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Build news search payload.

        Side effect: ``self.search_url`` is replaced on every call with a
        URL carrying freshly generated ``_ylt`` / ``_ylu`` path parameters,
        so call this before reading ``self.search_url`` for the request.

        Args:
            query: Search query
            region: Region code (accepted for interface parity; not sent)
            safesearch: Safe search level (accepted; not sent)
            timelimit: Time filter (d, w, m); other values are ignored
            page: Page number (1-based)
            **kwargs: Additional parameters; ``category`` and ``sort`` are
                forwarded verbatim when given and not ``None``

        Returns:
            Query parameters dictionary
        """
        # Generate dynamic URL tokens for tracking. The byte counts are
        # chosen so the tokens come out 24 and 47 characters long.
        self.search_url = (
            "https://news.search.yahoo.com/search"
            f";_ylt={token_urlsafe(24 * 3 // 4)}"
            f";_ylu={token_urlsafe(47 * 3 // 4)}"
        )

        payload: dict[str, Any] = {
            "p": query,
            "ei": "UTF-8",
        }

        # Pagination - Yahoo news uses the 'b' (1-based offset) parameter,
        # assuming approximately 10 articles per page.
        if page > 1:
            payload["b"] = f"{(page - 1) * 10 + 1}"

        # Time filter
        if timelimit and timelimit in self._TIME_FILTERS:
            payload["btf"] = self._TIME_FILTERS[timelimit]

        # Additional filters (category; sort by relevance or date). A value
        # of None means "not requested" and is left out of the query.
        for key in ("category", "sort"):
            value = kwargs.get(key)
            if value is not None:
                payload[key] = value

        return payload

    @staticmethod
    def _unwrap_redirect(url: str) -> str:
        """Return the target embedded between ``/RU=`` and ``/RK=`` in *url*.

        URLs without an ``/RU=`` segment are returned unchanged. When
        ``/RK=`` is absent, everything after ``/RU=`` is taken.
        """
        from urllib.parse import unquote

        marker = "/RU="
        position = url.find(marker)
        if position == -1:
            return url

        start = position + len(marker)
        end = url.find("/RK=", start)
        if end == -1:
            end = len(url)
        return unquote(url[start:end])

    def post_extract_results(self, results: list[NewsResult]) -> list[NewsResult]:
        """Post-process news results.

        Cleans the image URL and source label, unwraps redirect links and
        drops entries that lack a title or a URL.

        Args:
            results: Raw extracted results

        Returns:
            Cleaned news results
        """
        cleaned_results = []

        for result in results:
            # Clean image URL
            result.image = extract_image(result.image)

            # Clean source name
            result.source = extract_source(result.source)

            # Extract URL from redirect
            if result.url:
                result.url = self._unwrap_redirect(result.url)

            # Filter out results without essential fields
            if result.title and result.url:
                cleaned_results.append(result)

        return cleaned_results

    def search(
        self,
        query: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        page: int = 1,
        max_results: int | None = None,
        **kwargs: Any,
    ) -> list[NewsResult] | None:
        """Search Yahoo News with pagination.

        Args:
            query: News search query
            region: Region code
            safesearch: Safe search level
            timelimit: Time filter (d, w, m)
            page: Starting page
            max_results: Maximum results to return
            **kwargs: Additional parameters (category, sort) plus
                ``max_pages``, the highest page number to fetch (default 10).
                The starting page is always fetched, even beyond ``max_pages``.

        Returns:
            List of NewsResult objects, or None when nothing was found
        """
        filters = dict(kwargs)
        # ``max_pages`` steers the loop only; it is not a search parameter.
        max_pages = filters.pop("max_pages", 10)
        last_page = max(page, max_pages)

        results: list[NewsResult] = []
        for current_page in range(page, last_page + 1):
            # build_payload() also refreshes self.search_url, so it has to
            # run before the request below reads that attribute.
            payload = self.build_payload(
                query=query,
                region=region,
                safesearch=safesearch,
                timelimit=timelimit,
                page=current_page,
                **filters,
            )

            html_text = self.request(self.search_method, self.search_url, params=payload)
            if not html_text:
                break

            page_results = self.extract_results(self.pre_process_html(html_text))
            if not page_results:
                break

            results.extend(page_results)

            if max_results and len(results) >= max_results:
                break

        results = self.post_extract_results(results)

        if max_results:
            results = results[:max_results]

        return results if results else None

    def run(
        self,
        keywords: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        max_results: int | None = None,
    ) -> list[dict[str, str]]:
        """Run news search and return results as dictionaries.

        Args:
            keywords: Search query.
            region: Region code.
            safesearch: Safe search level.
            timelimit: Time filter.
            max_results: Maximum number of results.

        Returns:
            List of news result dictionaries (empty when nothing was found).
        """
        results = self.search(
            query=keywords,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
        )
        if results is None:
            return []
        return [result.to_dict() for result in results]
|