webscout 2025.10.11__py3-none-any.whl → 2025.10.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (47)
  1. webscout/Provider/Andi.py +1 -1
  2. webscout/Provider/ChatGPTClone.py +2 -1
  3. webscout/__init__.py +1 -4
  4. webscout/auth/routes.py +2 -3
  5. webscout/cli.py +1 -1
  6. webscout/search/__init__.py +51 -0
  7. webscout/search/base.py +195 -0
  8. webscout/search/duckduckgo_main.py +54 -0
  9. webscout/search/engines/__init__.py +48 -0
  10. webscout/search/engines/bing.py +84 -0
  11. webscout/search/engines/bing_news.py +52 -0
  12. webscout/search/engines/brave.py +43 -0
  13. webscout/search/engines/duckduckgo/__init__.py +25 -0
  14. webscout/search/engines/duckduckgo/answers.py +78 -0
  15. webscout/search/engines/duckduckgo/base.py +187 -0
  16. webscout/search/engines/duckduckgo/images.py +97 -0
  17. webscout/search/engines/duckduckgo/maps.py +168 -0
  18. webscout/search/engines/duckduckgo/news.py +68 -0
  19. webscout/search/engines/duckduckgo/suggestions.py +21 -0
  20. webscout/search/engines/duckduckgo/text.py +211 -0
  21. webscout/search/engines/duckduckgo/translate.py +47 -0
  22. webscout/search/engines/duckduckgo/videos.py +63 -0
  23. webscout/search/engines/duckduckgo/weather.py +74 -0
  24. webscout/search/engines/mojeek.py +37 -0
  25. webscout/search/engines/wikipedia.py +56 -0
  26. webscout/search/engines/yahoo.py +65 -0
  27. webscout/search/engines/yahoo_news.py +64 -0
  28. webscout/search/engines/yandex.py +43 -0
  29. webscout/search/engines/yep/__init__.py +13 -0
  30. webscout/search/engines/yep/base.py +32 -0
  31. webscout/search/engines/yep/images.py +99 -0
  32. webscout/search/engines/yep/suggestions.py +35 -0
  33. webscout/search/engines/yep/text.py +114 -0
  34. webscout/search/http_client.py +156 -0
  35. webscout/search/results.py +137 -0
  36. webscout/search/yep_main.py +44 -0
  37. webscout/version.py +1 -1
  38. webscout/version.py.bak +2 -0
  39. {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/METADATA +3 -4
  40. {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/RECORD +44 -15
  41. webscout/webscout_search.py +0 -1183
  42. webscout/webscout_search_async.py +0 -649
  43. webscout/yep_search.py +0 -346
  44. {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/WHEEL +0 -0
  45. {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/entry_points.txt +0 -0
  46. {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/licenses/LICENSE.md +0 -0
  47. {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/top_level.txt +0 -0
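
Note on the restructure: the three monolithic modules near the bottom of the list (webscout_search.py, webscout_search_async.py, and yep_search.py, roughly 2,200 deleted lines combined) are replaced by the new webscout/search package, one module per engine. A minimal import sketch against the new layout; webscout/search/__init__.py (51 added lines) is not shown in this section, so the fully qualified module paths visible in the diff are used rather than any shorter re-exports it may define:

    # Hedged sketch: these module paths appear in the file list above; any
    # convenience re-exports in webscout.search itself are not shown here.
    from webscout.search.engines.duckduckgo.text import DuckDuckGoTextSearch
    from webscout.search.engines.mojeek import Mojeek
    from webscout.search.engines.yep import YepTextSearch
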
webscout/search/engines/duckduckgo/text.py
@@ -0,0 +1,211 @@
+ """DuckDuckGo text search."""
+
+ from __future__ import annotations
+
+ import warnings
+ from random import shuffle
+
+ from ....exceptions import WebscoutE
+ from .base import DuckDuckGoBase
+
+
+ class DuckDuckGoTextSearch(DuckDuckGoBase):
+     """DuckDuckGo text/web search."""
+
+     def run(self, *args, **kwargs) -> list[dict[str, str]]:
+         """Perform text search on DuckDuckGo.
+
+         Args:
+             keywords: Search query.
+             region: Region code (e.g., wt-wt, us-en).
+             safesearch: on, moderate, or off.
+             timelimit: d, w, m, or y.
+             backend: html, lite, or auto.
+             max_results: Maximum number of results.
+
+         Returns:
+             List of search result dictionaries.
+         """
+         keywords = args[0] if args else kwargs.get("keywords")
+         region = args[1] if len(args) > 1 else kwargs.get("region", "wt-wt")
+         safesearch = args[2] if len(args) > 2 else kwargs.get("safesearch", "moderate")
+         timelimit = args[3] if len(args) > 3 else kwargs.get("timelimit")
+         backend = args[4] if len(args) > 4 else kwargs.get("backend", "auto")
+         max_results = args[5] if len(args) > 5 else kwargs.get("max_results")
+
+         if backend in ("api", "ecosia"):
+             warnings.warn(f"{backend=} is deprecated, using backend='auto'", stacklevel=2)
+             backend = "auto"
+         backends = ["html", "lite"] if backend == "auto" else [backend]
+         shuffle(backends)
+
+         results, err = [], None
+         for b in backends:
+             try:
+                 if b == "html":
+                     results = self._text_html(keywords, region, timelimit, max_results)
+                 elif b == "lite":
+                     results = self._text_lite(keywords, region, timelimit, max_results)
+                 return results
+             except Exception as ex:
+                 err = ex
+
+         raise WebscoutE(err)
+
+     def _text_html(
+         self,
+         keywords: str,
+         region: str = "wt-wt",
+         timelimit: str | None = None,
+         max_results: int | None = None,
+     ) -> list[dict[str, str]]:
+         """Text search using HTML backend."""
+         assert keywords, "keywords is mandatory"
+
+         payload = {
+             "q": keywords,
+             "s": "0",
+             "o": "json",
+             "api": "d.js",
+             "vqd": "",
+             "kl": region,
+             "bing_market": region,
+         }
+         if timelimit:
+             payload["df"] = timelimit
+         if max_results and max_results > 20:
+             vqd = self._get_vqd(keywords)
+             payload["vqd"] = vqd
+
+         cache = set()
+         results: list[dict[str, str]] = []
+
+         def _text_html_page(s: int) -> list[dict[str, str]]:
+             payload["s"] = f"{s}"
+             resp_content = self._get_url("POST", "https://html.duckduckgo.com/html", data=payload).content
+             if b"No results." in resp_content:
+                 return []
+
+             page_results = []
+             tree = self.parser.fromstring(resp_content)
+             elements = tree.xpath("//div[h2]")
+             if not isinstance(elements, list):
+                 return []
+             for e in elements:
+                 if isinstance(e, self.parser.etree.Element):
+                     hrefxpath = e.xpath("./a/@href")
+                     href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, list) else None
+                     if (
+                         href
+                         and href not in cache
+                         and not href.startswith(
+                             ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
+                         )
+                     ):
+                         cache.add(href)
+                         titlexpath = e.xpath("./h2/a/text()")
+                         title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, list) else ""
+                         bodyxpath = e.xpath("./a//text()")
+                         body = "".join(str(x) for x in bodyxpath) if bodyxpath and isinstance(bodyxpath, list) else ""
+                         result = {
+                             "title": self._normalize(title),
+                             "href": self._normalize_url(href),
+                             "body": self._normalize(body),
+                         }
+                         page_results.append(result)
+             return page_results
+
+         slist = [0]
+         if max_results:
+             max_results = min(max_results, 2023)
+             slist.extend(range(23, max_results, 50))
+         try:
+             for r in self._executor.map(_text_html_page, slist):
+                 results.extend(r)
+         except Exception as e:
+             raise e
+
+         return list(self.islice(results, max_results))
+
+     def _text_lite(
+         self,
+         keywords: str,
+         region: str = "wt-wt",
+         timelimit: str | None = None,
+         max_results: int | None = None,
+     ) -> list[dict[str, str]]:
+         """Text search using lite backend."""
+         assert keywords, "keywords is mandatory"
+
+         payload = {
+             "q": keywords,
+             "s": "0",
+             "o": "json",
+             "api": "d.js",
+             "vqd": "",
+             "kl": region,
+             "bing_market": region,
+         }
+         if timelimit:
+             payload["df"] = timelimit
+
+         cache = set()
+         results: list[dict[str, str]] = []
+
+         def _text_lite_page(s: int) -> list[dict[str, str]]:
+             payload["s"] = f"{s}"
+             resp_content = self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=payload).content
+             if b"No more results." in resp_content:
+                 return []
+
+             page_results = []
+             tree = self.parser.fromstring(resp_content)
+             elements = tree.xpath("//table[last()]//tr")
+             if not isinstance(elements, list):
+                 return []
+
+             data = zip(self.cycle(range(1, 5)), elements)
+             for i, e in data:
+                 if isinstance(e, self.parser.etree.Element):
+                     if i == 1:
+                         hrefxpath = e.xpath(".//a//@href")
+                         href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, list) else None
+                         if (
+                             href is None
+                             or href in cache
+                             or href.startswith(
+                                 ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
+                             )
+                         ):
+                             [next(data, None) for _ in range(3)]  # skip block(i=1,2,3,4)
+                         else:
+                             cache.add(href)
+                             titlexpath = e.xpath(".//a//text()")
+                             title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, list) else ""
+                     elif i == 2:
+                         bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
+                         body = (
+                             "".join(str(x) for x in bodyxpath).strip()
+                             if bodyxpath and isinstance(bodyxpath, list)
+                             else ""
+                         )
+                         if href:
+                             result = {
+                                 "title": self._normalize(title),
+                                 "href": self._normalize_url(href),
+                                 "body": self._normalize(body),
+                             }
+                             page_results.append(result)
+             return page_results
+
+         slist = [0]
+         if max_results:
+             max_results = min(max_results, 2023)
+             slist.extend(range(23, max_results, 50))
+         try:
+             for r in self._executor.map(_text_lite_page, slist):
+                 results.extend(r)
+         except Exception as e:
+             raise e
+
+         return list(self.islice(results, max_results))
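
The run() method above accepts positional or keyword arguments and falls back from one backend to the other on failure. A short usage sketch; the constructor comes from DuckDuckGoBase in base.py, which this section does not show, so no-argument construction is an assumption:

    # Hedged sketch: no-arg construction assumed (DuckDuckGoBase not shown here).
    from webscout.search.engines.duckduckgo.text import DuckDuckGoTextSearch

    searcher = DuckDuckGoTextSearch()
    results = searcher.run("python packaging", region="us-en",
                           safesearch="moderate", backend="auto", max_results=10)
    for r in results:
        print(r["title"], r["href"])  # each dict carries title, href, and body keys
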
webscout/search/engines/duckduckgo/translate.py
@@ -0,0 +1,47 @@
+ from __future__ import annotations
+
+ from ....exceptions import WebscoutE
+ from .base import DuckDuckGoBase
+
+
+ class DuckDuckGoTranslate(DuckDuckGoBase):
+     def run(self, *args, **kwargs) -> list[dict[str, str]]:
+         keywords = args[0] if args else kwargs.get("keywords")
+         from_ = args[1] if len(args) > 1 else kwargs.get("from_")
+         to = args[2] if len(args) > 2 else kwargs.get("to", "en")
+
+         assert keywords, "keywords is mandatory"
+
+         vqd = self._get_vqd("translate")
+
+         payload = {
+             "vqd": vqd,
+             "query": "translate",
+             "to": to,
+         }
+         if from_:
+             payload["from"] = from_
+
+         def _translate_keyword(keyword: str) -> dict[str, str]:
+             resp_content = self._get_url(
+                 "POST",
+                 "https://duckduckgo.com/translation.js",
+                 params=payload,
+                 content=keyword.encode(),
+             ).content
+             page_data: dict[str, str] = self.json_loads(resp_content)
+             page_data["original"] = keyword
+             return page_data
+
+         if isinstance(keywords, str):
+             keywords = [keywords]
+
+         results = []
+         try:
+             for r in self._executor.map(_translate_keyword, keywords):
+                 results.append(r)
+         except Exception as e:
+             raise e
+
+         return results
+
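
run() accepts a single keyword or a list and fans the requests out over the executor. A sketch under the same no-arg constructor assumption as above:

    from webscout.search.engines.duckduckgo.translate import DuckDuckGoTranslate

    translator = DuckDuckGoTranslate()  # assumed no-arg construction
    rows = translator.run(["bonjour", "merci"], to="en")
    # each row is DuckDuckGo's translation JSON plus the "original" key added in run()
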
webscout/search/engines/duckduckgo/videos.py
@@ -0,0 +1,63 @@
+ from __future__ import annotations
+
+ from ....exceptions import WebscoutE
+ from .base import DuckDuckGoBase
+
+
+ class DuckDuckGoVideos(DuckDuckGoBase):
+     def run(self, *args, **kwargs) -> list[dict[str, str]]:
+         keywords = args[0] if args else kwargs.get("keywords")
+         region = args[1] if len(args) > 1 else kwargs.get("region", "wt-wt")
+         safesearch = args[2] if len(args) > 2 else kwargs.get("safesearch", "moderate")
+         timelimit = args[3] if len(args) > 3 else kwargs.get("timelimit")
+         resolution = args[4] if len(args) > 4 else kwargs.get("resolution")
+         duration = args[5] if len(args) > 5 else kwargs.get("duration")
+         license_videos = args[6] if len(args) > 6 else kwargs.get("license_videos")
+         max_results = args[7] if len(args) > 7 else kwargs.get("max_results")
+
+         assert keywords, "keywords is mandatory"
+
+         vqd = self._get_vqd(keywords)
+
+         safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
+         timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
+         resolution = f"videoDefinition:{resolution}" if resolution else ""
+         duration = f"videoDuration:{duration}" if duration else ""
+         license_videos = f"videoLicense:{license_videos}" if license_videos else ""
+         payload = {
+             "l": region,
+             "o": "json",
+             "q": keywords,
+             "vqd": vqd,
+             "f": f"{timelimit},{resolution},{duration},{license_videos}",
+             "p": safesearch_base[safesearch.lower()],
+         }
+
+         cache = set()
+         results: list[dict[str, str]] = []
+
+         def _videos_page(s: int) -> list[dict[str, str]]:
+             payload["s"] = f"{s}"
+             resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=payload).content
+             resp_json = self.json_loads(resp_content)
+
+             page_data = resp_json.get("results", [])
+             page_results = []
+             for row in page_data:
+                 if row["content"] not in cache:
+                     cache.add(row["content"])
+                     page_results.append(row)
+             return page_results
+
+         slist = [0]
+         if max_results:
+             max_results = min(max_results, 400)
+             slist.extend(range(60, max_results, 60))
+         try:
+             for r in self._executor.map(_videos_page, slist):
+                 results.extend(r)
+         except Exception as e:
+             raise e
+
+         return list(self.islice(results, max_results))
+
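
The optional filters are concatenated into the single f parameter of the v.js call. A sketch; filter values such as "high" or "medium" are passed through to DuckDuckGo unvalidated, so the accepted values here are assumptions:

    from webscout.search.engines.duckduckgo.videos import DuckDuckGoVideos

    videos = DuckDuckGoVideos()  # assumed no-arg construction
    hits = videos.run("rust tutorial", resolution="high", duration="medium", max_results=30)
    # for this call, payload["f"] becomes ",videoDefinition:high,videoDuration:medium,"
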
webscout/search/engines/duckduckgo/weather.py
@@ -0,0 +1,74 @@
+ from __future__ import annotations
+
+ import json
+ from datetime import datetime
+ from typing import Any
+ from urllib.parse import quote
+
+ from ....exceptions import WebscoutE
+ from .base import DuckDuckGoBase
+
+
+ class DuckDuckGoWeather(DuckDuckGoBase):
+     def run(self, *args, **kwargs) -> dict[str, Any]:
+         location = args[0] if args else kwargs.get("location")
+         language = args[1] if len(args) > 1 else kwargs.get("language", "en")
+
+         assert location, "location is mandatory"
+         lang = language.split('-')[0]
+         url = f"https://duckduckgo.com/js/spice/forecast/{quote(location)}/{lang}"
+
+         resp = self._get_url("GET", url).content
+         resp_text = resp.decode('utf-8')
+
+         if "ddg_spice_forecast(" not in resp_text:
+             raise WebscoutE(f"No weather data found for {location}")
+
+         json_text = resp_text[resp_text.find('(') + 1:resp_text.rfind(')')]
+         try:
+             result = json.loads(json_text)
+         except Exception as e:
+             raise WebscoutE(f"Error parsing weather JSON: {e}")
+
+         if not result or 'currentWeather' not in result or 'forecastDaily' not in result:
+             raise WebscoutE(f"Invalid weather data format for {location}")
+
+         formatted_data = {
+             "location": result["currentWeather"]["metadata"].get("ddg-location", "Unknown"),
+             "current": {
+                 "condition": result["currentWeather"].get("conditionCode"),
+                 "temperature_c": result["currentWeather"].get("temperature"),
+                 "feels_like_c": result["currentWeather"].get("temperatureApparent"),
+                 "humidity": result["currentWeather"].get("humidity"),
+                 "wind_speed_ms": result["currentWeather"].get("windSpeed"),
+                 "wind_direction": result["currentWeather"].get("windDirection"),
+                 "visibility_m": result["currentWeather"].get("visibility"),
+             },
+             "daily_forecast": [],
+             "hourly_forecast": []
+         }
+
+         for day in result["forecastDaily"]["days"]:
+             formatted_data["daily_forecast"].append({
+                 "date": datetime.fromisoformat(day["forecastStart"].replace("Z", "+00:00")).strftime("%Y-%m-%d"),
+                 "condition": day["daytimeForecast"].get("conditionCode"),
+                 "max_temp_c": day["temperatureMax"],
+                 "min_temp_c": day["temperatureMin"],
+                 "sunrise": datetime.fromisoformat(day["sunrise"].replace("Z", "+00:00")).strftime("%H:%M"),
+                 "sunset": datetime.fromisoformat(day["sunset"].replace("Z", "+00:00")).strftime("%H:%M"),
+             })
+
+         if 'forecastHourly' in result and 'hours' in result['forecastHourly']:
+             for hour in result['forecastHourly']['hours']:
+                 formatted_data["hourly_forecast"].append({
+                     "time": datetime.fromisoformat(hour["forecastStart"].replace("Z", "+00:00")).strftime("%H:%M"),
+                     "condition": hour.get("conditionCode"),
+                     "temperature_c": hour.get("temperature"),
+                     "feels_like_c": hour.get("temperatureApparent"),
+                     "humidity": hour.get("humidity"),
+                     "wind_speed_ms": hour.get("windSpeed"),
+                     "wind_direction": hour.get("windDirection"),
+                     "visibility_m": hour.get("visibility"),
+                 })
+
+         return formatted_data
+
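
run() unwraps the ddg_spice_forecast(...) JSONP wrapper and reshapes it into the dict built above. A sketch of reading the result, under the usual no-arg constructor assumption:

    from webscout.search.engines.duckduckgo.weather import DuckDuckGoWeather

    weather = DuckDuckGoWeather()  # assumed no-arg construction
    report = weather.run("Berlin", language="en")
    print(report["location"], report["current"]["temperature_c"])
    print(report["daily_forecast"][0]["max_temp_c"])  # first entry of the daily forecast
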
webscout/search/engines/mojeek.py
@@ -0,0 +1,37 @@
+ """Mojeek search engine implementation."""
+
+ from __future__ import annotations
+
+ from collections.abc import Mapping
+ from typing import Any
+
+ from ..base import BaseSearchEngine
+ from ..results import TextResult
+
+
+ class Mojeek(BaseSearchEngine[TextResult]):
+     """Mojeek search engine."""
+
+     name = "mojeek"
+     category = "text"
+     provider = "mojeek"
+
+     search_url = "https://www.mojeek.com/search"
+     search_method = "GET"
+
+     items_xpath = "//ul[contains(@class, 'results')]/li"
+     elements_xpath: Mapping[str, str] = {
+         "title": ".//h2//text()",
+         "href": ".//h2/a/@href",
+         "body": ".//p[@class='s']//text()",
+     }
+
+     def build_payload(
+         self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
+     ) -> dict[str, Any]:
+         """Build a payload for the search request."""
+         safesearch_base = {"on": "1", "moderate": "0", "off": "0"}
+         payload = {"q": query, "safe": safesearch_base[safesearch.lower()]}
+         if page > 1:
+             payload["s"] = f"{(page - 1) * 10 + 1}"
+         return payload
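
Mojeek illustrates the new declarative engine style: items_xpath and elements_xpath drive extraction in the base class, and only payload construction is code. The pagination offset is deterministic from build_payload (instantiation details live in BaseSearchEngine, not shown in this section):

    engine = Mojeek()  # assumed no-arg construction
    engine.build_payload("open source", region="us-en", safesearch="moderate",
                         timelimit=None, page=2)
    # -> {"q": "open source", "safe": "0", "s": "11"}, since s = (page - 1) * 10 + 1
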
webscout/search/engines/wikipedia.py
@@ -0,0 +1,56 @@
+ """Wikipedia text search engine."""
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Any
+ from urllib.parse import quote
+
+ from ..base import BaseSearchEngine
+ from ..results import TextResult
+ from ...utils import json_loads
+
+ logger = logging.getLogger(__name__)
+
+
+ class Wikipedia(BaseSearchEngine[TextResult]):
+     """Wikipedia text search engine."""
+
+     name = "wikipedia"
+     category = "text"
+     provider = "wikipedia"
+     priority = 2
+
+     search_url = "https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={query}"
+     search_method = "GET"
+
+     def build_payload(
+         self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
+     ) -> dict[str, Any]:
+         """Build a payload for the search request."""
+         _country, lang = region.lower().split("-")
+         encoded_query = quote(query)
+         self.search_url = (
+             f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&profile=fuzzy&limit=1&search={encoded_query}"
+         )
+         payload: dict[str, Any] = {}
+         self.lang = lang  # used in extract_results
+         return payload
+
+     def extract_results(self, html_text: str) -> list[TextResult]:
+         """Extract search results from html text."""
+         json_data = json_loads(html_text)
+         if not json_data or len(json_data) < 4:
+             return []
+
+         results = []
+         titles, descriptions, urls = json_data[1], json_data[2], json_data[3]
+
+         for title, description, url in zip(titles, descriptions, urls):
+             result = TextResult()
+             result.title = title
+             result.body = description
+             result.href = url
+             results.append(result)
+
+         return results
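
extract_results() consumes the MediaWiki opensearch response, which is a four-element JSON array. An illustrative payload showing what the indexing above maps to:

    # Illustrative opensearch response shape (descriptions are often empty
    # strings in current MediaWiki responses):
    sample = [
        "python",
        ["Python (programming language)"],
        [""],
        ["https://en.wikipedia.org/wiki/Python_(programming_language)"],
    ]
    # json_data[1] -> titles, json_data[2] -> descriptions, json_data[3] -> urls
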
webscout/search/engines/yahoo.py
@@ -0,0 +1,65 @@
+ """Yahoo search engine."""
+
+ from __future__ import annotations
+
+ from collections.abc import Mapping
+ from secrets import token_urlsafe
+ from typing import Any
+ from urllib.parse import unquote_plus
+
+ from ..base import BaseSearchEngine
+ from ..results import TextResult
+
+
+ def extract_url(u: str) -> str:
+     """Sanitize url."""
+     if "/RU=" in u:
+         start = u.find("/RU=") + 4
+         end = u.find("/RK=", start)
+         if end == -1:
+             end = len(u)
+         return unquote_plus(u[start:end])
+     return u
+
+
+ class Yahoo(BaseSearchEngine[TextResult]):
+     """Yahoo search engine."""
+
+     name = "yahoo"
+     category = "text"
+     provider = "bing"
+
+     search_url = "https://search.yahoo.com/search"
+     search_method = "GET"
+
+     items_xpath = "//div[contains(@class, 'relsrch')]"
+     elements_xpath: Mapping[str, str] = {
+         "title": ".//div[contains(@class, 'Title')]//h3//text()",
+         "href": ".//div[contains(@class, 'Title')]//a/@href",
+         "body": ".//div[contains(@class, 'Text')]//text()",
+     }
+
+     def build_payload(
+         self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
+     ) -> dict[str, Any]:
+         """Build a payload for the search request."""
+         self.search_url = (
+             f"https://search.yahoo.com/search;_ylt={token_urlsafe(24 * 3 // 4)};_ylu={token_urlsafe(47 * 3 // 4)}"
+         )
+         payload = {"p": query}
+         if page > 1:
+             payload["b"] = f"{(page - 1) * 7 + 1}"
+         if timelimit:
+             payload["btf"] = timelimit
+         return payload
+
+     def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
+         """Post-process search results."""
+         post_results = []
+         for result in results:
+             if result.href.startswith("https://www.bing.com/aclick?"):
+                 continue
+             if "/RU=" in result.href:
+                 result.href = extract_url(result.href)
+             post_results.append(result)
+         return post_results
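
extract_url() unwraps Yahoo's r.search.yahoo.com redirect links, which embed the target between /RU= and /RK=. Since it is a pure module-level function, this example is directly runnable (the wrapped URL itself is a made-up illustration):

    from webscout.search.engines.yahoo import extract_url

    wrapped = "https://r.search.yahoo.com/_ylt=abc/RU=https%3A%2F%2Fexample.com%2Fpage/RK=2/RS=xyz"
    print(extract_url(wrapped))  # https://example.com/page
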
webscout/search/engines/yahoo_news.py
@@ -0,0 +1,64 @@
+ """Yahoo news search engine."""
+
+ from __future__ import annotations
+
+ from collections.abc import Mapping
+ from secrets import token_urlsafe
+ from typing import Any
+
+ from ..base import BaseSearchEngine
+ from ..results import NewsResult
+
+
+ def extract_image(u: str) -> str:
+     """Sanitize image url."""
+     if u and u.startswith("data:image"):
+         return ""
+     return u
+
+
+ def extract_source(s: str) -> str:
+     """Remove ' via Yahoo' from string."""
+     return s.replace(" via Yahoo", "") if s else s
+
+
+ class YahooNews(BaseSearchEngine[NewsResult]):
+     """Yahoo news search engine."""
+
+     name = "yahoo"
+     category = "news"
+     provider = "bing"
+
+     search_url = "https://news.search.yahoo.com/search"
+     search_method = "GET"
+
+     items_xpath = "//div[contains(@class, 'NewsArticle')]"
+     elements_xpath: Mapping[str, str] = {
+         "date": ".//span[contains(@class, 'fc-2nd')]//text()",
+         "title": ".//h4//a//text()",
+         "url": ".//h4//a/@href",
+         "body": ".//p//text()",
+         "image": ".//img/@src",
+         "source": ".//span[contains(@class, 's-source')]//text()",
+     }
+
+     def build_payload(
+         self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
+     ) -> dict[str, Any]:
+         """Build a payload for the search request."""
+         self.search_url = (
+             f"https://news.search.yahoo.com/search;_ylt={token_urlsafe(24 * 3 // 4)};_ylu={token_urlsafe(47 * 3 // 4)}"
+         )
+         payload = {"p": query}
+         if page > 1:
+             payload["b"] = f"{(page - 1) * 10 + 1}"
+         if timelimit:
+             payload["btf"] = timelimit
+         return payload
+
+     def post_extract_results(self, results: list[NewsResult]) -> list[NewsResult]:
+         """Post-process search results."""
+         for result in results:
+             result.image = extract_image(result.image)
+             result.source = extract_source(result.source)
+         return results
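
The two helpers are pure functions, so their behavior is easy to pin down (example URLs are illustrative):

    from webscout.search.engines.yahoo_news import extract_image, extract_source

    extract_image("data:image/png;base64,AAAA")   # -> "" (inline placeholders dropped)
    extract_image("https://s.yimg.com/news.jpg")  # -> returned unchanged
    extract_source("Reuters via Yahoo")           # -> "Reuters"
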
webscout/search/engines/yandex.py
@@ -0,0 +1,43 @@
+ """Yandex search engine."""
+
+ from __future__ import annotations
+
+ from collections.abc import Mapping
+ from random import SystemRandom
+ from typing import Any
+
+ from ..base import BaseSearchEngine
+ from ..results import TextResult
+
+ random = SystemRandom()
+
+
+ class Yandex(BaseSearchEngine[TextResult]):
+     """Yandex search engine."""
+
+     name = "yandex"
+     category = "text"
+     provider = "yandex"
+
+     search_url = "https://yandex.com/search/"
+     search_method = "GET"
+
+     items_xpath = "//li[contains(@class, 'serp-item')]"
+     elements_xpath: Mapping[str, str] = {
+         "title": ".//h2//text()",
+         "href": ".//h2/a/@href",
+         "body": ".//div[contains(@class, 'text-container')]//text()",
+     }
+
+     def build_payload(
+         self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
+     ) -> dict[str, Any]:
+         """Build a payload for the search request."""
+         safesearch_base = {"on": "1", "moderate": "0", "off": "0"}
+         payload = {
+             "text": query,
+             "family": safesearch_base[safesearch.lower()],
+         }
+         if page > 1:
+             payload["p"] = str(page - 1)
+         return payload
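
Yandex result pages are zero-based, hence p = page - 1. A payload sketch under the usual no-arg constructor assumption:

    engine = Yandex()  # assumed no-arg construction
    engine.build_payload("quantum computing", region="ru-ru", safesearch="off",
                         timelimit=None, page=3)
    # -> {"text": "quantum computing", "family": "0", "p": "2"}
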
webscout/search/engines/yep/__init__.py
@@ -0,0 +1,13 @@
+ """Yep search engines package."""
+
+ from .base import YepBase
+ from .images import YepImages
+ from .suggestions import YepSuggestions
+ from .text import YepSearch as YepTextSearch
+
+ __all__ = [
+     "YepBase",
+     "YepTextSearch",
+     "YepImages",
+     "YepSuggestions",
+ ]
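
The package re-exports YepSearch under the YepTextSearch alias, so callers import everything from one place:

    from webscout.search.engines.yep import YepImages, YepSuggestions, YepTextSearch
    # YepTextSearch is the alias for yep.text.YepSearch declared in the __init__ above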