webscout-2025.10.11-py3-none-any.whl → webscout-2025.10.13-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
Potentially problematic release.
- webscout/Provider/Andi.py +1 -1
- webscout/Provider/ChatGPTClone.py +2 -1
- webscout/__init__.py +1 -4
- webscout/auth/routes.py +2 -3
- webscout/cli.py +1 -1
- webscout/search/__init__.py +51 -0
- webscout/search/base.py +195 -0
- webscout/search/duckduckgo_main.py +54 -0
- webscout/search/engines/__init__.py +48 -0
- webscout/search/engines/bing.py +84 -0
- webscout/search/engines/bing_news.py +52 -0
- webscout/search/engines/brave.py +43 -0
- webscout/search/engines/duckduckgo/__init__.py +25 -0
- webscout/search/engines/duckduckgo/answers.py +78 -0
- webscout/search/engines/duckduckgo/base.py +187 -0
- webscout/search/engines/duckduckgo/images.py +97 -0
- webscout/search/engines/duckduckgo/maps.py +168 -0
- webscout/search/engines/duckduckgo/news.py +68 -0
- webscout/search/engines/duckduckgo/suggestions.py +21 -0
- webscout/search/engines/duckduckgo/text.py +211 -0
- webscout/search/engines/duckduckgo/translate.py +47 -0
- webscout/search/engines/duckduckgo/videos.py +63 -0
- webscout/search/engines/duckduckgo/weather.py +74 -0
- webscout/search/engines/mojeek.py +37 -0
- webscout/search/engines/wikipedia.py +56 -0
- webscout/search/engines/yahoo.py +65 -0
- webscout/search/engines/yahoo_news.py +64 -0
- webscout/search/engines/yandex.py +43 -0
- webscout/search/engines/yep/__init__.py +13 -0
- webscout/search/engines/yep/base.py +32 -0
- webscout/search/engines/yep/images.py +99 -0
- webscout/search/engines/yep/suggestions.py +35 -0
- webscout/search/engines/yep/text.py +114 -0
- webscout/search/http_client.py +156 -0
- webscout/search/results.py +137 -0
- webscout/search/yep_main.py +44 -0
- webscout/version.py +1 -1
- webscout/version.py.bak +2 -0
- {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/METADATA +3 -4
- {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/RECORD +44 -15
- webscout/webscout_search.py +0 -1183
- webscout/webscout_search_async.py +0 -649
- webscout/yep_search.py +0 -346
- {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/WHEEL +0 -0
- {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/entry_points.txt +0 -0
- {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-2025.10.11.dist-info → webscout-2025.10.13.dist-info}/top_level.txt +0 -0
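
The headline change is structural: the monolithic webscout_search.py (1,183 lines), its async twin webscout_search_async.py, and yep_search.py are removed, and a new webscout/search/ package takes their place, with one module per engine plus duckduckgo_main.py and yep_main.py shims. A hypothetical import sketch of the new layout (the module paths come from the file list above; whether the subpackage __init__.py files re-export these exact names is an assumption, though both classes are defined in the hunks below):

from webscout.search.engines.duckduckgo import DuckDuckGoTextSearch  # defined in text.py below
from webscout.search.engines.yep import YepTextSearch  # re-exported in yep/__init__.py below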
webscout/search/engines/duckduckgo/text.py
@@ -0,0 +1,211 @@
+"""DuckDuckGo text search."""
+
+from __future__ import annotations
+
+import warnings
+from random import shuffle
+
+from ....exceptions import WebscoutE
+from .base import DuckDuckGoBase
+
+
+class DuckDuckGoTextSearch(DuckDuckGoBase):
+    """DuckDuckGo text/web search."""
+
+    def run(self, *args, **kwargs) -> list[dict[str, str]]:
+        """Perform text search on DuckDuckGo.
+
+        Args:
+            keywords: Search query.
+            region: Region code (e.g., wt-wt, us-en).
+            safesearch: on, moderate, or off.
+            timelimit: d, w, m, or y.
+            backend: html, lite, or auto.
+            max_results: Maximum number of results.
+
+        Returns:
+            List of search result dictionaries.
+        """
+        keywords = args[0] if args else kwargs.get("keywords")
+        region = args[1] if len(args) > 1 else kwargs.get("region", "wt-wt")
+        safesearch = args[2] if len(args) > 2 else kwargs.get("safesearch", "moderate")
+        timelimit = args[3] if len(args) > 3 else kwargs.get("timelimit")
+        backend = args[4] if len(args) > 4 else kwargs.get("backend", "auto")
+        max_results = args[5] if len(args) > 5 else kwargs.get("max_results")
+
+        if backend in ("api", "ecosia"):
+            warnings.warn(f"{backend=} is deprecated, using backend='auto'", stacklevel=2)
+            backend = "auto"
+        backends = ["html", "lite"] if backend == "auto" else [backend]
+        shuffle(backends)
+
+        results, err = [], None
+        for b in backends:
+            try:
+                if b == "html":
+                    results = self._text_html(keywords, region, timelimit, max_results)
+                elif b == "lite":
+                    results = self._text_lite(keywords, region, timelimit, max_results)
+                return results
+            except Exception as ex:
+                err = ex
+
+        raise WebscoutE(err)
+
+    def _text_html(
+        self,
+        keywords: str,
+        region: str = "wt-wt",
+        timelimit: str | None = None,
+        max_results: int | None = None,
+    ) -> list[dict[str, str]]:
+        """Text search using HTML backend."""
+        assert keywords, "keywords is mandatory"
+
+        payload = {
+            "q": keywords,
+            "s": "0",
+            "o": "json",
+            "api": "d.js",
+            "vqd": "",
+            "kl": region,
+            "bing_market": region,
+        }
+        if timelimit:
+            payload["df"] = timelimit
+        if max_results and max_results > 20:
+            vqd = self._get_vqd(keywords)
+            payload["vqd"] = vqd
+
+        cache = set()
+        results: list[dict[str, str]] = []
+
+        def _text_html_page(s: int) -> list[dict[str, str]]:
+            payload["s"] = f"{s}"
+            resp_content = self._get_url("POST", "https://html.duckduckgo.com/html", data=payload).content
+            if b"No results." in resp_content:
+                return []
+
+            page_results = []
+            tree = self.parser.fromstring(resp_content)
+            elements = tree.xpath("//div[h2]")
+            if not isinstance(elements, list):
+                return []
+            for e in elements:
+                if isinstance(e, self.parser.etree.Element):
+                    hrefxpath = e.xpath("./a/@href")
+                    href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, list) else None
+                    if (
+                        href
+                        and href not in cache
+                        and not href.startswith(
+                            ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
+                        )
+                    ):
+                        cache.add(href)
+                        titlexpath = e.xpath("./h2/a/text()")
+                        title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, list) else ""
+                        bodyxpath = e.xpath("./a//text()")
+                        body = "".join(str(x) for x in bodyxpath) if bodyxpath and isinstance(bodyxpath, list) else ""
+                        result = {
+                            "title": self._normalize(title),
+                            "href": self._normalize_url(href),
+                            "body": self._normalize(body),
+                        }
+                        page_results.append(result)
+            return page_results
+
+        slist = [0]
+        if max_results:
+            max_results = min(max_results, 2023)
+            slist.extend(range(23, max_results, 50))
+        try:
+            for r in self._executor.map(_text_html_page, slist):
+                results.extend(r)
+        except Exception as e:
+            raise e
+
+        return list(self.islice(results, max_results))
+
+    def _text_lite(
+        self,
+        keywords: str,
+        region: str = "wt-wt",
+        timelimit: str | None = None,
+        max_results: int | None = None,
+    ) -> list[dict[str, str]]:
+        """Text search using lite backend."""
+        assert keywords, "keywords is mandatory"
+
+        payload = {
+            "q": keywords,
+            "s": "0",
+            "o": "json",
+            "api": "d.js",
+            "vqd": "",
+            "kl": region,
+            "bing_market": region,
+        }
+        if timelimit:
+            payload["df"] = timelimit
+
+        cache = set()
+        results: list[dict[str, str]] = []
+
+        def _text_lite_page(s: int) -> list[dict[str, str]]:
+            payload["s"] = f"{s}"
+            resp_content = self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=payload).content
+            if b"No more results." in resp_content:
+                return []
+
+            page_results = []
+            tree = self.parser.fromstring(resp_content)
+            elements = tree.xpath("//table[last()]//tr")
+            if not isinstance(elements, list):
+                return []
+
+            data = zip(self.cycle(range(1, 5)), elements)
+            for i, e in data:
+                if isinstance(e, self.parser.etree.Element):
+                    if i == 1:
+                        hrefxpath = e.xpath(".//a//@href")
+                        href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, list) else None
+                        if (
+                            href is None
+                            or href in cache
+                            or href.startswith(
+                                ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
+                            )
+                        ):
+                            [next(data, None) for _ in range(3)]  # skip block(i=1,2,3,4)
+                        else:
+                            cache.add(href)
+                            titlexpath = e.xpath(".//a//text()")
+                            title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, list) else ""
+                    elif i == 2:
+                        bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
+                        body = (
+                            "".join(str(x) for x in bodyxpath).strip()
+                            if bodyxpath and isinstance(bodyxpath, list)
+                            else ""
+                        )
+                        if href:
+                            result = {
+                                "title": self._normalize(title),
+                                "href": self._normalize_url(href),
+                                "body": self._normalize(body),
+                            }
+                            page_results.append(result)
+            return page_results
+
+        slist = [0]
+        if max_results:
+            max_results = min(max_results, 2023)
+            slist.extend(range(23, max_results, 50))
+        try:
+            for r in self._executor.map(_text_lite_page, slist):
+                results.extend(r)
+        except Exception as e:
+            raise e
+
+        return list(self.islice(results, max_results))
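
Both backends share the same paging schedule: the first request uses offset 0, later requests start at 23 and step by 50, and results are hard-capped at 2023. Because the offsets are handed to self._executor.map, the pages are fetched concurrently and then truncated with islice. A minimal standalone reproduction of the offset arithmetic (the function name is local to this sketch):

def ddg_text_offsets(max_results):
    # Mirrors the slist computation in _text_html / _text_lite above.
    slist = [0]
    if max_results:
        max_results = min(max_results, 2023)  # hard cap in both backends
        slist.extend(range(23, max_results, 50))
    return slist

print(ddg_text_offsets(200))   # [0, 23, 73, 123, 173]
print(ddg_text_offsets(None))  # [0] -- a single page when no limit is set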
webscout/search/engines/duckduckgo/translate.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from ....exceptions import WebscoutE
+from .base import DuckDuckGoBase
+
+
+class DuckDuckGoTranslate(DuckDuckGoBase):
+    def run(self, *args, **kwargs) -> list[dict[str, str]]:
+        keywords = args[0] if args else kwargs.get("keywords")
+        from_ = args[1] if len(args) > 1 else kwargs.get("from_")
+        to = args[2] if len(args) > 2 else kwargs.get("to", "en")
+
+        assert keywords, "keywords is mandatory"
+
+        vqd = self._get_vqd("translate")
+
+        payload = {
+            "vqd": vqd,
+            "query": "translate",
+            "to": to,
+        }
+        if from_:
+            payload["from"] = from_
+
+        def _translate_keyword(keyword: str) -> dict[str, str]:
+            resp_content = self._get_url(
+                "POST",
+                "https://duckduckgo.com/translation.js",
+                params=payload,
+                content=keyword.encode(),
+            ).content
+            page_data: dict[str, str] = self.json_loads(resp_content)
+            page_data["original"] = keyword
+            return page_data
+
+        if isinstance(keywords, str):
+            keywords = [keywords]
+
+        results = []
+        try:
+            for r in self._executor.map(_translate_keyword, keywords):
+                results.append(r)
+        except Exception as e:
+            raise e
+
+        return results
+
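
Each keyword is POSTed as the raw request body to duckduckgo.com/translation.js, with the vqd token and target language in the query string, and the engine tags the parsed JSON with the original keyword. A sketch of the resulting record (the detected_language and translated fields are assumptions about the translation.js response, not shown in this diff; only the original key is added by this code):

# Hypothetical record shape for run("hello", to="es"):
page_data = {"detected_language": "en", "translated": "hola"}  # assumed JSON body
page_data["original"] = "hello"                                # added by _translate_keyword
# -> {"detected_language": "en", "translated": "hola", "original": "hello"}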
webscout/search/engines/duckduckgo/videos.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+from ....exceptions import WebscoutE
+from .base import DuckDuckGoBase
+
+
+class DuckDuckGoVideos(DuckDuckGoBase):
+    def run(self, *args, **kwargs) -> list[dict[str, str]]:
+        keywords = args[0] if args else kwargs.get("keywords")
+        region = args[1] if len(args) > 1 else kwargs.get("region", "wt-wt")
+        safesearch = args[2] if len(args) > 2 else kwargs.get("safesearch", "moderate")
+        timelimit = args[3] if len(args) > 3 else kwargs.get("timelimit")
+        resolution = args[4] if len(args) > 4 else kwargs.get("resolution")
+        duration = args[5] if len(args) > 5 else kwargs.get("duration")
+        license_videos = args[6] if len(args) > 6 else kwargs.get("license_videos")
+        max_results = args[7] if len(args) > 7 else kwargs.get("max_results")
+
+        assert keywords, "keywords is mandatory"
+
+        vqd = self._get_vqd(keywords)
+
+        safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
+        timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
+        resolution = f"videoDefinition:{resolution}" if resolution else ""
+        duration = f"videoDuration:{duration}" if duration else ""
+        license_videos = f"videoLicense:{license_videos}" if license_videos else ""
+        payload = {
+            "l": region,
+            "o": "json",
+            "q": keywords,
+            "vqd": vqd,
+            "f": f"{timelimit},{resolution},{duration},{license_videos}",
+            "p": safesearch_base[safesearch.lower()],
+        }
+
+        cache = set()
+        results: list[dict[str, str]] = []
+
+        def _videos_page(s: int) -> list[dict[str, str]]:
+            payload["s"] = f"{s}"
+            resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=payload).content
+            resp_json = self.json_loads(resp_content)
+
+            page_data = resp_json.get("results", [])
+            page_results = []
+            for row in page_data:
+                if row["content"] not in cache:
+                    cache.add(row["content"])
+                    page_results.append(row)
+            return page_results
+
+        slist = [0]
+        if max_results:
+            max_results = min(max_results, 400)
+            slist.extend(range(60, max_results, 60))
+        try:
+            for r in self._executor.map(_videos_page, slist):
+                results.extend(r)
+        except Exception as e:
+            raise e
+
+        return list(self.islice(results, max_results))
+
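
The four video filters are joined into a single comma-separated f parameter, with empty slots left in place when a filter is unset. A standalone reproduction of that composition (the function name is local to this sketch):

def ddg_video_filter(timelimit=None, resolution=None, duration=None, license_videos=None):
    # Mirrors the filter handling in DuckDuckGoVideos.run above.
    timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
    resolution = f"videoDefinition:{resolution}" if resolution else ""
    duration = f"videoDuration:{duration}" if duration else ""
    license_videos = f"videoLicense:{license_videos}" if license_videos else ""
    return f"{timelimit},{resolution},{duration},{license_videos}"

print(ddg_video_filter(timelimit="week", resolution="high"))
# publishedAfter:week,videoDefinition:high,,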
webscout/search/engines/duckduckgo/weather.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime
+from urllib.parse import quote
+
+from ....exceptions import WebscoutE
+from .base import DuckDuckGoBase
+
+
+class DuckDuckGoWeather(DuckDuckGoBase):
+    def run(self, *args, **kwargs) -> dict[str, any]:
+        location = args[0] if args else kwargs.get("location")
+        language = args[1] if len(args) > 1 else kwargs.get("language", "en")
+
+        assert location, "location is mandatory"
+        lang = language.split('-')[0]
+        url = f"https://duckduckgo.com/js/spice/forecast/{quote(location)}/{lang}"
+
+        resp = self._get_url("GET", url).content
+        resp_text = resp.decode('utf-8')
+
+        if "ddg_spice_forecast(" not in resp_text:
+            raise WebscoutE(f"No weather data found for {location}")
+
+        json_text = resp_text[resp_text.find('(') + 1:resp_text.rfind(')')]
+        try:
+            result = json.loads(json_text)
+        except Exception as e:
+            raise WebscoutE(f"Error parsing weather JSON: {e}")
+
+        if not result or 'currentWeather' not in result or 'forecastDaily' not in result:
+            raise WebscoutE(f"Invalid weather data format for {location}")
+
+        formatted_data = {
+            "location": result["currentWeather"]["metadata"].get("ddg-location", "Unknown"),
+            "current": {
+                "condition": result["currentWeather"].get("conditionCode"),
+                "temperature_c": result["currentWeather"].get("temperature"),
+                "feels_like_c": result["currentWeather"].get("temperatureApparent"),
+                "humidity": result["currentWeather"].get("humidity"),
+                "wind_speed_ms": result["currentWeather"].get("windSpeed"),
+                "wind_direction": result["currentWeather"].get("windDirection"),
+                "visibility_m": result["currentWeather"].get("visibility"),
+            },
+            "daily_forecast": [],
+            "hourly_forecast": []
+        }
+
+        for day in result["forecastDaily"]["days"]:
+            formatted_data["daily_forecast"].append({
+                "date": datetime.fromisoformat(day["forecastStart"].replace("Z", "+00:00")).strftime("%Y-%m-%d"),
+                "condition": day["daytimeForecast"].get("conditionCode"),
+                "max_temp_c": day["temperatureMax"],
+                "min_temp_c": day["temperatureMin"],
+                "sunrise": datetime.fromisoformat(day["sunrise"].replace("Z", "+00:00")).strftime("%H:%M"),
+                "sunset": datetime.fromisoformat(day["sunset"].replace("Z", "+00:00")).strftime("%H:%M"),
+            })
+
+        if 'forecastHourly' in result and 'hours' in result['forecastHourly']:
+            for hour in result['forecastHourly']['hours']:
+                formatted_data["hourly_forecast"].append({
+                    "time": datetime.fromisoformat(hour["forecastStart"].replace("Z", "+00:00")).strftime("%H:%M"),
+                    "condition": hour.get("conditionCode"),
+                    "temperature_c": hour.get("temperature"),
+                    "feels_like_c": hour.get("temperatureApparent"),
+                    "humidity": hour.get("humidity"),
+                    "wind_speed_ms": hour.get("windSpeed"),
+                    "wind_direction": hour.get("windDirection"),
+                    "visibility_m": hour.get("visibility"),
+                })
+
+        return formatted_data
+
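
The forecast endpoint answers with JSONP, a ddg_spice_forecast(...) call wrapping the JSON payload, so the handler slices out everything between the first ( and the last ) before parsing. A standalone reproduction of that unwrapping:

import json

def unwrap_jsonp(resp_text):
    # Mirrors the slicing in DuckDuckGoWeather.run above.
    if "ddg_spice_forecast(" not in resp_text:
        raise ValueError("no weather payload")
    return json.loads(resp_text[resp_text.find("(") + 1 : resp_text.rfind(")")])

print(unwrap_jsonp('ddg_spice_forecast({"currentWeather": {"temperature": 21}})'))
# {'currentWeather': {'temperature': 21}}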
webscout/search/engines/mojeek.py
@@ -0,0 +1,37 @@
+"""Mojeek search engine implementation."""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import Any
+
+from ..base import BaseSearchEngine
+from ..results import TextResult
+
+
+class Mojeek(BaseSearchEngine[TextResult]):
+    """Mojeek search engine."""
+
+    name = "mojeek"
+    category = "text"
+    provider = "mojeek"
+
+    search_url = "https://www.mojeek.com/search"
+    search_method = "GET"
+
+    items_xpath = "//ul[contains(@class, 'results')]/li"
+    elements_xpath: Mapping[str, str] = {
+        "title": ".//h2//text()",
+        "href": ".//h2/a/@href",
+        "body": ".//p[@class='s']//text()",
+    }
+
+    def build_payload(
+        self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Build a payload for the search request."""
+        safesearch_base = {"on": "1", "moderate": "0", "off": "0"}
+        payload = {"q": query, "safe": safesearch_base[safesearch.lower()]}
+        if page > 1:
+            payload["s"] = f"{(page - 1) * 10 + 1}"
+        return payload
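
Mojeek is the first of several purely declarative engines in this release: each supplies a URL, an items_xpath that selects one node per result, and an elements_xpath mapping that pulls named fields out of each node, while pagination lives in build_payload (here s = (page - 1) * 10 + 1, i.e. 11 for page 2). The shared extraction loop lives in webscout/search/base.py, which this diff does not show; a plausible sketch of what that loop presumably does with these class attributes:

# A presumed sketch of the BaseSearchEngine extraction loop, not the actual
# implementation (base.py is not included in the hunks shown here).
from lxml import html

def extract(page_html, items_xpath, elements_xpath):
    tree = html.fromstring(page_html)
    results = []
    for item in tree.xpath(items_xpath):
        # //text() and /@... xpaths return lists of strings; join them into one field.
        results.append({key: "".join(item.xpath(xp)) for key, xp in elements_xpath.items()})
    return results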
webscout/search/engines/wikipedia.py
@@ -0,0 +1,56 @@
+"""Wikipedia text search engine."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+from urllib.parse import quote
+
+from ..base import BaseSearchEngine
+from ..results import TextResult
+from ...utils import json_loads
+
+logger = logging.getLogger(__name__)
+
+
+class Wikipedia(BaseSearchEngine[TextResult]):
+    """Wikipedia text search engine."""
+
+    name = "wikipedia"
+    category = "text"
+    provider = "wikipedia"
+    priority = 2
+
+    search_url = "https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={query}"
+    search_method = "GET"
+
+    def build_payload(
+        self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Build a payload for the search request."""
+        _country, lang = region.lower().split("-")
+        encoded_query = quote(query)
+        self.search_url = (
+            f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&profile=fuzzy&limit=1&search={encoded_query}"
+        )
+        payload: dict[str, Any] = {}
+        self.lang = lang  # used in extract_results
+        return payload
+
+    def extract_results(self, html_text: str) -> list[TextResult]:
+        """Extract search results from html text."""
+        json_data = json_loads(html_text)
+        if not json_data or len(json_data) < 4:
+            return []
+
+        results = []
+        titles, descriptions, urls = json_data[1], json_data[2], json_data[3]
+
+        for title, description, url in zip(titles, descriptions, urls):
+            result = TextResult()
+            result.title = title
+            result.body = description
+            result.href = url
+            results.append(result)
+
+        return results
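
Two details are worth noting here. First, build_payload rewrites self.search_url per query, embedding the encoded query (with profile=fuzzy and limit=1) directly in the URL and returning an empty payload. Second, extract_results indexes positions 1 through 3 because MediaWiki's opensearch API returns a four-element array: the echoed query, then parallel lists of titles, descriptions, and URLs. A worked example with a sample response in that shape (the sample values are illustrative):

import json

# Sample opensearch response: [query, titles, descriptions, urls]
sample = json.loads(
    '["python", ["Python (programming language)"],'
    ' ["Python is a high-level programming language."],'
    ' ["https://en.wikipedia.org/wiki/Python_(programming_language)"]]'
)
titles, descriptions, urls = sample[1], sample[2], sample[3]
for title, description, url in zip(titles, descriptions, urls):
    print(title, "->", url)
# Python (programming language) -> https://en.wikipedia.org/wiki/Python_(programming_language)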
webscout/search/engines/yahoo.py
@@ -0,0 +1,65 @@
+"""Yahoo search engine."""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from secrets import token_urlsafe
+from typing import Any
+from urllib.parse import unquote_plus
+
+from ..base import BaseSearchEngine
+from ..results import TextResult
+
+
+def extract_url(u: str) -> str:
+    """Sanitize url."""
+    if "/RU=" in u:
+        start = u.find("/RU=") + 4
+        end = u.find("/RK=", start)
+        if end == -1:
+            end = len(u)
+        return unquote_plus(u[start:end])
+    return u
+
+
+class Yahoo(BaseSearchEngine[TextResult]):
+    """Yahoo search engine."""
+
+    name = "yahoo"
+    category = "text"
+    provider = "bing"
+
+    search_url = "https://search.yahoo.com/search"
+    search_method = "GET"
+
+    items_xpath = "//div[contains(@class, 'relsrch')]"
+    elements_xpath: Mapping[str, str] = {
+        "title": ".//div[contains(@class, 'Title')]//h3//text()",
+        "href": ".//div[contains(@class, 'Title')]//a/@href",
+        "body": ".//div[contains(@class, 'Text')]//text()",
+    }
+
+    def build_payload(
+        self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Build a payload for the search request."""
+        self.search_url = (
+            f"https://search.yahoo.com/search;_ylt={token_urlsafe(24 * 3 // 4)};_ylu={token_urlsafe(47 * 3 // 4)}"
+        )
+        payload = {"p": query}
+        if page > 1:
+            payload["b"] = f"{(page - 1) * 7 + 1}"
+        if timelimit:
+            payload["btf"] = timelimit
+        return payload
+
+    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
+        """Post-process search results."""
+        post_results = []
+        for result in results:
+            if result.href.startswith("https://www.bing.com/aclick?"):
+                continue
+            if "/RU=" in result.href:
+                result.href = extract_url(result.href)
+            post_results.append(result)
+        return post_results
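
Yahoo wraps result links in redirect URLs that embed the percent-encoded target between /RU= and /RK=; extract_url slices out that segment and unquotes it, and post_extract_results additionally drops bing.com/aclick ad links. A quick demonstration (the function body is copied from the hunk above so the snippet runs standalone; the sample URL is fabricated for illustration):

from urllib.parse import unquote_plus

def extract_url(u):  # copied from yahoo.py above
    if "/RU=" in u:
        start = u.find("/RU=") + 4
        end = u.find("/RK=", start)
        if end == -1:
            end = len(u)
        return unquote_plus(u[start:end])
    return u

wrapped = "https://r.search.yahoo.com/_ylt=abc/RU=https%3A%2F%2Fexample.com%2Fpage/RK=2/RS=xyz"
print(extract_url(wrapped))  # https://example.com/page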
webscout/search/engines/yahoo_news.py
@@ -0,0 +1,64 @@
+"""Yahoo news search engine."""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from secrets import token_urlsafe
+from typing import Any
+
+from ..base import BaseSearchEngine
+from ..results import NewsResult
+
+
+def extract_image(u: str) -> str:
+    """Sanitize image url."""
+    if u and u.startswith("data:image"):
+        return ""
+    return u
+
+
+def extract_source(s: str) -> str:
+    """Remove ' via Yahoo' from string."""
+    return s.replace(" via Yahoo", "") if s else s
+
+
+class YahooNews(BaseSearchEngine[NewsResult]):
+    """Yahoo news search engine."""
+
+    name = "yahoo"
+    category = "news"
+    provider = "bing"
+
+    search_url = "https://news.search.yahoo.com/search"
+    search_method = "GET"
+
+    items_xpath = "//div[contains(@class, 'NewsArticle')]"
+    elements_xpath: Mapping[str, str] = {
+        "date": ".//span[contains(@class, 'fc-2nd')]//text()",
+        "title": ".//h4//a//text()",
+        "url": ".//h4//a/@href",
+        "body": ".//p//text()",
+        "image": ".//img/@src",
+        "source": ".//span[contains(@class, 's-source')]//text()",
+    }
+
+    def build_payload(
+        self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Build a payload for the search request."""
+        self.search_url = (
+            f"https://news.search.yahoo.com/search;_ylt={token_urlsafe(24 * 3 // 4)};_ylu={token_urlsafe(47 * 3 // 4)}"
+        )
+        payload = {"p": query}
+        if page > 1:
+            payload["b"] = f"{(page - 1) * 10 + 1}"
+        if timelimit:
+            payload["btf"] = timelimit
+        return payload
+
+    def post_extract_results(self, results: list[NewsResult]) -> list[NewsResult]:
+        """Post-process search results."""
+        for result in results:
+            result.image = extract_image(result.image)
+            result.source = extract_source(result.source)
+        return results
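
The news variant reuses Yahoo's token-padded URL trick and differs mainly in its xpaths, a 10-per-page offset, and two small sanitizers: inline data:image thumbnails are replaced with an empty string, and a trailing " via Yahoo" is stripped from source names. For example (functions copied from the hunk above):

def extract_image(u):  # copied from yahoo_news.py above
    if u and u.startswith("data:image"):
        return ""
    return u

def extract_source(s):
    return s.replace(" via Yahoo", "") if s else s

print(repr(extract_image("data:image/png;base64,AAAA")))  # ''
print(extract_source("Reuters via Yahoo"))                 # Reuters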
webscout/search/engines/yandex.py
@@ -0,0 +1,43 @@
+"""Yandex search engine."""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from random import SystemRandom
+from typing import Any
+
+from ..base import BaseSearchEngine
+from ..results import TextResult
+
+random = SystemRandom()
+
+
+class Yandex(BaseSearchEngine[TextResult]):
+    """Yandex search engine."""
+
+    name = "yandex"
+    category = "text"
+    provider = "yandex"
+
+    search_url = "https://yandex.com/search/"
+    search_method = "GET"
+
+    items_xpath = "//li[contains(@class, 'serp-item')]"
+    elements_xpath: Mapping[str, str] = {
+        "title": ".//h2//text()",
+        "href": ".//h2/a/@href",
+        "body": ".//div[contains(@class, 'text-container')]//text()",
+    }
+
+    def build_payload(
+        self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
+    ) -> dict[str, Any]:
+        """Build a payload for the search request."""
+        safesearch_base = {"on": "1", "moderate": "0", "off": "0"}
+        payload = {
+            "text": query,
+            "family": safesearch_base[safesearch.lower()],
+        }
+        if page > 1:
+            payload["p"] = str(page - 1)
+        return payload
webscout/search/engines/yep/__init__.py
@@ -0,0 +1,13 @@
+"""Yep search engines package."""
+
+from .base import YepBase
+from .images import YepImages
+from .suggestions import YepSuggestions
+from .text import YepSearch as YepTextSearch
+
+__all__ = [
+    "YepBase",
+    "YepTextSearch",
+    "YepImages",
+    "YepSuggestions",
+]