webscout 4.8__py3-none-any.whl → 4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/Bing_search.py +124 -0
- webscout/DWEBS.py +141 -777
- webscout/Provider/Cloudflare.py +286 -0
- webscout/Provider/DiscordRocks.py +3 -3
- webscout/Provider/Farfalle.py +3 -3
- webscout/Provider/Llama3.py +3 -3
- webscout/Provider/PI.py +208 -0
- webscout/Provider/Youchat.py +247 -0
- webscout/Provider/__init__.py +16 -2
- webscout/Provider/felo_search.py +238 -0
- webscout/Provider/julius.py +263 -0
- webscout/Provider/turboseek.py +237 -0
- webscout/Provider/xdash.py +202 -0
- webscout/Provider/yep.py +258 -0
- webscout/__init__.py +1 -59
- {webscout-4.8.dist-info → webscout-4.9.dist-info}/METADATA +25 -74
- {webscout-4.8.dist-info → webscout-4.9.dist-info}/RECORD +21 -13
- webscout/GoogleS.py +0 -342
- {webscout-4.8.dist-info → webscout-4.9.dist-info}/LICENSE.md +0 -0
- {webscout-4.8.dist-info → webscout-4.9.dist-info}/WHEEL +0 -0
- {webscout-4.8.dist-info → webscout-4.9.dist-info}/entry_points.txt +0 -0
- {webscout-4.8.dist-info → webscout-4.9.dist-info}/top_level.txt +0 -0
webscout/Bing_search.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
from bs4 import BeautifulSoup
|
|
2
|
+
import requests
|
|
3
|
+
from typing import Dict, List, Optional, Union
|
|
4
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
from termcolor import colored
|
|
7
|
+
import time
|
|
8
|
+
import random
|
|
9
|
+
|
|
10
|
+
class BingS:
    """Scrape organic web results from bing.com's HTML search page.

    The instance owns a ``requests.Session`` and can be used as a context
    manager so the session is closed on exit::

        with BingS() as searcher:
            hits = searcher.search("python", max_results=10)
    """

    # Class-level pool kept for backward compatibility; no method of this
    # class submits work to it.
    _executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=10)

    def __init__(
        self,
        headers: Optional[Dict[str, str]] = None,
        proxy: Optional[str] = None,
        timeout: Optional[int] = 10,
    ) -> None:
        """Initialize the BingS object.

        Args:
            headers: Extra HTTP headers. When omitted (or empty) a desktop
                Edge ``User-Agent`` is used. A ``Referer`` pointing at
                bing.com is always set.
            proxy: Proxy URL applied to both ``http`` and ``https`` traffic.
            timeout: Per-request timeout in seconds passed to ``requests``.
        """
        self.proxy: Optional[str] = proxy
        # Copy the mapping so that adding the Referer below never mutates the
        # dictionary object owned by the caller.
        self.headers: Dict[str, str] = dict(headers) if headers else {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"
        }
        self.headers["Referer"] = "https://www.bing.com/"
        self.client = requests.Session()
        self.client.headers.update(self.headers)
        self.client.proxies.update({"http": self.proxy, "https": self.proxy})
        self.timeout = timeout

    def __enter__(self) -> "BingS":
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the underlying HTTP session; exceptions are not suppressed."""
        self.client.close()

    def _get_url(
        self,
        method: str,
        url: str,
        params: Optional[Dict[str, str]] = None,
        data: Optional[Union[Dict[str, str], bytes]] = None,
    ) -> bytes:
        """Perform an HTTP request through the session and return the raw body.

        Args:
            method: HTTP verb, e.g. ``"GET"``.
            url: Target URL.
            params: Query-string parameters.
            data: Request body.

        Returns:
            The response body as bytes when the status code is 200.

        Raises:
            Exception: If the request itself fails (the original error is
                chained as ``__cause__``) or the status code is not 200.
        """
        try:
            resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
        except Exception as ex:
            raise Exception(f"{url} {type(ex).__name__}: {ex}") from ex
        if resp.status_code == 200:
            return resp.content
        raise Exception(f"{resp.url} returned status code {resp.status_code}. {params=} {data=}")

    @staticmethod
    def _parse_result(result, index: int) -> Optional[Dict[str, str]]:
        """Convert one ``li.b_algo`` element into a result dict, or ``None`` if it has no link."""
        link = result.find("a", href=True)
        if link is None:
            return None

        heading = result.find("h2")
        paragraph = result.find("p")
        title = heading.text if heading is not None else ""
        description = paragraph.text.strip() if paragraph is not None else ""

        # Remove 'WEB' prefix if present
        if description.startswith("WEB"):
            description = description[4:]  # Skip the first 4 characters ('WEB ')

        return {
            "title": title,
            "href": link["href"],
            "abstract": description,
            "index": index,
            "type": "web",
        }

    def search(
        self,
        keywords: str,
        region: str = "us-EN",  # Bing uses us-EN
        lang: str = "en",
        safe: str = "off",
        timelimit: Optional[str] = None,  # Not directly supported by Bing
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """Bing text search.

        Pages through ``https://www.bing.com/search`` ten results at a time
        and collects every ``li.b_algo`` entry that carries a link.

        Args:
            keywords: Search query; must be non-empty.
            region: Sent as the ``mkt`` query parameter.
            lang: Sent as the ``setlang`` query parameter.
            safe: Sent as the ``safeSearch`` query parameter.
            timelimit: Accepted for signature compatibility; it is not sent
                to Bing.
            max_results: Maximum number of results to return. ``None`` means
                keep paging until a page yields no result blocks; ``0``
                returns an empty list.

        Returns:
            A list of dicts with the keys ``title``, ``href``, ``abstract``,
            ``index`` (0-based position, an ``int``) and ``type``.

        Raises:
            AssertionError: If ``keywords`` is empty.
        """
        assert keywords, "keywords is mandatory"

        results: List[Dict[str, str]] = []
        # Resolve the limit once so that None is never compared with an int.
        limit = float("inf") if max_results is None else max_results
        start = 1  # Bing uses 1-based indexing for pages

        while len(results) < limit:
            params = {
                "q": keywords,
                "count": 10,  # Number of results per page
                "mkt": region,
                "setlang": lang,
                "safeSearch": safe,
                "first": start,  # Bing uses 'first' for pagination
            }

            try:
                resp_content = self._get_url("GET", "https://www.bing.com/search", params=params)
                soup = BeautifulSoup(resp_content, "html.parser")
                result_block = soup.find_all("li", class_="b_algo")
            except Exception as e:
                print(f"Error fetching URL: {e}")
                # Stop paging: retrying the next offset after a failure would
                # loop forever when the error is persistent (offline, 4xx/5xx).
                break

            if not result_block:
                break

            for result in result_block:
                try:
                    entry = self._parse_result(result, len(results))
                except Exception as e:
                    # Best effort: one malformed entry must not abort the page.
                    print(f"Error extracting result: {e}")
                    continue
                if entry is None:
                    continue
                results.append(entry)
                if len(results) >= limit:
                    return results

            start += 10

        return results
|
|
118
|
+
|
|
119
|
+
if __name__ == "__main__":
    # Manual smoke test: run one live Bing query and pretty-print each hit.
    from rich import print

    # Use the context manager so the HTTP session is closed once the search
    # is done, instead of leaving it open until interpreter exit.
    with BingS() as searcher:
        results = searcher.search("Python development tools", max_results=30)

    for result in results:
        print(result)
|