webscout 1.2.2__tar.gz → 1.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of webscout has been flagged as potentially problematic.
- webscout-1.2.3/DeepWEBS/documents/query_results_extractor.py +99 -0
- webscout-1.2.3/DeepWEBS/networks/google_searcher.py +52 -0
- webscout-1.2.3/DeepWEBS/networks/webpage_fetcher.py +97 -0
- {webscout-1.2.2 → webscout-1.2.3}/PKG-INFO +23 -10
- {webscout-1.2.2 → webscout-1.2.3}/README.md +22 -9
- {webscout-1.2.2 → webscout-1.2.3}/setup.py +1 -1
- {webscout-1.2.2 → webscout-1.2.3}/webscout/DWEBS.py +40 -22
- webscout-1.2.3/webscout/version.py +2 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/PKG-INFO +23 -10
- webscout-1.2.2/DeepWEBS/documents/query_results_extractor.py +0 -78
- webscout-1.2.2/DeepWEBS/networks/google_searcher.py +0 -48
- webscout-1.2.2/DeepWEBS/networks/webpage_fetcher.py +0 -107
- webscout-1.2.2/webscout/version.py +0 -2
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/__init__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/documents/__init__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/documents/webpage_content_extractor.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/networks/__init__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/networks/filepath_converter.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/networks/network_configs.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/utilsdw/__init__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/utilsdw/enver.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/utilsdw/logger.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/LICENSE.md +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/setup.cfg +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/AI.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/AIbase.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/AIutel.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/HelpingAI.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/LLM.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/__init__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/__main__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/cli.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/exceptions.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/models.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/utils.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/webscout_search.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/webscout_search_async.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/SOURCES.txt +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/dependency_links.txt +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/entry_points.txt +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/requires.txt +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/top_level.txt +0 -0
webscout-1.2.3/DeepWEBS/documents/query_results_extractor.py (new file)

@@ -0,0 +1,99 @@
+from bs4 import BeautifulSoup
+from pathlib import Path
+from DeepWEBS.utilsdw.logger import logger
+
+class QueryResultsExtractor:
+    def __init__(self) -> None:
+        self.query_results = []
+        self.related_questions = []
+
+    def load_html(self, html_path):
+        try:
+            with open(html_path, "r", encoding="utf-8") as f:
+                html = f.read()
+            self.soup = BeautifulSoup(html, "html.parser")
+        except FileNotFoundError:
+            logger.error(f"File not found: {html_path}")
+        except Exception as e:
+            logger.error(f"Error loading HTML: {e}")
+
+    def extract_query_results(self):
+        try:
+            self.query = self.soup.find("textarea").text.strip()
+            query_result_elements = self.soup.find_all("div", class_="g")
+            for idx, result in enumerate(query_result_elements):
+                try:
+                    site = result.find("cite").find_previous("span").text.strip()
+                    url = result.find("a")["href"]
+                    title = result.find("h3").text.strip()
+                    abstract_element_conditions = [
+                        {"data-sncf": "1"},
+                        {"class_": "ITZIwc"},
+                    ]
+                    for condition in abstract_element_conditions:
+                        abstract_element = result.find("div", condition)
+                        if abstract_element is not None:
+                            abstract = abstract_element.text.strip()
+                            break
+                    else:
+                        abstract = ""
+                    logger.mesg(
+                        f"{title}\n"
+                        f" - {site}\n"
+                        f" - {url}\n"
+                        f" - {abstract}\n"
+                        f"\n"
+                    )
+                    self.query_results.append(
+                        {
+                            "title": title,
+                            "site": site,
+                            "url": url,
+                            "abstract": abstract,
+                            "index": idx,
+                            "type": "web",
+                        }
+                    )
+                except Exception as e:
+                    logger.error(f"Error extracting query result: {e}")
+            logger.success(f"- {len(query_result_elements)} query results")
+        except Exception as e:
+            logger.error(f"Error extracting query results: {e}")
+
+    def extract_related_questions(self):
+        try:
+            related_question_elements = self.soup.find_all(
+                "div", class_="related-question-pair"
+            )
+            for question_element in related_question_elements:
+                try:
+                    question = question_element.find("span").text.strip()
+                    print(question)
+                    self.related_questions.append(question)
+                except Exception as e:
+                    logger.error(f"Error extracting related question: {e}")
+            logger.success(f"- {len(self.related_questions)} related questions")
+        except Exception as e:
+            logger.error(f"Error extracting related questions: {e}")
+
+    def extract(self, html_path):
+        self.load_html(html_path)
+        self.extract_query_results()
+        self.extract_related_questions()
+        self.search_results = {
+            "query": self.query,
+            "query_results": self.query_results,
+            "related_questions": self.related_questions,
+        }
+        return self.search_results
+
+
+if __name__ == "__main__":
+    html_path_root = Path(__file__).parents[1] / "files"
+    html_filename = "python_tutorials"
+    html_path = html_path_root / f"{html_filename}.html"
+    extractor = QueryResultsExtractor()
+    try:
+        extractor.extract(html_path)
+    except Exception as e:
+        logger.error(f"Error in main function: {e}")
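One detail worth calling out in the new extractor is the for/else fallback used when no abstract element matches: the `else` branch of a `for` loop runs only if the loop finishes without hitting `break`. A minimal standalone sketch of that pattern (illustrative only, not part of the package):

```python
# Sketch of the for/else fallback used in extract_query_results above.
candidates = [None, None, "first non-empty abstract"]

for candidate in candidates:
    if candidate is not None:
        abstract = candidate.strip()
        break
else:
    # Runs only when the loop completed without a `break`.
    abstract = ""

print(abstract)  # -> "first non-empty abstract"
```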
webscout-1.2.3/DeepWEBS/networks/google_searcher.py (new file)

@@ -0,0 +1,52 @@
+import requests
+from pathlib import Path
+from typing import Optional
+import random
+from DeepWEBS.utilsdw.enver import enver
+from DeepWEBS.utilsdw.logger import logger
+from DeepWEBS.networks.filepath_converter import QueryToFilepathConverter
+from DeepWEBS.networks.network_configs import REQUESTS_HEADERS
+
+class GoogleSearcher:
+    def __init__(self):
+        self.url = "https://www.google.com/search"
+        self.enver = enver
+        self.enver.set_envs(proxies=True)
+        self.filepath_converter = QueryToFilepathConverter()
+
+    def send_request(self, query: str, result_num: int = 10, safe: bool = False) -> requests.Response:
+        params = {
+            "q": query,
+            "num": result_num,
+        }
+        response = requests.get(
+            self.url,
+            headers=REQUESTS_HEADERS,
+            params=params,
+            proxies=self.enver.requests_proxies,
+        )
+        response.raise_for_status()  # Raise an exception for non-2xx status codes
+        return response
+
+    def save_response(self, response: requests.Response, html_path: Path) -> None:
+        html_path.parent.mkdir(parents=True, exist_ok=True)
+        logger.note(f"Saving to: [{html_path}]")
+        with html_path.open("wb") as wf:
+            wf.write(response.content)
+
+    def search(self, query: str, result_num: int = 10, safe: bool = False, overwrite: bool = False) -> Path:
+        html_path = self.filepath_converter.convert(query)
+        logger.note(f"Searching: [{query}]")
+
+        if html_path.exists() and not overwrite:
+            logger.success(f"HTML existed: {html_path}")
+        else:
+            response = self.send_request(query, result_num, safe)
+            self.save_response(response, html_path)
+
+        return html_path
+
+if __name__ == "__main__":
+    searcher = GoogleSearcher()
+    html_path = searcher.search("python tutorials")
+    print(f"HTML file saved at: {html_path}")
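A brief usage sketch of the new `GoogleSearcher`, assuming the DeepWEBS package layout shown in this diff (the filepath converter, proxy env helper, and headers config importable as above). Because `search()` writes the results page to disk, a repeated call reuses the saved HTML unless `overwrite=True`:

```python
from DeepWEBS.networks.google_searcher import GoogleSearcher

searcher = GoogleSearcher()

# First call sends the request and saves the raw HTML to disk.
html_path = searcher.search("python tutorials", result_num=10)

# Second call finds the saved file and skips the network round trip.
cached_path = searcher.search("python tutorials", overwrite=False)

# Force a fresh download instead of reusing the cached HTML.
fresh_path = searcher.search("python tutorials", overwrite=True)

print(html_path, cached_path, fresh_path)
```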
webscout-1.2.3/DeepWEBS/networks/webpage_fetcher.py (new file)

@@ -0,0 +1,97 @@
+import concurrent.futures
+import random
+import requests
+import tldextract
+from pathlib import Path
+from typing import List, Tuple, Dict
+
+from DeepWEBS.utilsdw.enver import enver
+from DeepWEBS.utilsdw.logger import logger
+from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter
+from DeepWEBS.networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
+
+class WebpageFetcher:
+    def __init__(self):
+        self.enver = enver
+        self.enver.set_envs(proxies=True)
+        self.filepath_converter = UrlToFilepathConverter()
+
+    def is_ignored_host(self, url: str) -> bool:
+        host = tldextract.extract(url).registered_domain
+        return host in IGNORE_HOSTS
+
+    def send_request(self, url: str) -> requests.Response:
+        try:
+            user_agent = random.choice(REQUESTS_HEADERS["User-Agent"])
+            response = requests.get(
+                url=url,
+                headers={"User-Agent": user_agent},
+                proxies=self.enver.requests_proxies,
+                timeout=15,
+            )
+            response.raise_for_status()
+            return response
+        except requests.exceptions.RequestException as e:
+            logger.warn(f"Failed to fetch: [{url}] | {e}")
+            return None
+
+    def save_response(self, response: requests.Response, html_path: Path) -> None:
+        if response is None:
+            return
+
+        html_path.parent.mkdir(parents=True, exist_ok=True)
+        logger.success(f"Saving to: [{html_path}]")
+        with html_path.open("wb") as wf:
+            wf.write(response.content)
+
+    def fetch(self, url: str, overwrite: bool = False, output_parent: str = None) -> Path:
+        logger.note(f"Fetching: [{url}]")
+        html_path = self.filepath_converter.convert(url, parent=output_parent)
+
+        if self.is_ignored_host(url):
+            logger.warn(f"Ignored host: [{tldextract.extract(url).registered_domain}]")
+            return html_path
+
+        if html_path.exists() and not overwrite:
+            logger.success(f"HTML existed: [{html_path}]")
+        else:
+            response = self.send_request(url)
+            self.save_response(response, html_path)
+
+        return html_path
+
+class BatchWebpageFetcher:
+    def __init__(self):
+        self.done_count = 0
+        self.total_count = 0
+        self.url_and_html_path_list: List[Dict[str, str]] = []
+
+    def fetch_single_webpage(self, url: str, overwrite: bool = False, output_parent: str = None) -> Tuple[str, Path]:
+        webpage_fetcher = WebpageFetcher()
+        html_path = webpage_fetcher.fetch(url, overwrite, output_parent)
+        self.url_and_html_path_list.append({"url": url, "html_path": str(html_path)})
+        self.done_count += 1
+        logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
+        return url, html_path
+
+    def fetch(self, urls: List[str], overwrite: bool = False, output_parent: str = None) -> List[Dict[str, str]]:
+        self.urls = urls
+        self.total_count = len(self.urls)
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(self.fetch_single_webpage, url, overwrite, output_parent)
+                for url in urls
+            ]
+            concurrent.futures.wait(futures)
+
+        return self.url_and_html_path_list
+
+if __name__ == "__main__":
+    urls = [
+        "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
+        "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
+        "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
+    ]
+    batch_webpage_fetcher = BatchWebpageFetcher()
+    batch_webpage_fetcher.fetch(urls=urls, overwrite=True, output_parent="python tutorials")
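Taken together, the three new modules form the fetch-and-extract pipeline that DWEBS.py wires up later in this diff. A hedged sketch of that flow, assuming the same imports, proxy setup, and on-disk HTML caching shown above:

```python
from DeepWEBS.networks.google_searcher import GoogleSearcher
from DeepWEBS.networks.webpage_fetcher import BatchWebpageFetcher
from DeepWEBS.documents.query_results_extractor import QueryResultsExtractor

query = "python tutorials"

# 1. Search Google and cache the results page as HTML.
html_path = GoogleSearcher().search(query, result_num=5)

# 2. Parse the cached results page into structured records.
results = QueryResultsExtractor().extract(html_path)
urls = [item["url"] for item in results["query_results"]]

# 3. Fetch the linked webpages concurrently, grouped under the query name.
pages = BatchWebpageFetcher().fetch(urls, overwrite=False, output_parent=query)
print(f"Fetched {len(pages)} pages for '{query}'")
```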
{webscout-1.2.2 → webscout-1.2.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: webscout
-Version: 1.2.2
+Version: 1.2.3
 Summary: Search for words, documents, images, videos, news, maps and text translation using the Google, DuckDuckGo.com, yep.com, phind.com, you.com, etc Also containes AI models
 Author: OEvortex
 Author-email: helpingai5@gmail.com
@@ -69,6 +69,7 @@ Also containes AI models that you can use
 - [Regions](#regions)
 - [DeepWEBS: Advanced Web Searches](#deepwebs-advanced-web-searches)
 - [Activating DeepWEBS](#activating-deepwebs)
+- [Point to remember before using `DeepWEBS`](#point-to-remember-before-using-deepwebs)
 - [Usage Example](#usage-example)
 - [WEBS and AsyncWEBS classes](#webs-and-asyncwebs-classes)
 - [Exceptions](#exceptions)
@@ -224,26 +225,33 @@ ___
 
 To utilize the `DeepWEBS` feature, you must first create an instance of the `DeepWEBS` . This is designed to be used independently of the `WEBS` , offering a focused approach to web searches.
 
+### Point to remember before using `DeepWEBS`
+As `DeepWEBS` is designed to extract relevant information directly from webpages and Search engine, It extracts html from webpages and saves them to folder named files in `DeepWEBS` that can be found at `C:\Users\Username\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\DeepWEBS`
+
 ### Usage Example
 
 Here's a basic example of how to use the `DeepWEBS` :
 ```python
 from webscout import DeepWEBS
 
-
 def perform_web_search(query):
-
-
-
-
-
-
+    # Initialize the DeepWEBS class
+    D = DeepWEBS()
+
+    # Set up the search parameters
+    search_params = D.DeepSearch(
+        queries=[query], # Query to search
+        result_num=5, # Number of search results
+        safe=True, # Enable SafeSearch
+        types=["web"], # Search type: web
         extract_webpage=True, # True for extracting webpages
         overwrite_query_html=False,
         overwrite_webpage_html=False,
     )
-
-
+
+    # Execute the search and retrieve results
+    results = D.queries_to_search_results(search_params)
+
     return results
 
 def print_search_results(results):
@@ -260,8 +268,13 @@ def print_search_results(results):
         print("No search results found.")
 
 def main():
+    # Prompt the user for a search query
     query = input("Enter your search query: ")
+
+    # Perform the web search
     results = perform_web_search(query)
+
+    # Print the search results
     print_search_results(results)
 
 if __name__ == "__main__":
{webscout-1.2.2 → webscout-1.2.3}/README.md

@@ -18,6 +18,7 @@ Also containes AI models that you can use
 - [Regions](#regions)
 - [DeepWEBS: Advanced Web Searches](#deepwebs-advanced-web-searches)
 - [Activating DeepWEBS](#activating-deepwebs)
+- [Point to remember before using `DeepWEBS`](#point-to-remember-before-using-deepwebs)
 - [Usage Example](#usage-example)
 - [WEBS and AsyncWEBS classes](#webs-and-asyncwebs-classes)
 - [Exceptions](#exceptions)
@@ -173,26 +174,33 @@ ___
 
 To utilize the `DeepWEBS` feature, you must first create an instance of the `DeepWEBS` . This is designed to be used independently of the `WEBS` , offering a focused approach to web searches.
 
+### Point to remember before using `DeepWEBS`
+As `DeepWEBS` is designed to extract relevant information directly from webpages and Search engine, It extracts html from webpages and saves them to folder named files in `DeepWEBS` that can be found at `C:\Users\Username\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\DeepWEBS`
+
 ### Usage Example
 
 Here's a basic example of how to use the `DeepWEBS` :
 ```python
 from webscout import DeepWEBS
 
-
 def perform_web_search(query):
-
-
-
-
-
-
+    # Initialize the DeepWEBS class
+    D = DeepWEBS()
+
+    # Set up the search parameters
+    search_params = D.DeepSearch(
+        queries=[query], # Query to search
+        result_num=5, # Number of search results
+        safe=True, # Enable SafeSearch
+        types=["web"], # Search type: web
         extract_webpage=True, # True for extracting webpages
         overwrite_query_html=False,
         overwrite_webpage_html=False,
     )
-
-
+
+    # Execute the search and retrieve results
+    results = D.queries_to_search_results(search_params)
+
     return results
 
 def print_search_results(results):
@@ -209,8 +217,13 @@ def print_search_results(results):
         print("No search results found.")
 
 def main():
+    # Prompt the user for a search query
     query = input("Enter your search query: ")
+
+    # Perform the web search
     results = perform_web_search(query)
+
+    # Print the search results
     print_search_results(results)
 
 if __name__ == "__main__":
{webscout-1.2.2 → webscout-1.2.3}/setup.py

@@ -9,7 +9,7 @@ with open("README.md", encoding="utf-8") as f:
 
 setup(
     name="webscout",
-    version="1.2.2",
+    version="1.2.3",  # Use the version variable from the version.py file
     description="Search for words, documents, images, videos, news, maps and text translation using the Google, DuckDuckGo.com, yep.com, phind.com, you.com, etc Also containes AI models",
     long_description=README,
     long_description_content_type="text/markdown",
{webscout-1.2.2 → webscout-1.2.3}/webscout/DWEBS.py

@@ -51,13 +51,23 @@ class DeepWEBS:
             query_results_extractor = QueryResultsExtractor()
             if not query.strip():
                 continue
-
-
-
-
-
-
-
+            try:
+                query_html_path = google_searcher.search(
+                    query=query,
+                    result_num=item.result_num,
+                    safe=item.safe,
+                    overwrite=item.overwrite_query_html,
+                )
+            except Exception as e:
+                logger.error(f"Failed to search for query '{query}': {e}")
+                continue
+
+            try:
+                query_search_results = query_results_extractor.extract(query_html_path)
+            except Exception as e:
+                logger.error(f"Failed to extract search results for query '{query}': {e}")
+                continue
+
             queries_search_results.append(query_search_results)
         logger.note(queries_search_results)
 
@@ -70,17 +80,21 @@ class DeepWEBS:
 
     def extract_webpages(self, queries_search_results, overwrite_webpage_html=False):
         for query_idx, query_search_results in enumerate(queries_search_results):
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                # Fetch webpages with urls
+                batch_webpage_fetcher = BatchWebpageFetcher()
+                urls = [
+                    query_result["url"]
+                    for query_result in query_search_results["query_results"]
+                ]
+                url_and_html_path_list = batch_webpage_fetcher.fetch(
+                    urls,
+                    overwrite=overwrite_webpage_html,
+                    output_parent=query_search_results["query"],
+                )
+            except Exception as e:
+                logger.error(f"Failed to fetch webpages for query '{query_search_results['query']}': {e}")
+                continue
 
             # Extract webpage contents from htmls
             html_paths = [
@@ -88,9 +102,13 @@ class DeepWEBS:
                 for url_and_html_path in url_and_html_path_list
             ]
             batch_webpage_content_extractor = BatchWebpageContentExtractor()
-
-
-
+            try:
+                html_path_and_extracted_content_list = (
+                    batch_webpage_content_extractor.extract(html_paths)
+                )
+            except Exception as e:
+                logger.error(f"Failed to extract webpage contents for query '{query_search_results['query']}': {e}")
+                continue
 
             # Build the map of url to extracted_content
             html_path_to_url_dict = {
@@ -109,7 +127,7 @@ class DeepWEBS:
                 query_search_results["query_results"]
             ):
                 url = query_result["url"]
-                extracted_content = url_to_extracted_content_dict
+                extracted_content = url_to_extracted_content_dict.get(url, "")
                 queries_search_results[query_idx]["query_results"][query_result_idx][
                     "text"
                 ] = extracted_content
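The DWEBS.py change follows one pattern throughout: each per-query step is wrapped in its own try/except with `continue`, so a single failing query is logged and skipped rather than aborting the whole batch. A generic, runnable sketch of that pattern (illustrative only; `search` and `extract` below are placeholders, not the package's functions):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("dwebs-sketch")

def search(query: str) -> str:
    # Placeholder standing in for GoogleSearcher().search(...)
    if not query.strip():
        raise ValueError("empty query")
    return f"{query}.html"

def extract(html_path: str) -> dict:
    # Placeholder standing in for QueryResultsExtractor().extract(...)
    return {"query_html": html_path, "query_results": []}

def process_queries(queries):
    results = []
    for query in queries:
        # Each step is isolated: a failing query is logged and skipped,
        # so it cannot abort the remaining queries in the batch.
        try:
            html_path = search(query)
        except Exception as e:
            logger.error("Search failed for %r: %s", query, e)
            continue
        try:
            results.append(extract(html_path))
        except Exception as e:
            logger.error("Extraction failed for %r: %s", query, e)
            continue
    return results

print(process_queries(["python tutorials", "   ", "bs4 docs"]))
```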
{webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: webscout
-Version: 1.2.2
+Version: 1.2.3
 Summary: Search for words, documents, images, videos, news, maps and text translation using the Google, DuckDuckGo.com, yep.com, phind.com, you.com, etc Also containes AI models
 Author: OEvortex
 Author-email: helpingai5@gmail.com
@@ -69,6 +69,7 @@ Also containes AI models that you can use
 - [Regions](#regions)
 - [DeepWEBS: Advanced Web Searches](#deepwebs-advanced-web-searches)
 - [Activating DeepWEBS](#activating-deepwebs)
+- [Point to remember before using `DeepWEBS`](#point-to-remember-before-using-deepwebs)
 - [Usage Example](#usage-example)
 - [WEBS and AsyncWEBS classes](#webs-and-asyncwebs-classes)
 - [Exceptions](#exceptions)
@@ -224,26 +225,33 @@ ___
 
 To utilize the `DeepWEBS` feature, you must first create an instance of the `DeepWEBS` . This is designed to be used independently of the `WEBS` , offering a focused approach to web searches.
 
+### Point to remember before using `DeepWEBS`
+As `DeepWEBS` is designed to extract relevant information directly from webpages and Search engine, It extracts html from webpages and saves them to folder named files in `DeepWEBS` that can be found at `C:\Users\Username\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\DeepWEBS`
+
 ### Usage Example
 
 Here's a basic example of how to use the `DeepWEBS` :
 ```python
 from webscout import DeepWEBS
 
-
 def perform_web_search(query):
-
-
-
-
-
-
+    # Initialize the DeepWEBS class
+    D = DeepWEBS()
+
+    # Set up the search parameters
+    search_params = D.DeepSearch(
+        queries=[query], # Query to search
+        result_num=5, # Number of search results
+        safe=True, # Enable SafeSearch
+        types=["web"], # Search type: web
         extract_webpage=True, # True for extracting webpages
         overwrite_query_html=False,
         overwrite_webpage_html=False,
     )
-
-
+
+    # Execute the search and retrieve results
+    results = D.queries_to_search_results(search_params)
+
     return results
 
 def print_search_results(results):
@@ -260,8 +268,13 @@ def print_search_results(results):
         print("No search results found.")
 
 def main():
+    # Prompt the user for a search query
     query = input("Enter your search query: ")
+
+    # Perform the web search
     results = perform_web_search(query)
+
+    # Print the search results
     print_search_results(results)
 
 if __name__ == "__main__":
webscout-1.2.2/DeepWEBS/documents/query_results_extractor.py (removed)

@@ -1,78 +0,0 @@
-from bs4 import BeautifulSoup
-from pathlib import Path
-from DeepWEBS.utilsdw.logger import logger
-
-
-class QueryResultsExtractor:
-    def __init__(self) -> None:
-        self.query_results = []
-        self.related_questions = []
-
-    def load_html(self, html_path):
-        with open(html_path, "r", encoding="utf-8") as f:
-            html = f.read()
-            self.soup = BeautifulSoup(html, "html.parser")
-
-    def extract_query_results(self):
-        self.query = self.soup.find("textarea").text.strip()
-        query_result_elements = self.soup.find_all("div", class_="g")
-        for idx, result in enumerate(query_result_elements):
-            site = result.find("cite").find_previous("span").text.strip()
-            url = result.find("a")["href"]
-            title = result.find("h3").text.strip()
-
-            abstract_element_conditions = [
-                {"data-sncf": "1"},
-                {"class_": "ITZIwc"},
-            ]
-            for condition in abstract_element_conditions:
-                abstract_element = result.find("div", condition)
-                if abstract_element is not None:
-                    abstract = abstract_element.text.strip()
-                    break
-            else:
-                abstract = ""
-
-            logger.mesg(
-                f"{title}\n" f" - {site}\n" f" - {url}\n" f" - {abstract}\n" f"\n"
-            )
-            self.query_results.append(
-                {
-                    "title": title,
-                    "site": site,
-                    "url": url,
-                    "abstract": abstract,
-                    "index": idx,
-                    "type": "web",
-                }
-            )
-        logger.success(f"- {len(query_result_elements)} query results")
-
-    def extract_related_questions(self):
-        related_question_elements = self.soup.find_all(
-            "div", class_="related-question-pair"
-        )
-        for question_element in related_question_elements:
-            question = question_element.find("span").text.strip()
-            print(question)
-            self.related_questions.append(question)
-        logger.success(f"- {len(self.related_questions)} related questions")
-
-    def extract(self, html_path):
-        self.load_html(html_path)
-        self.extract_query_results()
-        self.extract_related_questions()
-        self.search_results = {
-            "query": self.query,
-            "query_results": self.query_results,
-            "related_questions": self.related_questions,
-        }
-        return self.search_results
-
-
-if __name__ == "__main__":
-    html_path_root = Path(__file__).parents[1] / "files"
-    html_filename = "python_tutorials"
-    html_path = html_path_root / f"{html_filename}.html"
-    extractor = QueryResultsExtractor()
-    extractor.extract(html_path)
webscout-1.2.2/DeepWEBS/networks/google_searcher.py (removed)

@@ -1,48 +0,0 @@
-import requests
-from pathlib import Path
-from DeepWEBS.utilsdw.enver import enver
-from DeepWEBS.utilsdw.logger import logger
-from DeepWEBS.networks.filepath_converter import QueryToFilepathConverter
-from DeepWEBS.networks.network_configs import REQUESTS_HEADERS
-
-
-class GoogleSearcher:
-    def __init__(self):
-        self.url = "https://www.google.com/search"
-        self.enver = enver
-        self.enver.set_envs(proxies=True)
-        self.filepath_converter = QueryToFilepathConverter()
-
-    def send_request(self, result_num=10, safe=False):
-        self.request_response = requests.get(
-            url=self.url,
-            headers=REQUESTS_HEADERS,
-            params={
-                "q": self.query,
-                "num": result_num,
-            },
-            proxies=self.enver.requests_proxies,
-        )
-
-    def save_response(self):
-        if not self.html_path.exists():
-            self.html_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.note(f"Saving to: [{self.html_path}]")
-        with open(self.html_path, "wb") as wf:
-            wf.write(self.request_response.content)
-
-    def search(self, query, result_num=10, safe=False, overwrite=False):
-        self.query = query
-        self.html_path = self.filepath_converter.convert(self.query)
-        logger.note(f"Searching: [{self.query}]")
-        if self.html_path.exists() and not overwrite:
-            logger.success(f"HTML existed: {self.html_path}")
-        else:
-            self.send_request(result_num=result_num, safe=safe)
-            self.save_response()
-        return self.html_path
-
-
-if __name__ == "__main__":
-    searcher = GoogleSearcher()
-    searcher.search("python tutorials")
webscout-1.2.2/DeepWEBS/networks/webpage_fetcher.py (removed)

@@ -1,107 +0,0 @@
-import concurrent.futures
-import requests
-import tldextract
-from pathlib import Path
-from DeepWEBS.utilsdw.enver import enver
-from DeepWEBS.utilsdw.logger import logger
-from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter
-from DeepWEBS.networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
-
-
-class WebpageFetcher:
-    def __init__(self):
-        self.enver = enver
-        self.enver.set_envs(proxies=True)
-        self.filepath_converter = UrlToFilepathConverter()
-
-    def is_ignored_host(self, url):
-        self.host = tldextract.extract(url).registered_domain
-        if self.host in IGNORE_HOSTS:
-            return True
-        else:
-            return False
-
-    def send_request(self):
-        try:
-            self.request_response = requests.get(
-                url=self.url,
-                headers=REQUESTS_HEADERS,
-                proxies=self.enver.requests_proxies,
-                timeout=15,
-            )
-        except:
-            logger.warn(f"Failed to fetch: [{self.url}]")
-            self.request_response = None
-
-    def save_response(self):
-        if not self.html_path.exists():
-            self.html_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.success(f"Saving to: [{self.html_path}]")
-
-        if self.request_response is None:
-            return
-        else:
-            with open(self.html_path, "wb") as wf:
-                wf.write(self.request_response.content)
-
-    def fetch(self, url, overwrite=False, output_parent=None):
-        self.url = url
-        logger.note(f"Fetching: [{self.url}]")
-        self.html_path = self.filepath_converter.convert(self.url, parent=output_parent)
-
-        if self.is_ignored_host(self.url):
-            logger.warn(f"Ignore host: [{self.host}]")
-            return self.html_path
-
-        if self.html_path.exists() and not overwrite:
-            logger.success(f"HTML existed: [{self.html_path}]")
-        else:
-            self.send_request()
-            self.save_response()
-        return self.html_path
-
-
-class BatchWebpageFetcher:
-    def __init__(self):
-        self.done_count = 0
-        self.total_count = 0
-        self.url_and_html_path_list = []
-
-    def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
-        webpage_fetcher = WebpageFetcher()
-        html_path = webpage_fetcher.fetch(
-            url=url, overwrite=overwrite, output_parent=output_parent
-        )
-        self.url_and_html_path_list.append({"url": url, "html_path": html_path})
-        self.done_count += 1
-        logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
-
-    def fetch(self, urls, overwrite=False, output_parent=None):
-        self.urls = urls
-        self.total_count = len(self.urls)
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(
-                    self.fecth_single_webpage,
-                    url=url,
-                    overwrite=overwrite,
-                    output_parent=output_parent,
-                )
-                for url in urls
-            ]
-
-            for idx, future in enumerate(concurrent.futures.as_completed(futures)):
-                result = future.result()
-        return self.url_and_html_path_list
-
-
-if __name__ == "__main__":
-    urls = [
-        "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
-        "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
-        "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
-    ]
-    batch_webpage_fetcher = BatchWebpageFetcher()
-    batch_webpage_fetcher.fetch(
-        urls=urls, overwrite=True, output_parent="python tutorials"
-    )