webscout 1.2.2.tar.gz → 1.2.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of webscout has been flagged as potentially problematic.

Files changed (42)
  1. webscout-1.2.3/DeepWEBS/documents/query_results_extractor.py +99 -0
  2. webscout-1.2.3/DeepWEBS/networks/google_searcher.py +52 -0
  3. webscout-1.2.3/DeepWEBS/networks/webpage_fetcher.py +97 -0
  4. {webscout-1.2.2 → webscout-1.2.3}/PKG-INFO +23 -10
  5. {webscout-1.2.2 → webscout-1.2.3}/README.md +22 -9
  6. {webscout-1.2.2 → webscout-1.2.3}/setup.py +1 -1
  7. {webscout-1.2.2 → webscout-1.2.3}/webscout/DWEBS.py +40 -22
  8. webscout-1.2.3/webscout/version.py +2 -0
  9. {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/PKG-INFO +23 -10
  10. webscout-1.2.2/DeepWEBS/documents/query_results_extractor.py +0 -78
  11. webscout-1.2.2/DeepWEBS/networks/google_searcher.py +0 -48
  12. webscout-1.2.2/DeepWEBS/networks/webpage_fetcher.py +0 -107
  13. webscout-1.2.2/webscout/version.py +0 -2
  14. {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/__init__.py +0 -0
  15. {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/documents/__init__.py +0 -0
  16. {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/documents/webpage_content_extractor.py +0 -0
  17. {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/networks/__init__.py +0 -0
  18. {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/networks/filepath_converter.py +0 -0
  19. {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/networks/network_configs.py +0 -0
  20. {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/utilsdw/__init__.py +0 -0
  21. {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/utilsdw/enver.py +0 -0
  22. {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/utilsdw/logger.py +0 -0
  23. {webscout-1.2.2 → webscout-1.2.3}/LICENSE.md +0 -0
  24. {webscout-1.2.2 → webscout-1.2.3}/setup.cfg +0 -0
  25. {webscout-1.2.2 → webscout-1.2.3}/webscout/AI.py +0 -0
  26. {webscout-1.2.2 → webscout-1.2.3}/webscout/AIbase.py +0 -0
  27. {webscout-1.2.2 → webscout-1.2.3}/webscout/AIutel.py +0 -0
  28. {webscout-1.2.2 → webscout-1.2.3}/webscout/HelpingAI.py +0 -0
  29. {webscout-1.2.2 → webscout-1.2.3}/webscout/LLM.py +0 -0
  30. {webscout-1.2.2 → webscout-1.2.3}/webscout/__init__.py +0 -0
  31. {webscout-1.2.2 → webscout-1.2.3}/webscout/__main__.py +0 -0
  32. {webscout-1.2.2 → webscout-1.2.3}/webscout/cli.py +0 -0
  33. {webscout-1.2.2 → webscout-1.2.3}/webscout/exceptions.py +0 -0
  34. {webscout-1.2.2 → webscout-1.2.3}/webscout/models.py +0 -0
  35. {webscout-1.2.2 → webscout-1.2.3}/webscout/utils.py +0 -0
  36. {webscout-1.2.2 → webscout-1.2.3}/webscout/webscout_search.py +0 -0
  37. {webscout-1.2.2 → webscout-1.2.3}/webscout/webscout_search_async.py +0 -0
  38. {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/SOURCES.txt +0 -0
  39. {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/dependency_links.txt +0 -0
  40. {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/entry_points.txt +0 -0
  41. {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/requires.txt +0 -0
  42. {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/top_level.txt +0 -0
@@ -0,0 +1,99 @@
+ from bs4 import BeautifulSoup
+ from pathlib import Path
+ from DeepWEBS.utilsdw.logger import logger
+
+ class QueryResultsExtractor:
+     def __init__(self) -> None:
+         self.query_results = []
+         self.related_questions = []
+
+     def load_html(self, html_path):
+         try:
+             with open(html_path, "r", encoding="utf-8") as f:
+                 html = f.read()
+             self.soup = BeautifulSoup(html, "html.parser")
+         except FileNotFoundError:
+             logger.error(f"File not found: {html_path}")
+         except Exception as e:
+             logger.error(f"Error loading HTML: {e}")
+
+     def extract_query_results(self):
+         try:
+             self.query = self.soup.find("textarea").text.strip()
+             query_result_elements = self.soup.find_all("div", class_="g")
+             for idx, result in enumerate(query_result_elements):
+                 try:
+                     site = result.find("cite").find_previous("span").text.strip()
+                     url = result.find("a")["href"]
+                     title = result.find("h3").text.strip()
+                     abstract_element_conditions = [
+                         {"data-sncf": "1"},
+                         {"class_": "ITZIwc"},
+                     ]
+                     for condition in abstract_element_conditions:
+                         abstract_element = result.find("div", condition)
+                         if abstract_element is not None:
+                             abstract = abstract_element.text.strip()
+                             break
+                     else:
+                         abstract = ""
+                     logger.mesg(
+                         f"{title}\n"
+                         f" - {site}\n"
+                         f" - {url}\n"
+                         f" - {abstract}\n"
+                         f"\n"
+                     )
+                     self.query_results.append(
+                         {
+                             "title": title,
+                             "site": site,
+                             "url": url,
+                             "abstract": abstract,
+                             "index": idx,
+                             "type": "web",
+                         }
+                     )
+                 except Exception as e:
+                     logger.error(f"Error extracting query result: {e}")
+             logger.success(f"- {len(query_result_elements)} query results")
+         except Exception as e:
+             logger.error(f"Error extracting query results: {e}")
+
+     def extract_related_questions(self):
+         try:
+             related_question_elements = self.soup.find_all(
+                 "div", class_="related-question-pair"
+             )
+             for question_element in related_question_elements:
+                 try:
+                     question = question_element.find("span").text.strip()
+                     print(question)
+                     self.related_questions.append(question)
+                 except Exception as e:
+                     logger.error(f"Error extracting related question: {e}")
+             logger.success(f"- {len(self.related_questions)} related questions")
+         except Exception as e:
+             logger.error(f"Error extracting related questions: {e}")
+
+     def extract(self, html_path):
+         self.load_html(html_path)
+         self.extract_query_results()
+         self.extract_related_questions()
+         self.search_results = {
+             "query": self.query,
+             "query_results": self.query_results,
+             "related_questions": self.related_questions,
+         }
+         return self.search_results
+
+
+ if __name__ == "__main__":
+     html_path_root = Path(__file__).parents[1] / "files"
+     html_filename = "python_tutorials"
+     html_path = html_path_root / f"{html_filename}.html"
+     extractor = QueryResultsExtractor()
+     try:
+         extractor.extract(html_path)
+     except Exception as e:
+         logger.error(f"Error in main function: {e}")
@@ -0,0 +1,52 @@
+ import requests
+ from pathlib import Path
+ from typing import Optional
+ import random
+ from DeepWEBS.utilsdw.enver import enver
+ from DeepWEBS.utilsdw.logger import logger
+ from DeepWEBS.networks.filepath_converter import QueryToFilepathConverter
+ from DeepWEBS.networks.network_configs import REQUESTS_HEADERS
+
+ class GoogleSearcher:
+     def __init__(self):
+         self.url = "https://www.google.com/search"
+         self.enver = enver
+         self.enver.set_envs(proxies=True)
+         self.filepath_converter = QueryToFilepathConverter()
+
+     def send_request(self, query: str, result_num: int = 10, safe: bool = False) -> requests.Response:
+         params = {
+             "q": query,
+             "num": result_num,
+         }
+         response = requests.get(
+             self.url,
+             headers=REQUESTS_HEADERS,
+             params=params,
+             proxies=self.enver.requests_proxies,
+         )
+         response.raise_for_status()  # Raise an exception for non-2xx status codes
+         return response
+
+     def save_response(self, response: requests.Response, html_path: Path) -> None:
+         html_path.parent.mkdir(parents=True, exist_ok=True)
+         logger.note(f"Saving to: [{html_path}]")
+         with html_path.open("wb") as wf:
+             wf.write(response.content)
+
+     def search(self, query: str, result_num: int = 10, safe: bool = False, overwrite: bool = False) -> Path:
+         html_path = self.filepath_converter.convert(query)
+         logger.note(f"Searching: [{query}]")
+
+         if html_path.exists() and not overwrite:
+             logger.success(f"HTML existed: {html_path}")
+         else:
+             response = self.send_request(query, result_num, safe)
+             self.save_response(response, html_path)
+
+         return html_path
+
+ if __name__ == "__main__":
+     searcher = GoogleSearcher()
+     html_path = searcher.search("python tutorials")
+     print(f"HTML file saved at: {html_path}")
@@ -0,0 +1,97 @@
+ import concurrent.futures
+ import random
+ import requests
+ import tldextract
+ from pathlib import Path
+ from typing import List, Tuple, Dict
+
+ from DeepWEBS.utilsdw.enver import enver
+ from DeepWEBS.utilsdw.logger import logger
+ from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter
+ from DeepWEBS.networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
+
+ class WebpageFetcher:
+     def __init__(self):
+         self.enver = enver
+         self.enver.set_envs(proxies=True)
+         self.filepath_converter = UrlToFilepathConverter()
+
+     def is_ignored_host(self, url: str) -> bool:
+         host = tldextract.extract(url).registered_domain
+         return host in IGNORE_HOSTS
+
+     def send_request(self, url: str) -> requests.Response:
+         try:
+             user_agent = random.choice(REQUESTS_HEADERS["User-Agent"])
+             response = requests.get(
+                 url=url,
+                 headers={"User-Agent": user_agent},
+                 proxies=self.enver.requests_proxies,
+                 timeout=15,
+             )
+             response.raise_for_status()
+             return response
+         except requests.exceptions.RequestException as e:
+             logger.warn(f"Failed to fetch: [{url}] | {e}")
+             return None
+
+     def save_response(self, response: requests.Response, html_path: Path) -> None:
+         if response is None:
+             return
+
+         html_path.parent.mkdir(parents=True, exist_ok=True)
+         logger.success(f"Saving to: [{html_path}]")
+         with html_path.open("wb") as wf:
+             wf.write(response.content)
+
+     def fetch(self, url: str, overwrite: bool = False, output_parent: str = None) -> Path:
+         logger.note(f"Fetching: [{url}]")
+         html_path = self.filepath_converter.convert(url, parent=output_parent)
+
+         if self.is_ignored_host(url):
+             logger.warn(f"Ignored host: [{tldextract.extract(url).registered_domain}]")
+             return html_path
+
+         if html_path.exists() and not overwrite:
+             logger.success(f"HTML existed: [{html_path}]")
+         else:
+             response = self.send_request(url)
+             self.save_response(response, html_path)
+
+         return html_path
+
+ class BatchWebpageFetcher:
+     def __init__(self):
+         self.done_count = 0
+         self.total_count = 0
+         self.url_and_html_path_list: List[Dict[str, str]] = []
+
+     def fetch_single_webpage(self, url: str, overwrite: bool = False, output_parent: str = None) -> Tuple[str, Path]:
+         webpage_fetcher = WebpageFetcher()
+         html_path = webpage_fetcher.fetch(url, overwrite, output_parent)
+         self.url_and_html_path_list.append({"url": url, "html_path": str(html_path)})
+         self.done_count += 1
+         logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
+         return url, html_path
+
+     def fetch(self, urls: List[str], overwrite: bool = False, output_parent: str = None) -> List[Dict[str, str]]:
+         self.urls = urls
+         self.total_count = len(self.urls)
+
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             futures = [
+                 executor.submit(self.fetch_single_webpage, url, overwrite, output_parent)
+                 for url in urls
+             ]
+             concurrent.futures.wait(futures)
+
+         return self.url_and_html_path_list
+
+ if __name__ == "__main__":
+     urls = [
+         "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
+         "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
+         "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
+     ]
+     batch_webpage_fetcher = BatchWebpageFetcher()
+     batch_webpage_fetcher.fetch(urls=urls, overwrite=True, output_parent="python tutorials")
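In 1.2.3, BatchWebpageFetcher.fetch() returns its url_and_html_path_list, with each html_path stored as a string. A small sketch of consuming that return value (assumed usage, not taken from the package):

```python
# Sketch: fetch pages concurrently and inspect the url -> html_path mapping
# that fetch() now returns.
from DeepWEBS.networks.webpage_fetcher import BatchWebpageFetcher

fetcher = BatchWebpageFetcher()
pairs = fetcher.fetch(
    urls=["https://docs.python.org/3/tutorial/interpreter.html"],
    overwrite=False,
    output_parent="python tutorials",
)
for pair in pairs:
    print(pair["url"], "->", pair["html_path"])
```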
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: webscout
- Version: 1.2.2
+ Version: 1.2.3
  Summary: Search for words, documents, images, videos, news, maps and text translation using the Google, DuckDuckGo.com, yep.com, phind.com, you.com, etc Also containes AI models
  Author: OEvortex
  Author-email: helpingai5@gmail.com
@@ -69,6 +69,7 @@ Also containes AI models that you can use
  - [Regions](#regions)
  - [DeepWEBS: Advanced Web Searches](#deepwebs-advanced-web-searches)
  - [Activating DeepWEBS](#activating-deepwebs)
+ - [Point to remember before using `DeepWEBS`](#point-to-remember-before-using-deepwebs)
  - [Usage Example](#usage-example)
  - [WEBS and AsyncWEBS classes](#webs-and-asyncwebs-classes)
  - [Exceptions](#exceptions)
@@ -224,26 +225,33 @@ ___
 
  To utilize the `DeepWEBS` feature, you must first create an instance of the `DeepWEBS` . This is designed to be used independently of the `WEBS` , offering a focused approach to web searches.
 
+ ### Point to remember before using `DeepWEBS`
+ As `DeepWEBS` is designed to extract relevant information directly from webpages and Search engine, It extracts html from webpages and saves them to folder named files in `DeepWEBS` that can be found at `C:\Users\Username\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\DeepWEBS`
+
  ### Usage Example
 
  Here's a basic example of how to use the `DeepWEBS` :
  ```python
  from webscout import DeepWEBS
 
-
  def perform_web_search(query):
-     D = DeepWEBS()
-     item = D.DeepSearch(
-         queries=[query], # Query to search
-         result_num=5, # Number of search results
-         safe=True, # Enable SafeSearch
-         types=["web"], # Search type: web
+     # Initialize the DeepWEBS class
+     D = DeepWEBS()
+
+     # Set up the search parameters
+     search_params = D.DeepSearch(
+         queries=[query], # Query to search
+         result_num=5, # Number of search results
+         safe=True, # Enable SafeSearch
+         types=["web"], # Search type: web
          extract_webpage=True, # True for extracting webpages
          overwrite_query_html=False,
          overwrite_webpage_html=False,
      )
-     results = D.queries_to_search_results(item)
-
+
+     # Execute the search and retrieve results
+     results = D.queries_to_search_results(search_params)
+
      return results
 
  def print_search_results(results):
@@ -260,8 +268,13 @@ def print_search_results(results):
          print("No search results found.")
 
  def main():
+     # Prompt the user for a search query
      query = input("Enter your search query: ")
+
+     # Perform the web search
      results = perform_web_search(query)
+
+     # Print the search results
      print_search_results(results)
 
  if __name__ == "__main__":
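The new "Point to remember" section quotes a user-specific Windows path. A more portable way to find the folder where DeepWEBS stores fetched HTML (a sketch, assuming DeepWEBS is importable and keeps a `files` folder next to the package, as the `__main__` block in query_results_extractor.py suggests):

```python
# Sketch: locate the DeepWEBS "files" directory relative to the installed package
# instead of hard-coding a Windows-specific site-packages path.
from pathlib import Path
import DeepWEBS

files_dir = Path(DeepWEBS.__file__).parent / "files"
print(files_dir)
```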
@@ -18,6 +18,7 @@ Also containes AI models that you can use
  - [Regions](#regions)
  - [DeepWEBS: Advanced Web Searches](#deepwebs-advanced-web-searches)
  - [Activating DeepWEBS](#activating-deepwebs)
+ - [Point to remember before using `DeepWEBS`](#point-to-remember-before-using-deepwebs)
  - [Usage Example](#usage-example)
  - [WEBS and AsyncWEBS classes](#webs-and-asyncwebs-classes)
  - [Exceptions](#exceptions)
@@ -173,26 +174,33 @@ ___
 
  To utilize the `DeepWEBS` feature, you must first create an instance of the `DeepWEBS` . This is designed to be used independently of the `WEBS` , offering a focused approach to web searches.
 
+ ### Point to remember before using `DeepWEBS`
+ As `DeepWEBS` is designed to extract relevant information directly from webpages and Search engine, It extracts html from webpages and saves them to folder named files in `DeepWEBS` that can be found at `C:\Users\Username\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\DeepWEBS`
+
  ### Usage Example
 
  Here's a basic example of how to use the `DeepWEBS` :
  ```python
  from webscout import DeepWEBS
 
-
  def perform_web_search(query):
-     D = DeepWEBS()
-     item = D.DeepSearch(
-         queries=[query], # Query to search
-         result_num=5, # Number of search results
-         safe=True, # Enable SafeSearch
-         types=["web"], # Search type: web
+     # Initialize the DeepWEBS class
+     D = DeepWEBS()
+
+     # Set up the search parameters
+     search_params = D.DeepSearch(
+         queries=[query], # Query to search
+         result_num=5, # Number of search results
+         safe=True, # Enable SafeSearch
+         types=["web"], # Search type: web
          extract_webpage=True, # True for extracting webpages
          overwrite_query_html=False,
          overwrite_webpage_html=False,
      )
-     results = D.queries_to_search_results(item)
-
+
+     # Execute the search and retrieve results
+     results = D.queries_to_search_results(search_params)
+
      return results
 
  def print_search_results(results):
@@ -209,8 +217,13 @@ def print_search_results(results):
          print("No search results found.")
 
  def main():
+     # Prompt the user for a search query
      query = input("Enter your search query: ")
+
+     # Perform the web search
      results = perform_web_search(query)
+
+     # Print the search results
      print_search_results(results)
 
  if __name__ == "__main__":
@@ -9,7 +9,7 @@ with open("README.md", encoding="utf-8") as f:
 
  setup(
      name="webscout",
-     version="1.2.2", # Use the version variable from the version.py file
+     version="1.2.3", # Use the version variable from the version.py file
      description="Search for words, documents, images, videos, news, maps and text translation using the Google, DuckDuckGo.com, yep.com, phind.com, you.com, etc Also containes AI models",
      long_description=README,
      long_description_content_type="text/markdown",
@@ -51,13 +51,23 @@ class DeepWEBS:
              query_results_extractor = QueryResultsExtractor()
              if not query.strip():
                  continue
-             query_html_path = google_searcher.search(
-                 query=query,
-                 result_num=item.result_num,
-                 safe=item.safe,
-                 overwrite=item.overwrite_query_html,
-             )
-             query_search_results = query_results_extractor.extract(query_html_path)
+             try:
+                 query_html_path = google_searcher.search(
+                     query=query,
+                     result_num=item.result_num,
+                     safe=item.safe,
+                     overwrite=item.overwrite_query_html,
+                 )
+             except Exception as e:
+                 logger.error(f"Failed to search for query '{query}': {e}")
+                 continue
+
+             try:
+                 query_search_results = query_results_extractor.extract(query_html_path)
+             except Exception as e:
+                 logger.error(f"Failed to extract search results for query '{query}': {e}")
+                 continue
+
              queries_search_results.append(query_search_results)
          logger.note(queries_search_results)
 
@@ -70,17 +80,21 @@ class DeepWEBS:
 
      def extract_webpages(self, queries_search_results, overwrite_webpage_html=False):
          for query_idx, query_search_results in enumerate(queries_search_results):
-             # Fetch webpages with urls
-             batch_webpage_fetcher = BatchWebpageFetcher()
-             urls = [
-                 query_result["url"]
-                 for query_result in query_search_results["query_results"]
-             ]
-             url_and_html_path_list = batch_webpage_fetcher.fetch(
-                 urls,
-                 overwrite=overwrite_webpage_html,
-                 output_parent=query_search_results["query"],
-             )
+             try:
+                 # Fetch webpages with urls
+                 batch_webpage_fetcher = BatchWebpageFetcher()
+                 urls = [
+                     query_result["url"]
+                     for query_result in query_search_results["query_results"]
+                 ]
+                 url_and_html_path_list = batch_webpage_fetcher.fetch(
+                     urls,
+                     overwrite=overwrite_webpage_html,
+                     output_parent=query_search_results["query"],
+                 )
+             except Exception as e:
+                 logger.error(f"Failed to fetch webpages for query '{query_search_results['query']}': {e}")
+                 continue
 
              # Extract webpage contents from htmls
              html_paths = [
@@ -88,9 +102,13 @@ class DeepWEBS:
                  for url_and_html_path in url_and_html_path_list
              ]
              batch_webpage_content_extractor = BatchWebpageContentExtractor()
-             html_path_and_extracted_content_list = (
-                 batch_webpage_content_extractor.extract(html_paths)
-             )
+             try:
+                 html_path_and_extracted_content_list = (
+                     batch_webpage_content_extractor.extract(html_paths)
+                 )
+             except Exception as e:
+                 logger.error(f"Failed to extract webpage contents for query '{query_search_results['query']}': {e}")
+                 continue
 
              # Build the map of url to extracted_content
              html_path_to_url_dict = {
@@ -109,7 +127,7 @@ class DeepWEBS:
                  query_search_results["query_results"]
              ):
                  url = query_result["url"]
-                 extracted_content = url_to_extracted_content_dict[url]
+                 extracted_content = url_to_extracted_content_dict.get(url, "")
                  queries_search_results[query_idx]["query_results"][query_result_idx][
                      "text"
                  ] = extracted_content
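The switch from indexing to `.get(url, "")` matters when a page could not be fetched or extracted and the URL is therefore missing from the map. A one-line illustration (standalone, not package code):

```python
# Sketch: dict.get() returns a default instead of raising KeyError for unfetched URLs.
url_to_extracted_content_dict = {"https://example.com": "page text"}
text = url_to_extracted_content_dict.get("https://unfetched.example", "")  # "" rather than KeyError
```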
@@ -0,0 +1,2 @@
+ __version__ = "1.2.3"
+
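With the version now kept in webscout/version.py, it can be read at runtime (a quick check, assuming the module is importable as webscout.version, per the file list above):

```python
from webscout.version import __version__

print(__version__)  # expected: "1.2.3"
```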
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: webscout
- Version: 1.2.2
+ Version: 1.2.3
  Summary: Search for words, documents, images, videos, news, maps and text translation using the Google, DuckDuckGo.com, yep.com, phind.com, you.com, etc Also containes AI models
  Author: OEvortex
  Author-email: helpingai5@gmail.com
@@ -69,6 +69,7 @@ Also containes AI models that you can use
  - [Regions](#regions)
  - [DeepWEBS: Advanced Web Searches](#deepwebs-advanced-web-searches)
  - [Activating DeepWEBS](#activating-deepwebs)
+ - [Point to remember before using `DeepWEBS`](#point-to-remember-before-using-deepwebs)
  - [Usage Example](#usage-example)
  - [WEBS and AsyncWEBS classes](#webs-and-asyncwebs-classes)
  - [Exceptions](#exceptions)
@@ -224,26 +225,33 @@ ___
 
  To utilize the `DeepWEBS` feature, you must first create an instance of the `DeepWEBS` . This is designed to be used independently of the `WEBS` , offering a focused approach to web searches.
 
+ ### Point to remember before using `DeepWEBS`
+ As `DeepWEBS` is designed to extract relevant information directly from webpages and Search engine, It extracts html from webpages and saves them to folder named files in `DeepWEBS` that can be found at `C:\Users\Username\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\DeepWEBS`
+
  ### Usage Example
 
  Here's a basic example of how to use the `DeepWEBS` :
  ```python
  from webscout import DeepWEBS
 
-
  def perform_web_search(query):
-     D = DeepWEBS()
-     item = D.DeepSearch(
-         queries=[query], # Query to search
-         result_num=5, # Number of search results
-         safe=True, # Enable SafeSearch
-         types=["web"], # Search type: web
+     # Initialize the DeepWEBS class
+     D = DeepWEBS()
+
+     # Set up the search parameters
+     search_params = D.DeepSearch(
+         queries=[query], # Query to search
+         result_num=5, # Number of search results
+         safe=True, # Enable SafeSearch
+         types=["web"], # Search type: web
          extract_webpage=True, # True for extracting webpages
          overwrite_query_html=False,
          overwrite_webpage_html=False,
      )
-     results = D.queries_to_search_results(item)
-
+
+     # Execute the search and retrieve results
+     results = D.queries_to_search_results(search_params)
+
      return results
 
  def print_search_results(results):
@@ -260,8 +268,13 @@ def print_search_results(results):
          print("No search results found.")
 
  def main():
+     # Prompt the user for a search query
      query = input("Enter your search query: ")
+
+     # Perform the web search
      results = perform_web_search(query)
+
+     # Print the search results
      print_search_results(results)
 
  if __name__ == "__main__":
@@ -1,78 +0,0 @@
- from bs4 import BeautifulSoup
- from pathlib import Path
- from DeepWEBS.utilsdw.logger import logger
-
-
- class QueryResultsExtractor:
-     def __init__(self) -> None:
-         self.query_results = []
-         self.related_questions = []
-
-     def load_html(self, html_path):
-         with open(html_path, "r", encoding="utf-8") as f:
-             html = f.read()
-         self.soup = BeautifulSoup(html, "html.parser")
-
-     def extract_query_results(self):
-         self.query = self.soup.find("textarea").text.strip()
-         query_result_elements = self.soup.find_all("div", class_="g")
-         for idx, result in enumerate(query_result_elements):
-             site = result.find("cite").find_previous("span").text.strip()
-             url = result.find("a")["href"]
-             title = result.find("h3").text.strip()
-
-             abstract_element_conditions = [
-                 {"data-sncf": "1"},
-                 {"class_": "ITZIwc"},
-             ]
-             for condition in abstract_element_conditions:
-                 abstract_element = result.find("div", condition)
-                 if abstract_element is not None:
-                     abstract = abstract_element.text.strip()
-                     break
-             else:
-                 abstract = ""
-
-             logger.mesg(
-                 f"{title}\n" f" - {site}\n" f" - {url}\n" f" - {abstract}\n" f"\n"
-             )
-             self.query_results.append(
-                 {
-                     "title": title,
-                     "site": site,
-                     "url": url,
-                     "abstract": abstract,
-                     "index": idx,
-                     "type": "web",
-                 }
-             )
-         logger.success(f"- {len(query_result_elements)} query results")
-
-     def extract_related_questions(self):
-         related_question_elements = self.soup.find_all(
-             "div", class_="related-question-pair"
-         )
-         for question_element in related_question_elements:
-             question = question_element.find("span").text.strip()
-             print(question)
-             self.related_questions.append(question)
-         logger.success(f"- {len(self.related_questions)} related questions")
-
-     def extract(self, html_path):
-         self.load_html(html_path)
-         self.extract_query_results()
-         self.extract_related_questions()
-         self.search_results = {
-             "query": self.query,
-             "query_results": self.query_results,
-             "related_questions": self.related_questions,
-         }
-         return self.search_results
-
-
- if __name__ == "__main__":
-     html_path_root = Path(__file__).parents[1] / "files"
-     html_filename = "python_tutorials"
-     html_path = html_path_root / f"{html_filename}.html"
-     extractor = QueryResultsExtractor()
-     extractor.extract(html_path)
@@ -1,48 +0,0 @@
- import requests
- from pathlib import Path
- from DeepWEBS.utilsdw.enver import enver
- from DeepWEBS.utilsdw.logger import logger
- from DeepWEBS.networks.filepath_converter import QueryToFilepathConverter
- from DeepWEBS.networks.network_configs import REQUESTS_HEADERS
-
-
- class GoogleSearcher:
-     def __init__(self):
-         self.url = "https://www.google.com/search"
-         self.enver = enver
-         self.enver.set_envs(proxies=True)
-         self.filepath_converter = QueryToFilepathConverter()
-
-     def send_request(self, result_num=10, safe=False):
-         self.request_response = requests.get(
-             url=self.url,
-             headers=REQUESTS_HEADERS,
-             params={
-                 "q": self.query,
-                 "num": result_num,
-             },
-             proxies=self.enver.requests_proxies,
-         )
-
-     def save_response(self):
-         if not self.html_path.exists():
-             self.html_path.parent.mkdir(parents=True, exist_ok=True)
-         logger.note(f"Saving to: [{self.html_path}]")
-         with open(self.html_path, "wb") as wf:
-             wf.write(self.request_response.content)
-
-     def search(self, query, result_num=10, safe=False, overwrite=False):
-         self.query = query
-         self.html_path = self.filepath_converter.convert(self.query)
-         logger.note(f"Searching: [{self.query}]")
-         if self.html_path.exists() and not overwrite:
-             logger.success(f"HTML existed: {self.html_path}")
-         else:
-             self.send_request(result_num=result_num, safe=safe)
-             self.save_response()
-         return self.html_path
-
-
- if __name__ == "__main__":
-     searcher = GoogleSearcher()
-     searcher.search("python tutorials")
@@ -1,107 +0,0 @@
- import concurrent.futures
- import requests
- import tldextract
- from pathlib import Path
- from DeepWEBS.utilsdw.enver import enver
- from DeepWEBS.utilsdw.logger import logger
- from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter
- from DeepWEBS.networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
-
-
- class WebpageFetcher:
-     def __init__(self):
-         self.enver = enver
-         self.enver.set_envs(proxies=True)
-         self.filepath_converter = UrlToFilepathConverter()
-
-     def is_ignored_host(self, url):
-         self.host = tldextract.extract(url).registered_domain
-         if self.host in IGNORE_HOSTS:
-             return True
-         else:
-             return False
-
-     def send_request(self):
-         try:
-             self.request_response = requests.get(
-                 url=self.url,
-                 headers=REQUESTS_HEADERS,
-                 proxies=self.enver.requests_proxies,
-                 timeout=15,
-             )
-         except:
-             logger.warn(f"Failed to fetch: [{self.url}]")
-             self.request_response = None
-
-     def save_response(self):
-         if not self.html_path.exists():
-             self.html_path.parent.mkdir(parents=True, exist_ok=True)
-         logger.success(f"Saving to: [{self.html_path}]")
-
-         if self.request_response is None:
-             return
-         else:
-             with open(self.html_path, "wb") as wf:
-                 wf.write(self.request_response.content)
-
-     def fetch(self, url, overwrite=False, output_parent=None):
-         self.url = url
-         logger.note(f"Fetching: [{self.url}]")
-         self.html_path = self.filepath_converter.convert(self.url, parent=output_parent)
-
-         if self.is_ignored_host(self.url):
-             logger.warn(f"Ignore host: [{self.host}]")
-             return self.html_path
-
-         if self.html_path.exists() and not overwrite:
-             logger.success(f"HTML existed: [{self.html_path}]")
-         else:
-             self.send_request()
-             self.save_response()
-         return self.html_path
-
-
- class BatchWebpageFetcher:
-     def __init__(self):
-         self.done_count = 0
-         self.total_count = 0
-         self.url_and_html_path_list = []
-
-     def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
-         webpage_fetcher = WebpageFetcher()
-         html_path = webpage_fetcher.fetch(
-             url=url, overwrite=overwrite, output_parent=output_parent
-         )
-         self.url_and_html_path_list.append({"url": url, "html_path": html_path})
-         self.done_count += 1
-         logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
-
-     def fetch(self, urls, overwrite=False, output_parent=None):
-         self.urls = urls
-         self.total_count = len(self.urls)
-         with concurrent.futures.ThreadPoolExecutor() as executor:
-             futures = [
-                 executor.submit(
-                     self.fecth_single_webpage,
-                     url=url,
-                     overwrite=overwrite,
-                     output_parent=output_parent,
-                 )
-                 for url in urls
-             ]
-
-             for idx, future in enumerate(concurrent.futures.as_completed(futures)):
-                 result = future.result()
-         return self.url_and_html_path_list
-
-
- if __name__ == "__main__":
-     urls = [
-         "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
-         "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
-         "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
-     ]
-     batch_webpage_fetcher = BatchWebpageFetcher()
-     batch_webpage_fetcher.fetch(
-         urls=urls, overwrite=True, output_parent="python tutorials"
-     )
@@ -1,2 +0,0 @@
- __version__ = "1.2.2"
-