webscout 1.2.2__tar.gz → 1.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of webscout has been flagged as potentially problematic.
- webscout-1.2.3/DeepWEBS/documents/query_results_extractor.py +99 -0
- webscout-1.2.3/DeepWEBS/networks/google_searcher.py +52 -0
- webscout-1.2.3/DeepWEBS/networks/webpage_fetcher.py +97 -0
- {webscout-1.2.2 → webscout-1.2.3}/PKG-INFO +23 -10
- {webscout-1.2.2 → webscout-1.2.3}/README.md +22 -9
- {webscout-1.2.2 → webscout-1.2.3}/setup.py +1 -1
- {webscout-1.2.2 → webscout-1.2.3}/webscout/DWEBS.py +40 -22
- webscout-1.2.3/webscout/version.py +2 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/PKG-INFO +23 -10
- webscout-1.2.2/DeepWEBS/documents/query_results_extractor.py +0 -78
- webscout-1.2.2/DeepWEBS/networks/google_searcher.py +0 -48
- webscout-1.2.2/DeepWEBS/networks/webpage_fetcher.py +0 -107
- webscout-1.2.2/webscout/version.py +0 -2
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/__init__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/documents/__init__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/documents/webpage_content_extractor.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/networks/__init__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/networks/filepath_converter.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/networks/network_configs.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/utilsdw/__init__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/utilsdw/enver.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/DeepWEBS/utilsdw/logger.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/LICENSE.md +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/setup.cfg +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/AI.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/AIbase.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/AIutel.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/HelpingAI.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/LLM.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/__init__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/__main__.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/cli.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/exceptions.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/models.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/utils.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/webscout_search.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout/webscout_search_async.py +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/SOURCES.txt +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/dependency_links.txt +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/entry_points.txt +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/requires.txt +0 -0
- {webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/top_level.txt +0 -0
webscout-1.2.3/DeepWEBS/documents/query_results_extractor.py (new file)

@@ -0,0 +1,99 @@
+from bs4 import BeautifulSoup
+from pathlib import Path
+from DeepWEBS.utilsdw.logger import logger
+
+class QueryResultsExtractor:
+    def __init__(self) -> None:
+        self.query_results = []
+        self.related_questions = []
+
+    def load_html(self, html_path):
+        try:
+            with open(html_path, "r", encoding="utf-8") as f:
+                html = f.read()
+            self.soup = BeautifulSoup(html, "html.parser")
+        except FileNotFoundError:
+            logger.error(f"File not found: {html_path}")
+        except Exception as e:
+            logger.error(f"Error loading HTML: {e}")
+
+    def extract_query_results(self):
+        try:
+            self.query = self.soup.find("textarea").text.strip()
+            query_result_elements = self.soup.find_all("div", class_="g")
+            for idx, result in enumerate(query_result_elements):
+                try:
+                    site = result.find("cite").find_previous("span").text.strip()
+                    url = result.find("a")["href"]
+                    title = result.find("h3").text.strip()
+                    abstract_element_conditions = [
+                        {"data-sncf": "1"},
+                        {"class_": "ITZIwc"},
+                    ]
+                    for condition in abstract_element_conditions:
+                        abstract_element = result.find("div", condition)
+                        if abstract_element is not None:
+                            abstract = abstract_element.text.strip()
+                            break
+                    else:
+                        abstract = ""
+                    logger.mesg(
+                        f"{title}\n"
+                        f" - {site}\n"
+                        f" - {url}\n"
+                        f" - {abstract}\n"
+                        f"\n"
+                    )
+                    self.query_results.append(
+                        {
+                            "title": title,
+                            "site": site,
+                            "url": url,
+                            "abstract": abstract,
+                            "index": idx,
+                            "type": "web",
+                        }
+                    )
+                except Exception as e:
+                    logger.error(f"Error extracting query result: {e}")
+            logger.success(f"- {len(query_result_elements)} query results")
+        except Exception as e:
+            logger.error(f"Error extracting query results: {e}")
+
+    def extract_related_questions(self):
+        try:
+            related_question_elements = self.soup.find_all(
+                "div", class_="related-question-pair"
+            )
+            for question_element in related_question_elements:
+                try:
+                    question = question_element.find("span").text.strip()
+                    print(question)
+                    self.related_questions.append(question)
+                except Exception as e:
+                    logger.error(f"Error extracting related question: {e}")
+            logger.success(f"- {len(self.related_questions)} related questions")
+        except Exception as e:
+            logger.error(f"Error extracting related questions: {e}")
+
+    def extract(self, html_path):
+        self.load_html(html_path)
+        self.extract_query_results()
+        self.extract_related_questions()
+        self.search_results = {
+            "query": self.query,
+            "query_results": self.query_results,
+            "related_questions": self.related_questions,
+        }
+        return self.search_results
+
+
+if __name__ == "__main__":
+    html_path_root = Path(__file__).parents[1] / "files"
+    html_filename = "python_tutorials"
+    html_path = html_path_root / f"{html_filename}.html"
+    extractor = QueryResultsExtractor()
+    try:
+        extractor.extract(html_path)
+    except Exception as e:
+        logger.error(f"Error in main function: {e}")
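One detail worth calling out in the new extractor is the for/else fallback used when no abstract element matches: the `else` branch of a `for` loop runs only if the loop finishes without hitting `break`. A minimal standalone sketch of that pattern (illustrative only, not part of the package):

```python
# Sketch of the for/else fallback used in extract_query_results above.
candidates = [None, None, "first non-empty abstract"]

for candidate in candidates:
    if candidate is not None:
        abstract = candidate.strip()
        break
else:
    # Runs only when the loop completed without a `break`.
    abstract = ""

print(abstract)  # -> "first non-empty abstract"
```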
webscout-1.2.3/DeepWEBS/networks/google_searcher.py (new file)

@@ -0,0 +1,52 @@
+import requests
+from pathlib import Path
+from typing import Optional
+import random
+from DeepWEBS.utilsdw.enver import enver
+from DeepWEBS.utilsdw.logger import logger
+from DeepWEBS.networks.filepath_converter import QueryToFilepathConverter
+from DeepWEBS.networks.network_configs import REQUESTS_HEADERS
+
+class GoogleSearcher:
+    def __init__(self):
+        self.url = "https://www.google.com/search"
+        self.enver = enver
+        self.enver.set_envs(proxies=True)
+        self.filepath_converter = QueryToFilepathConverter()
+
+    def send_request(self, query: str, result_num: int = 10, safe: bool = False) -> requests.Response:
+        params = {
+            "q": query,
+            "num": result_num,
+        }
+        response = requests.get(
+            self.url,
+            headers=REQUESTS_HEADERS,
+            params=params,
+            proxies=self.enver.requests_proxies,
+        )
+        response.raise_for_status()  # Raise an exception for non-2xx status codes
+        return response
+
+    def save_response(self, response: requests.Response, html_path: Path) -> None:
+        html_path.parent.mkdir(parents=True, exist_ok=True)
+        logger.note(f"Saving to: [{html_path}]")
+        with html_path.open("wb") as wf:
+            wf.write(response.content)
+
+    def search(self, query: str, result_num: int = 10, safe: bool = False, overwrite: bool = False) -> Path:
+        html_path = self.filepath_converter.convert(query)
+        logger.note(f"Searching: [{query}]")
+
+        if html_path.exists() and not overwrite:
+            logger.success(f"HTML existed: {html_path}")
+        else:
+            response = self.send_request(query, result_num, safe)
+            self.save_response(response, html_path)
+
+        return html_path
+
+if __name__ == "__main__":
+    searcher = GoogleSearcher()
+    html_path = searcher.search("python tutorials")
+    print(f"HTML file saved at: {html_path}")
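A brief usage sketch of the new `GoogleSearcher`, assuming the DeepWEBS package layout shown in this diff (the filepath converter, proxy env helper, and headers config importable as above). Because `search()` writes the results page to disk, a repeated call reuses the saved HTML unless `overwrite=True`:

```python
from DeepWEBS.networks.google_searcher import GoogleSearcher

searcher = GoogleSearcher()

# First call sends the request and saves the raw HTML to disk.
html_path = searcher.search("python tutorials", result_num=10)

# Second call finds the saved file and skips the network round trip.
cached_path = searcher.search("python tutorials", overwrite=False)

# Force a fresh download instead of reusing the cached HTML.
fresh_path = searcher.search("python tutorials", overwrite=True)

print(html_path, cached_path, fresh_path)
```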
webscout-1.2.3/DeepWEBS/networks/webpage_fetcher.py (new file)

@@ -0,0 +1,97 @@
+import concurrent.futures
+import random
+import requests
+import tldextract
+from pathlib import Path
+from typing import List, Tuple, Dict
+
+from DeepWEBS.utilsdw.enver import enver
+from DeepWEBS.utilsdw.logger import logger
+from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter
+from DeepWEBS.networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
+
+class WebpageFetcher:
+    def __init__(self):
+        self.enver = enver
+        self.enver.set_envs(proxies=True)
+        self.filepath_converter = UrlToFilepathConverter()
+
+    def is_ignored_host(self, url: str) -> bool:
+        host = tldextract.extract(url).registered_domain
+        return host in IGNORE_HOSTS
+
+    def send_request(self, url: str) -> requests.Response:
+        try:
+            user_agent = random.choice(REQUESTS_HEADERS["User-Agent"])
+            response = requests.get(
+                url=url,
+                headers={"User-Agent": user_agent},
+                proxies=self.enver.requests_proxies,
+                timeout=15,
+            )
+            response.raise_for_status()
+            return response
+        except requests.exceptions.RequestException as e:
+            logger.warn(f"Failed to fetch: [{url}] | {e}")
+            return None
+
+    def save_response(self, response: requests.Response, html_path: Path) -> None:
+        if response is None:
+            return
+
+        html_path.parent.mkdir(parents=True, exist_ok=True)
+        logger.success(f"Saving to: [{html_path}]")
+        with html_path.open("wb") as wf:
+            wf.write(response.content)
+
+    def fetch(self, url: str, overwrite: bool = False, output_parent: str = None) -> Path:
+        logger.note(f"Fetching: [{url}]")
+        html_path = self.filepath_converter.convert(url, parent=output_parent)
+
+        if self.is_ignored_host(url):
+            logger.warn(f"Ignored host: [{tldextract.extract(url).registered_domain}]")
+            return html_path
+
+        if html_path.exists() and not overwrite:
+            logger.success(f"HTML existed: [{html_path}]")
+        else:
+            response = self.send_request(url)
+            self.save_response(response, html_path)
+
+        return html_path
+
+class BatchWebpageFetcher:
+    def __init__(self):
+        self.done_count = 0
+        self.total_count = 0
+        self.url_and_html_path_list: List[Dict[str, str]] = []
+
+    def fetch_single_webpage(self, url: str, overwrite: bool = False, output_parent: str = None) -> Tuple[str, Path]:
+        webpage_fetcher = WebpageFetcher()
+        html_path = webpage_fetcher.fetch(url, overwrite, output_parent)
+        self.url_and_html_path_list.append({"url": url, "html_path": str(html_path)})
+        self.done_count += 1
+        logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
+        return url, html_path
+
+    def fetch(self, urls: List[str], overwrite: bool = False, output_parent: str = None) -> List[Dict[str, str]]:
+        self.urls = urls
+        self.total_count = len(self.urls)
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(self.fetch_single_webpage, url, overwrite, output_parent)
+                for url in urls
+            ]
+            concurrent.futures.wait(futures)
+
+        return self.url_and_html_path_list
+
+if __name__ == "__main__":
+    urls = [
+        "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
+        "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
+        "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
+    ]
+    batch_webpage_fetcher = BatchWebpageFetcher()
+    batch_webpage_fetcher.fetch(urls=urls, overwrite=True, output_parent="python tutorials")
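Taken together, the three new modules form the fetch-and-extract pipeline that DWEBS.py wires up later in this diff. A hedged sketch of that flow, assuming the same imports, proxy setup, and on-disk HTML caching shown above:

```python
from DeepWEBS.networks.google_searcher import GoogleSearcher
from DeepWEBS.networks.webpage_fetcher import BatchWebpageFetcher
from DeepWEBS.documents.query_results_extractor import QueryResultsExtractor

query = "python tutorials"

# 1. Search Google and cache the results page as HTML.
html_path = GoogleSearcher().search(query, result_num=5)

# 2. Parse the cached results page into structured records.
results = QueryResultsExtractor().extract(html_path)
urls = [item["url"] for item in results["query_results"]]

# 3. Fetch the linked webpages concurrently, grouped under the query name.
pages = BatchWebpageFetcher().fetch(urls, overwrite=False, output_parent=query)
print(f"Fetched {len(pages)} pages for '{query}'")
```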
{webscout-1.2.2 → webscout-1.2.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: webscout
-Version: 1.2.2
+Version: 1.2.3
 Summary: Search for words, documents, images, videos, news, maps and text translation using the Google, DuckDuckGo.com, yep.com, phind.com, you.com, etc Also containes AI models
 Author: OEvortex
 Author-email: helpingai5@gmail.com
@@ -69,6 +69,7 @@ Also containes AI models that you can use
 - [Regions](#regions)
 - [DeepWEBS: Advanced Web Searches](#deepwebs-advanced-web-searches)
 - [Activating DeepWEBS](#activating-deepwebs)
+- [Point to remember before using `DeepWEBS`](#point-to-remember-before-using-deepwebs)
 - [Usage Example](#usage-example)
 - [WEBS and AsyncWEBS classes](#webs-and-asyncwebs-classes)
 - [Exceptions](#exceptions)
@@ -224,26 +225,33 @@ ___
 
 To utilize the `DeepWEBS` feature, you must first create an instance of the `DeepWEBS` . This is designed to be used independently of the `WEBS` , offering a focused approach to web searches.
 
+### Point to remember before using `DeepWEBS`
+As `DeepWEBS` is designed to extract relevant information directly from webpages and Search engine, It extracts html from webpages and saves them to folder named files in `DeepWEBS` that can be found at `C:\Users\Username\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\DeepWEBS`
+
 ### Usage Example
 
 Here's a basic example of how to use the `DeepWEBS` :
 ```python
 from webscout import DeepWEBS
 
-
 def perform_web_search(query):
-
-
-
-
-
-
+    # Initialize the DeepWEBS class
+    D = DeepWEBS()
+
+    # Set up the search parameters
+    search_params = D.DeepSearch(
+        queries=[query], # Query to search
+        result_num=5, # Number of search results
+        safe=True, # Enable SafeSearch
+        types=["web"], # Search type: web
         extract_webpage=True, # True for extracting webpages
         overwrite_query_html=False,
         overwrite_webpage_html=False,
     )
-
-
+
+    # Execute the search and retrieve results
+    results = D.queries_to_search_results(search_params)
+
     return results
 
 def print_search_results(results):
@@ -260,8 +268,13 @@ def print_search_results(results):
         print("No search results found.")
 
 def main():
+    # Prompt the user for a search query
     query = input("Enter your search query: ")
+
+    # Perform the web search
     results = perform_web_search(query)
+
+    # Print the search results
     print_search_results(results)
 
 if __name__ == "__main__":
{webscout-1.2.2 → webscout-1.2.3}/README.md

@@ -18,6 +18,7 @@ Also containes AI models that you can use
 - [Regions](#regions)
 - [DeepWEBS: Advanced Web Searches](#deepwebs-advanced-web-searches)
 - [Activating DeepWEBS](#activating-deepwebs)
+- [Point to remember before using `DeepWEBS`](#point-to-remember-before-using-deepwebs)
 - [Usage Example](#usage-example)
 - [WEBS and AsyncWEBS classes](#webs-and-asyncwebs-classes)
 - [Exceptions](#exceptions)
@@ -173,26 +174,33 @@ ___
 
 To utilize the `DeepWEBS` feature, you must first create an instance of the `DeepWEBS` . This is designed to be used independently of the `WEBS` , offering a focused approach to web searches.
 
+### Point to remember before using `DeepWEBS`
+As `DeepWEBS` is designed to extract relevant information directly from webpages and Search engine, It extracts html from webpages and saves them to folder named files in `DeepWEBS` that can be found at `C:\Users\Username\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\DeepWEBS`
+
 ### Usage Example
 
 Here's a basic example of how to use the `DeepWEBS` :
 ```python
 from webscout import DeepWEBS
 
-
 def perform_web_search(query):
-
-
-
-
-
-
+    # Initialize the DeepWEBS class
+    D = DeepWEBS()
+
+    # Set up the search parameters
+    search_params = D.DeepSearch(
+        queries=[query], # Query to search
+        result_num=5, # Number of search results
+        safe=True, # Enable SafeSearch
+        types=["web"], # Search type: web
         extract_webpage=True, # True for extracting webpages
         overwrite_query_html=False,
         overwrite_webpage_html=False,
     )
-
-
+
+    # Execute the search and retrieve results
+    results = D.queries_to_search_results(search_params)
+
     return results
 
 def print_search_results(results):
@@ -209,8 +217,13 @@ def print_search_results(results):
         print("No search results found.")
 
 def main():
+    # Prompt the user for a search query
     query = input("Enter your search query: ")
+
+    # Perform the web search
     results = perform_web_search(query)
+
+    # Print the search results
     print_search_results(results)
 
 if __name__ == "__main__":
{webscout-1.2.2 → webscout-1.2.3}/setup.py

@@ -9,7 +9,7 @@ with open("README.md", encoding="utf-8") as f:
 
 setup(
     name="webscout",
-    version="1.2.2",
+    version="1.2.3",  # Use the version variable from the version.py file
     description="Search for words, documents, images, videos, news, maps and text translation using the Google, DuckDuckGo.com, yep.com, phind.com, you.com, etc Also containes AI models",
     long_description=README,
     long_description_content_type="text/markdown",
{webscout-1.2.2 → webscout-1.2.3}/webscout/DWEBS.py

@@ -51,13 +51,23 @@ class DeepWEBS:
             query_results_extractor = QueryResultsExtractor()
             if not query.strip():
                 continue
-
-
-
-
-
-
-
+            try:
+                query_html_path = google_searcher.search(
+                    query=query,
+                    result_num=item.result_num,
+                    safe=item.safe,
+                    overwrite=item.overwrite_query_html,
+                )
+            except Exception as e:
+                logger.error(f"Failed to search for query '{query}': {e}")
+                continue
+
+            try:
+                query_search_results = query_results_extractor.extract(query_html_path)
+            except Exception as e:
+                logger.error(f"Failed to extract search results for query '{query}': {e}")
+                continue
+
             queries_search_results.append(query_search_results)
         logger.note(queries_search_results)
 
@@ -70,17 +80,21 @@ class DeepWEBS:
 
     def extract_webpages(self, queries_search_results, overwrite_webpage_html=False):
         for query_idx, query_search_results in enumerate(queries_search_results):
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                # Fetch webpages with urls
+                batch_webpage_fetcher = BatchWebpageFetcher()
+                urls = [
+                    query_result["url"]
+                    for query_result in query_search_results["query_results"]
+                ]
+                url_and_html_path_list = batch_webpage_fetcher.fetch(
+                    urls,
+                    overwrite=overwrite_webpage_html,
+                    output_parent=query_search_results["query"],
+                )
+            except Exception as e:
+                logger.error(f"Failed to fetch webpages for query '{query_search_results['query']}': {e}")
+                continue
 
             # Extract webpage contents from htmls
             html_paths = [
@@ -88,9 +102,13 @@ class DeepWEBS:
                 for url_and_html_path in url_and_html_path_list
             ]
             batch_webpage_content_extractor = BatchWebpageContentExtractor()
-
-
-
+            try:
+                html_path_and_extracted_content_list = (
+                    batch_webpage_content_extractor.extract(html_paths)
+                )
+            except Exception as e:
+                logger.error(f"Failed to extract webpage contents for query '{query_search_results['query']}': {e}")
+                continue
 
             # Build the map of url to extracted_content
             html_path_to_url_dict = {
@@ -109,7 +127,7 @@ class DeepWEBS:
                 query_search_results["query_results"]
             ):
                 url = query_result["url"]
-                extracted_content = url_to_extracted_content_dict
+                extracted_content = url_to_extracted_content_dict.get(url, "")
                 queries_search_results[query_idx]["query_results"][query_result_idx][
                     "text"
                 ] = extracted_content
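The DWEBS.py change follows one pattern throughout: each per-query step is wrapped in its own try/except with `continue`, so a single failing query is logged and skipped rather than aborting the whole batch. A generic, runnable sketch of that pattern (illustrative only; `search` and `extract` below are placeholders, not the package's functions):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("dwebs-sketch")

def search(query: str) -> str:
    # Placeholder standing in for GoogleSearcher().search(...)
    if not query.strip():
        raise ValueError("empty query")
    return f"{query}.html"

def extract(html_path: str) -> dict:
    # Placeholder standing in for QueryResultsExtractor().extract(...)
    return {"query_html": html_path, "query_results": []}

def process_queries(queries):
    results = []
    for query in queries:
        # Each step is isolated: a failing query is logged and skipped,
        # so it cannot abort the remaining queries in the batch.
        try:
            html_path = search(query)
        except Exception as e:
            logger.error("Search failed for %r: %s", query, e)
            continue
        try:
            results.append(extract(html_path))
        except Exception as e:
            logger.error("Extraction failed for %r: %s", query, e)
            continue
    return results

print(process_queries(["python tutorials", "   ", "bs4 docs"]))
```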
{webscout-1.2.2 → webscout-1.2.3}/webscout.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: webscout
-Version: 1.2.2
+Version: 1.2.3
 Summary: Search for words, documents, images, videos, news, maps and text translation using the Google, DuckDuckGo.com, yep.com, phind.com, you.com, etc Also containes AI models
 Author: OEvortex
 Author-email: helpingai5@gmail.com
@@ -69,6 +69,7 @@ Also containes AI models that you can use
 - [Regions](#regions)
 - [DeepWEBS: Advanced Web Searches](#deepwebs-advanced-web-searches)
 - [Activating DeepWEBS](#activating-deepwebs)
+- [Point to remember before using `DeepWEBS`](#point-to-remember-before-using-deepwebs)
 - [Usage Example](#usage-example)
 - [WEBS and AsyncWEBS classes](#webs-and-asyncwebs-classes)
 - [Exceptions](#exceptions)
@@ -224,26 +225,33 @@ ___
 
 To utilize the `DeepWEBS` feature, you must first create an instance of the `DeepWEBS` . This is designed to be used independently of the `WEBS` , offering a focused approach to web searches.
 
+### Point to remember before using `DeepWEBS`
+As `DeepWEBS` is designed to extract relevant information directly from webpages and Search engine, It extracts html from webpages and saves them to folder named files in `DeepWEBS` that can be found at `C:\Users\Username\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\DeepWEBS`
+
 ### Usage Example
 
 Here's a basic example of how to use the `DeepWEBS` :
 ```python
 from webscout import DeepWEBS
 
-
 def perform_web_search(query):
-
-
-
-
-
-
+    # Initialize the DeepWEBS class
+    D = DeepWEBS()
+
+    # Set up the search parameters
+    search_params = D.DeepSearch(
+        queries=[query], # Query to search
+        result_num=5, # Number of search results
+        safe=True, # Enable SafeSearch
+        types=["web"], # Search type: web
         extract_webpage=True, # True for extracting webpages
         overwrite_query_html=False,
         overwrite_webpage_html=False,
     )
-
-
+
+    # Execute the search and retrieve results
+    results = D.queries_to_search_results(search_params)
+
     return results
 
 def print_search_results(results):
@@ -260,8 +268,13 @@ def print_search_results(results):
         print("No search results found.")
 
 def main():
+    # Prompt the user for a search query
     query = input("Enter your search query: ")
+
+    # Perform the web search
     results = perform_web_search(query)
+
+    # Print the search results
     print_search_results(results)
 
 if __name__ == "__main__":
webscout-1.2.2/DeepWEBS/documents/query_results_extractor.py (removed)

@@ -1,78 +0,0 @@
-from bs4 import BeautifulSoup
-from pathlib import Path
-from DeepWEBS.utilsdw.logger import logger
-
-
-class QueryResultsExtractor:
-    def __init__(self) -> None:
-        self.query_results = []
-        self.related_questions = []
-
-    def load_html(self, html_path):
-        with open(html_path, "r", encoding="utf-8") as f:
-            html = f.read()
-            self.soup = BeautifulSoup(html, "html.parser")
-
-    def extract_query_results(self):
-        self.query = self.soup.find("textarea").text.strip()
-        query_result_elements = self.soup.find_all("div", class_="g")
-        for idx, result in enumerate(query_result_elements):
-            site = result.find("cite").find_previous("span").text.strip()
-            url = result.find("a")["href"]
-            title = result.find("h3").text.strip()
-
-            abstract_element_conditions = [
-                {"data-sncf": "1"},
-                {"class_": "ITZIwc"},
-            ]
-            for condition in abstract_element_conditions:
-                abstract_element = result.find("div", condition)
-                if abstract_element is not None:
-                    abstract = abstract_element.text.strip()
-                    break
-            else:
-                abstract = ""
-
-            logger.mesg(
-                f"{title}\n" f" - {site}\n" f" - {url}\n" f" - {abstract}\n" f"\n"
-            )
-            self.query_results.append(
-                {
-                    "title": title,
-                    "site": site,
-                    "url": url,
-                    "abstract": abstract,
-                    "index": idx,
-                    "type": "web",
-                }
-            )
-        logger.success(f"- {len(query_result_elements)} query results")
-
-    def extract_related_questions(self):
-        related_question_elements = self.soup.find_all(
-            "div", class_="related-question-pair"
-        )
-        for question_element in related_question_elements:
-            question = question_element.find("span").text.strip()
-            print(question)
-            self.related_questions.append(question)
-        logger.success(f"- {len(self.related_questions)} related questions")
-
-    def extract(self, html_path):
-        self.load_html(html_path)
-        self.extract_query_results()
-        self.extract_related_questions()
-        self.search_results = {
-            "query": self.query,
-            "query_results": self.query_results,
-            "related_questions": self.related_questions,
-        }
-        return self.search_results
-
-
-if __name__ == "__main__":
-    html_path_root = Path(__file__).parents[1] / "files"
-    html_filename = "python_tutorials"
-    html_path = html_path_root / f"{html_filename}.html"
-    extractor = QueryResultsExtractor()
-    extractor.extract(html_path)
webscout-1.2.2/DeepWEBS/networks/google_searcher.py (removed)

@@ -1,48 +0,0 @@
-import requests
-from pathlib import Path
-from DeepWEBS.utilsdw.enver import enver
-from DeepWEBS.utilsdw.logger import logger
-from DeepWEBS.networks.filepath_converter import QueryToFilepathConverter
-from DeepWEBS.networks.network_configs import REQUESTS_HEADERS
-
-
-class GoogleSearcher:
-    def __init__(self):
-        self.url = "https://www.google.com/search"
-        self.enver = enver
-        self.enver.set_envs(proxies=True)
-        self.filepath_converter = QueryToFilepathConverter()
-
-    def send_request(self, result_num=10, safe=False):
-        self.request_response = requests.get(
-            url=self.url,
-            headers=REQUESTS_HEADERS,
-            params={
-                "q": self.query,
-                "num": result_num,
-            },
-            proxies=self.enver.requests_proxies,
-        )
-
-    def save_response(self):
-        if not self.html_path.exists():
-            self.html_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.note(f"Saving to: [{self.html_path}]")
-        with open(self.html_path, "wb") as wf:
-            wf.write(self.request_response.content)
-
-    def search(self, query, result_num=10, safe=False, overwrite=False):
-        self.query = query
-        self.html_path = self.filepath_converter.convert(self.query)
-        logger.note(f"Searching: [{self.query}]")
-        if self.html_path.exists() and not overwrite:
-            logger.success(f"HTML existed: {self.html_path}")
-        else:
-            self.send_request(result_num=result_num, safe=safe)
-            self.save_response()
-        return self.html_path
-
-
-if __name__ == "__main__":
-    searcher = GoogleSearcher()
-    searcher.search("python tutorials")
webscout-1.2.2/DeepWEBS/networks/webpage_fetcher.py (removed)

@@ -1,107 +0,0 @@
-import concurrent.futures
-import requests
-import tldextract
-from pathlib import Path
-from DeepWEBS.utilsdw.enver import enver
-from DeepWEBS.utilsdw.logger import logger
-from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter
-from DeepWEBS.networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
-
-
-class WebpageFetcher:
-    def __init__(self):
-        self.enver = enver
-        self.enver.set_envs(proxies=True)
-        self.filepath_converter = UrlToFilepathConverter()
-
-    def is_ignored_host(self, url):
-        self.host = tldextract.extract(url).registered_domain
-        if self.host in IGNORE_HOSTS:
-            return True
-        else:
-            return False
-
-    def send_request(self):
-        try:
-            self.request_response = requests.get(
-                url=self.url,
-                headers=REQUESTS_HEADERS,
-                proxies=self.enver.requests_proxies,
-                timeout=15,
-            )
-        except:
-            logger.warn(f"Failed to fetch: [{self.url}]")
-            self.request_response = None
-
-    def save_response(self):
-        if not self.html_path.exists():
-            self.html_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.success(f"Saving to: [{self.html_path}]")
-
-        if self.request_response is None:
-            return
-        else:
-            with open(self.html_path, "wb") as wf:
-                wf.write(self.request_response.content)
-
-    def fetch(self, url, overwrite=False, output_parent=None):
-        self.url = url
-        logger.note(f"Fetching: [{self.url}]")
-        self.html_path = self.filepath_converter.convert(self.url, parent=output_parent)
-
-        if self.is_ignored_host(self.url):
-            logger.warn(f"Ignore host: [{self.host}]")
-            return self.html_path
-
-        if self.html_path.exists() and not overwrite:
-            logger.success(f"HTML existed: [{self.html_path}]")
-        else:
-            self.send_request()
-            self.save_response()
-        return self.html_path
-
-
-class BatchWebpageFetcher:
-    def __init__(self):
-        self.done_count = 0
-        self.total_count = 0
-        self.url_and_html_path_list = []
-
-    def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
-        webpage_fetcher = WebpageFetcher()
-        html_path = webpage_fetcher.fetch(
-            url=url, overwrite=overwrite, output_parent=output_parent
-        )
-        self.url_and_html_path_list.append({"url": url, "html_path": html_path})
-        self.done_count += 1
-        logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
-
-    def fetch(self, urls, overwrite=False, output_parent=None):
-        self.urls = urls
-        self.total_count = len(self.urls)
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(
-                    self.fecth_single_webpage,
-                    url=url,
-                    overwrite=overwrite,
-                    output_parent=output_parent,
-                )
-                for url in urls
-            ]
-
-            for idx, future in enumerate(concurrent.futures.as_completed(futures)):
-                result = future.result()
-        return self.url_and_html_path_list
-
-
-if __name__ == "__main__":
-    urls = [
-        "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
-        "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
-        "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
-    ]
-    batch_webpage_fetcher = BatchWebpageFetcher()
-    batch_webpage_fetcher.fetch(
-        urls=urls, overwrite=True, output_parent="python tutorials"
-    )