webscout 1.2.1__tar.gz → 1.2.3__tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release.

This version of webscout might be problematic.

Files changed (39)
  1. webscout-1.2.3/DeepWEBS/__init__.py +0 -0
  2. webscout-1.2.3/DeepWEBS/documents/__init__.py +0 -0
  3. webscout-1.2.3/DeepWEBS/documents/query_results_extractor.py +99 -0
  4. webscout-1.2.3/DeepWEBS/documents/webpage_content_extractor.py +145 -0
  5. webscout-1.2.3/DeepWEBS/networks/__init__.py +0 -0
  6. webscout-1.2.3/DeepWEBS/networks/filepath_converter.py +109 -0
  7. webscout-1.2.3/DeepWEBS/networks/google_searcher.py +52 -0
  8. webscout-1.2.3/DeepWEBS/networks/network_configs.py +30 -0
  9. webscout-1.2.3/DeepWEBS/networks/webpage_fetcher.py +97 -0
  10. webscout-1.2.3/DeepWEBS/utilsdw/__init__.py +0 -0
  11. webscout-1.2.3/DeepWEBS/utilsdw/enver.py +60 -0
  12. webscout-1.2.3/DeepWEBS/utilsdw/logger.py +269 -0
  13. {webscout-1.2.1/webscout.egg-info → webscout-1.2.3}/PKG-INFO +81 -166
  14. {webscout-1.2.1 → webscout-1.2.3}/README.md +71 -164
  15. {webscout-1.2.1 → webscout-1.2.3}/setup.py +10 -2
  16. webscout-1.2.3/webscout/DWEBS.py +197 -0
  17. {webscout-1.2.1 → webscout-1.2.3}/webscout/__init__.py +1 -1
  18. webscout-1.2.3/webscout/version.py +2 -0
  19. {webscout-1.2.1 → webscout-1.2.3/webscout.egg-info}/PKG-INFO +81 -166
  20. {webscout-1.2.1 → webscout-1.2.3}/webscout.egg-info/SOURCES.txt +13 -0
  21. {webscout-1.2.1 → webscout-1.2.3}/webscout.egg-info/requires.txt +8 -0
  22. {webscout-1.2.1 → webscout-1.2.3}/webscout.egg-info/top_level.txt +1 -0
  23. webscout-1.2.1/webscout/version.py +0 -2
  24. {webscout-1.2.1 → webscout-1.2.3}/LICENSE.md +0 -0
  25. {webscout-1.2.1 → webscout-1.2.3}/setup.cfg +0 -0
  26. {webscout-1.2.1 → webscout-1.2.3}/webscout/AI.py +0 -0
  27. {webscout-1.2.1 → webscout-1.2.3}/webscout/AIbase.py +0 -0
  28. {webscout-1.2.1 → webscout-1.2.3}/webscout/AIutel.py +0 -0
  29. {webscout-1.2.1 → webscout-1.2.3}/webscout/HelpingAI.py +0 -0
  30. {webscout-1.2.1 → webscout-1.2.3}/webscout/LLM.py +0 -0
  31. {webscout-1.2.1 → webscout-1.2.3}/webscout/__main__.py +0 -0
  32. {webscout-1.2.1 → webscout-1.2.3}/webscout/cli.py +0 -0
  33. {webscout-1.2.1 → webscout-1.2.3}/webscout/exceptions.py +0 -0
  34. {webscout-1.2.1 → webscout-1.2.3}/webscout/models.py +0 -0
  35. {webscout-1.2.1 → webscout-1.2.3}/webscout/utils.py +0 -0
  36. {webscout-1.2.1 → webscout-1.2.3}/webscout/webscout_search.py +0 -0
  37. {webscout-1.2.1 → webscout-1.2.3}/webscout/webscout_search_async.py +0 -0
  38. {webscout-1.2.1 → webscout-1.2.3}/webscout.egg-info/dependency_links.txt +0 -0
  39. {webscout-1.2.1 → webscout-1.2.3}/webscout.egg-info/entry_points.txt +0 -0
webscout-1.2.3/DeepWEBS/__init__.py: File without changes
webscout-1.2.3/DeepWEBS/documents/__init__.py: File without changes
webscout-1.2.3/DeepWEBS/documents/query_results_extractor.py
@@ -0,0 +1,99 @@
+ from bs4 import BeautifulSoup
+ from pathlib import Path
+ from DeepWEBS.utilsdw.logger import logger
+
+ class QueryResultsExtractor:
+     def __init__(self) -> None:
+         self.query_results = []
+         self.related_questions = []
+
+     def load_html(self, html_path):
+         try:
+             with open(html_path, "r", encoding="utf-8") as f:
+                 html = f.read()
+             self.soup = BeautifulSoup(html, "html.parser")
+         except FileNotFoundError:
+             logger.error(f"File not found: {html_path}")
+         except Exception as e:
+             logger.error(f"Error loading HTML: {e}")
+
+     def extract_query_results(self):
+         try:
+             self.query = self.soup.find("textarea").text.strip()
+             query_result_elements = self.soup.find_all("div", class_="g")
+             for idx, result in enumerate(query_result_elements):
+                 try:
+                     site = result.find("cite").find_previous("span").text.strip()
+                     url = result.find("a")["href"]
+                     title = result.find("h3").text.strip()
+                     abstract_element_conditions = [
+                         {"data-sncf": "1"},
+                         {"class_": "ITZIwc"},
+                     ]
+                     for condition in abstract_element_conditions:
+                         abstract_element = result.find("div", condition)
+                         if abstract_element is not None:
+                             abstract = abstract_element.text.strip()
+                             break
+                     else:
+                         abstract = ""
+                     logger.mesg(
+                         f"{title}\n"
+                         f" - {site}\n"
+                         f" - {url}\n"
+                         f" - {abstract}\n"
+                         f"\n"
+                     )
+                     self.query_results.append(
+                         {
+                             "title": title,
+                             "site": site,
+                             "url": url,
+                             "abstract": abstract,
+                             "index": idx,
+                             "type": "web",
+                         }
+                     )
+                 except Exception as e:
+                     logger.error(f"Error extracting query result: {e}")
+             logger.success(f"- {len(query_result_elements)} query results")
+         except Exception as e:
+             logger.error(f"Error extracting query results: {e}")
+
+     def extract_related_questions(self):
+         try:
+             related_question_elements = self.soup.find_all(
+                 "div", class_="related-question-pair"
+             )
+             for question_element in related_question_elements:
+                 try:
+                     question = question_element.find("span").text.strip()
+                     print(question)
+                     self.related_questions.append(question)
+                 except Exception as e:
+                     logger.error(f"Error extracting related question: {e}")
+             logger.success(f"- {len(self.related_questions)} related questions")
+         except Exception as e:
+             logger.error(f"Error extracting related questions: {e}")
+
+     def extract(self, html_path):
+         self.load_html(html_path)
+         self.extract_query_results()
+         self.extract_related_questions()
+         self.search_results = {
+             "query": self.query,
+             "query_results": self.query_results,
+             "related_questions": self.related_questions,
+         }
+         return self.search_results
+
+
+ if __name__ == "__main__":
+     html_path_root = Path(__file__).parents[1] / "files"
+     html_filename = "python_tutorials"
+     html_path = html_path_root / f"{html_filename}.html"
+     extractor = QueryResultsExtractor()
+     try:
+         extractor.extract(html_path)
+     except Exception as e:
+         logger.error(f"Error in main function: {e}")
webscout-1.2.3/DeepWEBS/documents/webpage_content_extractor.py
@@ -0,0 +1,145 @@
+ import concurrent.futures
+ import re
+ from pathlib import Path
+ from pprint import pprint
+ from bs4 import BeautifulSoup
+ from tiktoken import get_encoding as tiktoken_get_encoding
+ from DeepWEBS.utilsdw.logger import logger
+ from markdownify import markdownify
+ from DeepWEBS.networks.network_configs import IGNORE_TAGS, IGNORE_CLASSES
+ from termcolor import colored
+
+
+ class WebpageContentExtractor:
+     def __init__(self):
+         self.tokenizer = tiktoken_get_encoding("cl100k_base")
+
+     def count_tokens(self, text):
+         tokens = self.tokenizer.encode(text)
+         token_count = len(tokens)
+         return token_count
+
+     def html_to_markdown(self, html_str, ignore_links=True):
+         if ignore_links:
+             markdown_str = markdownify(html_str, strip="a")
+         else:
+             markdown_str = markdownify(html_str)
+         markdown_str = re.sub(r"\n{3,}", "\n\n", markdown_str)
+
+         self.markdown_token_count = self.count_tokens(markdown_str)
+         logger.mesg(f'- Tokens: {colored(self.markdown_token_count,"light_green")}')
+
+         self.markdown_str = markdown_str
+
+         return self.markdown_str
+
+     def remove_elements_from_html(self, html_str):
+         soup = BeautifulSoup(html_str, "html.parser")
+         ignore_classes_with_parentheses = [f"({word})" for word in IGNORE_CLASSES]
+         ignore_classes_pattern = f'{"|".join(ignore_classes_with_parentheses)}'
+         removed_element_counts = 0
+         for element in soup.find_all():
+             class_str = ""
+             id_str = ""
+             try:
+                 class_attr = element.get("class", [])
+                 if class_attr:
+                     class_str = " ".join(list(class_attr))
+                 if id_str:
+                     class_str = f"{class_str} {id_str}"
+             except:
+                 pass
+
+             try:
+                 id_str = element.get("id", "")
+             except:
+                 pass
+
+             if (
+                 (not element.text.strip())
+                 or (element.name in IGNORE_TAGS)
+                 or (re.search(ignore_classes_pattern, class_str, flags=re.IGNORECASE))
+                 or (re.search(ignore_classes_pattern, id_str, flags=re.IGNORECASE))
+             ):
+                 element.decompose()
+                 removed_element_counts += 1
+
+         logger.mesg(
+             f"- Elements: "
+             f'{colored(len(soup.find_all()),"light_green")} / {colored(removed_element_counts,"light_red")}'
+         )
+
+         html_str = str(soup)
+         self.html_str = html_str
+
+         return self.html_str
+
+     def extract(self, html_path):
+         logger.note(f"Extracting content from: {html_path}")
+
+         if not Path(html_path).exists():
+             logger.warn(f"File not found: {html_path}")
+             return ""
+
+         encodings = ["utf-8", "latin-1"]
+         for encoding in encodings:
+             try:
+                 with open(html_path, "r", encoding=encoding, errors="ignore") as rf:
+                     html_str = rf.read()
+                 break
+             except UnicodeDecodeError:
+                 pass
+         else:
+             logger.warn(f"No matching encodings: {html_path}")
+             return ""
+
+         html_str = self.remove_elements_from_html(html_str)
+         markdown_str = self.html_to_markdown(html_str)
+         return markdown_str
+
+
+ class BatchWebpageContentExtractor:
+     def __init__(self) -> None:
+         self.html_path_and_extracted_content_list = []
+         self.done_count = 0
+
+     def extract_single_html(self, html_path):
+         webpage_content_extractor = WebpageContentExtractor()
+         extracted_content = webpage_content_extractor.extract(html_path)
+         self.html_path_and_extracted_content_list.append(
+             {"html_path": html_path, "extracted_content": extracted_content}
+         )
+         self.done_count += 1
+         logger.success(
+             f"> [{self.done_count}/{self.total_count}] Extracted: {html_path}"
+         )
+
+     def extract(self, html_paths):
+         self.html_path = html_paths
+         self.total_count = len(self.html_path)
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             futures = [
+                 executor.submit(self.extract_single_html, html_path)
+                 for html_path in self.html_path
+             ]
+             for idx, future in enumerate(concurrent.futures.as_completed(futures)):
+                 result = future.result()
+
+         return self.html_path_and_extracted_content_list
+
+
+ if __name__ == "__main__":
+     html_root = Path(__file__).parents[1] / "files" / "urls" / "python tutorials"
+     html_paths = [
+         html_root / html_filename
+         for html_filename in [
+             "docs.python.org_zh-cn_3_tutorial_interpreter.html",
+             "stackoverflow.com_questions_295135_turn-a-string-into-a-valid-filename.html",
+             "www.liaoxuefeng.com_wiki_1016959663602400_1017495723838528.html",
+         ]
+     ]
+     batch_webpage_content_extractor = BatchWebpageContentExtractor()
+     html_path_and_extracted_content_list = batch_webpage_content_extractor.extract(
+         html_paths
+     )
+     # pprint(html_path_and_extracted_content_list)
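WebpageContentExtractor strips boilerplate tags, classes, and ids (per network_configs further down), converts the remaining HTML to Markdown with markdownify, and counts tokens with tiktoken's cl100k_base encoding. A minimal sketch, assuming a locally saved HTML file (the path is hypothetical):

# Sketch only: clean one saved HTML file and convert it to Markdown.
from DeepWEBS.documents.webpage_content_extractor import WebpageContentExtractor

extractor = WebpageContentExtractor()
markdown = extractor.extract("saved_page.html")  # hypothetical path; returns "" if the file is missing
print(extractor.markdown_token_count)            # token count of the Markdown output
print(markdown[:200])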
webscout-1.2.3/DeepWEBS/networks/__init__.py: File without changes
webscout-1.2.3/DeepWEBS/networks/filepath_converter.py
@@ -0,0 +1,109 @@
+ import platform
+ import re
+ from pathlib import Path
+ from urllib.parse import quote, unquote
+
+
+ # What characters are forbidden in Windows and Linux directory names?
+ # https://stackoverflow.com/questions/1976007/what-characters-are-forbidden-in-windows-and-linux-directory-names
+
+ INVALID_FILE_PATH_CHARS = [
+     "\\",
+     "/",
+     ":",
+     "*",
+     "?",
+     '"',
+     "<",
+     ">",
+     "|",
+     "\n",
+     "\t",
+     "\r",
+     *[chr(i) for i in range(32)],
+ ]
+
+ WINDOWS_INVALID_FILE_PATH_NAMES = [
+     "con",
+     "prn",
+     "aux",
+     "nul",
+     *[f"com{i+1}" for i in range(10)],
+     *[f"lpt{i+1}" for i in range(10)],
+ ]
+
+
+ class FilepathConverter:
+     def __init__(self, parent: str = None):
+         self.output_root = Path(__file__).parents[1] / "files"
+         self.parent = parent
+
+     def preprocess(self, input_string):
+         return input_string
+
+     def validate(self, input_string):
+         if not input_string:
+             return input_string
+         filename = input_string
+         for char in INVALID_FILE_PATH_CHARS:
+             filename = filename.replace(char, "_")
+         if platform.system() == "Windows":
+             filename_base = filename.split(".")[0]
+             if filename_base.lower() in WINDOWS_INVALID_FILE_PATH_NAMES:
+                 filename_base = filename_base + "_"
+             filename = ".".join([filename_base, *filename.split(".")[1:]])
+         return filename
+
+     def append_extension(self, filename, accept_exts=[".html", ".htm"], ext=".html"):
+         if ext:
+             filename_ext = "." + filename.split(".")[-1]
+             if filename_ext.lower() not in accept_exts:
+                 filename += ext
+         return filename
+
+     def convert(self, input_string, parent=None):
+         filename = self.preprocess(input_string)
+         filename = self.validate(filename)
+         filename = self.append_extension(filename)
+
+         parent = parent or self.parent
+         parent = self.validate(parent)
+         if parent:
+             filepath = self.output_root / parent / filename
+         else:
+             filepath = self.output_root / filename
+
+         self.filename = filename
+         self.filepath = filepath
+
+         return self.filepath
+
+
+ class UrlToFilepathConverter(FilepathConverter):
+     def __init__(self, parent: str = None):
+         super().__init__(parent)
+         self.output_root = self.output_root / "urls"
+
+     def preprocess(self, url):
+         filename = unquote(url.split("//")[1])
+         return filename
+
+
+ class QueryToFilepathConverter(FilepathConverter):
+     def __init__(self, parent: str = None):
+         super().__init__(parent)
+         self.output_root = self.output_root / "queries"
+
+
+ if __name__ == "__main__":
+     query = "python"
+     query_converter = QueryToFilepathConverter()
+     print(query_converter.convert(query))
+
+     # url = "https://trafilatura.readthedocs.io/en/latest/quickstart.html"
+     url = (
+         "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename"
+     )
+
+     url_converter = UrlToFilepathConverter(parent=query)
+     print(url_converter.convert(url))
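Worked through preprocess(), validate(), and append_extension(), the two conversions in the __main__ block above resolve to paths of roughly this shape (a comment sketch; the root depends on where the package is installed):

# Paths produced by the __main__ block above, relative to DeepWEBS/files:
#   "python"  ->  queries/python.html
#   "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename"
#       with parent="python"
#       ->  urls/python/stackoverflow.com_questions_295135_turn-a-string-into-a-valid-filename.html
# The URL scheme is dropped, "/" and other invalid filename characters become "_",
# and ".html" is appended when the name does not already end in .html/.htm.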
webscout-1.2.3/DeepWEBS/networks/google_searcher.py
@@ -0,0 +1,52 @@
+ import requests
+ from pathlib import Path
+ from typing import Optional
+ import random
+ from DeepWEBS.utilsdw.enver import enver
+ from DeepWEBS.utilsdw.logger import logger
+ from DeepWEBS.networks.filepath_converter import QueryToFilepathConverter
+ from DeepWEBS.networks.network_configs import REQUESTS_HEADERS
+
+ class GoogleSearcher:
+     def __init__(self):
+         self.url = "https://www.google.com/search"
+         self.enver = enver
+         self.enver.set_envs(proxies=True)
+         self.filepath_converter = QueryToFilepathConverter()
+
+     def send_request(self, query: str, result_num: int = 10, safe: bool = False) -> requests.Response:
+         params = {
+             "q": query,
+             "num": result_num,
+         }
+         response = requests.get(
+             self.url,
+             headers=REQUESTS_HEADERS,
+             params=params,
+             proxies=self.enver.requests_proxies,
+         )
+         response.raise_for_status()  # Raise an exception for non-2xx status codes
+         return response
+
+     def save_response(self, response: requests.Response, html_path: Path) -> None:
+         html_path.parent.mkdir(parents=True, exist_ok=True)
+         logger.note(f"Saving to: [{html_path}]")
+         with html_path.open("wb") as wf:
+             wf.write(response.content)
+
+     def search(self, query: str, result_num: int = 10, safe: bool = False, overwrite: bool = False) -> Path:
+         html_path = self.filepath_converter.convert(query)
+         logger.note(f"Searching: [{query}]")
+
+         if html_path.exists() and not overwrite:
+             logger.success(f"HTML existed: {html_path}")
+         else:
+             response = self.send_request(query, result_num, safe)
+             self.save_response(response, html_path)
+
+         return html_path
+
+ if __name__ == "__main__":
+     searcher = GoogleSearcher()
+     html_path = searcher.search("python tutorials")
+     print(f"HTML file saved at: {html_path}")
webscout-1.2.3/DeepWEBS/networks/network_configs.py
@@ -0,0 +1,30 @@
+ IGNORE_TAGS = ["script", "style", "button"]
+ IGNORE_CLASSES = [
+     # common
+     "sidebar",
+     "footer",
+     "related",
+     "comment",
+     "topbar",
+     "offcanvas",
+     "navbar",
+     # 163.com
+     "post_(top)|(side)|(recommends)|(crumb)|(statement)|(next)|(jubao)",
+     "ntes\-.*nav",
+     "nav\-bottom",
+     # wikipedia.org
+     "language\-list",
+     "vector\-(header)|(column)|(sticky\-pinned)|(dropdown\-content)",
+     "navbox",
+     "catlinks",
+ ]
+
+ IGNORE_HOSTS = [
+     "weibo.com",
+     "hymson.com",
+     "yahoo.com",
+ ]
+
+ REQUESTS_HEADERS = {
+     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
+ }
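These constants are consumed by the other new modules: IGNORE_TAGS and IGNORE_CLASSES by WebpageContentExtractor.remove_elements_from_html above (each class entry is wrapped in parentheses and OR-joined into one case-insensitive regex matched against element class and id strings), IGNORE_HOSTS by WebpageFetcher.is_ignored_host below, and REQUESTS_HEADERS by GoogleSearcher and WebpageFetcher. A small sketch of that class filter, mirroring the extractor code:

import re
from DeepWEBS.networks.network_configs import IGNORE_CLASSES

# Same construction as in WebpageContentExtractor.remove_elements_from_html.
ignore_classes_pattern = "|".join(f"({word})" for word in IGNORE_CLASSES)
print(bool(re.search(ignore_classes_pattern, "main-sidebar collapsed", flags=re.IGNORECASE)))  # True ("sidebar")
print(bool(re.search(ignore_classes_pattern, "article-body", flags=re.IGNORECASE)))            # False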
webscout-1.2.3/DeepWEBS/networks/webpage_fetcher.py
@@ -0,0 +1,97 @@
+ import concurrent.futures
+ import random
+ import requests
+ import tldextract
+ from pathlib import Path
+ from typing import List, Tuple, Dict
+
+ from DeepWEBS.utilsdw.enver import enver
+ from DeepWEBS.utilsdw.logger import logger
+ from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter
+ from DeepWEBS.networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
+
+ class WebpageFetcher:
+     def __init__(self):
+         self.enver = enver
+         self.enver.set_envs(proxies=True)
+         self.filepath_converter = UrlToFilepathConverter()
+
+     def is_ignored_host(self, url: str) -> bool:
+         host = tldextract.extract(url).registered_domain
+         return host in IGNORE_HOSTS
+
+     def send_request(self, url: str) -> requests.Response:
+         try:
+             user_agent = random.choice(REQUESTS_HEADERS["User-Agent"])
+             response = requests.get(
+                 url=url,
+                 headers={"User-Agent": user_agent},
+                 proxies=self.enver.requests_proxies,
+                 timeout=15,
+             )
+             response.raise_for_status()
+             return response
+         except requests.exceptions.RequestException as e:
+             logger.warn(f"Failed to fetch: [{url}] | {e}")
+             return None
+
+     def save_response(self, response: requests.Response, html_path: Path) -> None:
+         if response is None:
+             return
+
+         html_path.parent.mkdir(parents=True, exist_ok=True)
+         logger.success(f"Saving to: [{html_path}]")
+         with html_path.open("wb") as wf:
+             wf.write(response.content)
+
+     def fetch(self, url: str, overwrite: bool = False, output_parent: str = None) -> Path:
+         logger.note(f"Fetching: [{url}]")
+         html_path = self.filepath_converter.convert(url, parent=output_parent)
+
+         if self.is_ignored_host(url):
+             logger.warn(f"Ignored host: [{tldextract.extract(url).registered_domain}]")
+             return html_path
+
+         if html_path.exists() and not overwrite:
+             logger.success(f"HTML existed: [{html_path}]")
+         else:
+             response = self.send_request(url)
+             self.save_response(response, html_path)
+
+         return html_path
+
+ class BatchWebpageFetcher:
+     def __init__(self):
+         self.done_count = 0
+         self.total_count = 0
+         self.url_and_html_path_list: List[Dict[str, str]] = []
+
+     def fetch_single_webpage(self, url: str, overwrite: bool = False, output_parent: str = None) -> Tuple[str, Path]:
+         webpage_fetcher = WebpageFetcher()
+         html_path = webpage_fetcher.fetch(url, overwrite, output_parent)
+         self.url_and_html_path_list.append({"url": url, "html_path": str(html_path)})
+         self.done_count += 1
+         logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
+         return url, html_path
+
+     def fetch(self, urls: List[str], overwrite: bool = False, output_parent: str = None) -> List[Dict[str, str]]:
+         self.urls = urls
+         self.total_count = len(self.urls)
+
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             futures = [
+                 executor.submit(self.fetch_single_webpage, url, overwrite, output_parent)
+                 for url in urls
+             ]
+             concurrent.futures.wait(futures)
+
+         return self.url_and_html_path_list
+
+ if __name__ == "__main__":
+     urls = [
+         "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
+         "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
+         "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
+     ]
+     batch_webpage_fetcher = BatchWebpageFetcher()
+     batch_webpage_fetcher.fetch(urls=urls, overwrite=True, output_parent="python tutorials")
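Taken together, the new DeepWEBS modules form a search pipeline: GoogleSearcher saves a results page, QueryResultsExtractor parses it into result dicts, BatchWebpageFetcher downloads the result URLs concurrently, and BatchWebpageContentExtractor converts the saved pages to Markdown. The new webscout/DWEBS.py (+197 lines, not shown in this excerpt) presumably wires these together; the sketch below is one plausible composition built only from the classes shown above, not a copy of DWEBS.py:

# Sketch only: end-to-end flow assembled from the classes in this diff.
from DeepWEBS.networks.google_searcher import GoogleSearcher
from DeepWEBS.documents.query_results_extractor import QueryResultsExtractor
from DeepWEBS.networks.webpage_fetcher import BatchWebpageFetcher
from DeepWEBS.documents.webpage_content_extractor import BatchWebpageContentExtractor

query = "python tutorials"
results_html = GoogleSearcher().search(query, result_num=10)   # save the results page
results = QueryResultsExtractor().extract(results_html)        # parse it into dicts
urls = [r["url"] for r in results["query_results"]]

fetched = BatchWebpageFetcher().fetch(urls, overwrite=False, output_parent=query)
pages = BatchWebpageContentExtractor().extract(
    [item["html_path"] for item in fetched]                    # Markdown per fetched page
)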
webscout-1.2.3/DeepWEBS/utilsdw/__init__.py: File without changes
webscout-1.2.3/DeepWEBS/utilsdw/enver.py
@@ -0,0 +1,60 @@
+ import json
+ import os
+
+ from pathlib import Path
+ from DeepWEBS.utilsdw.logger import logger
+
+
+ class OSEnver:
+     def __init__(self):
+         self.envs_stack = []
+         self.envs = os.environ.copy()
+
+     def store_envs(self):
+         self.envs_stack.append(self.envs)
+
+     def restore_envs(self):
+         self.envs = self.envs_stack.pop()
+
+     def set_envs(self, secrets=True, proxies=None, store_envs=True):
+         # caller_info = inspect.stack()[1]
+         # logger.back(f"OS Envs is set by: {caller_info.filename}")
+
+         if store_envs:
+             self.store_envs()
+
+         if secrets:
+             secrets_path = Path(__file__).parents[1] / "secrets.json"
+             if secrets_path.exists():
+                 with open(secrets_path, "r") as rf:
+                     secrets = json.load(rf)
+             else:
+                 secrets = {}
+
+         if proxies:
+             for proxy_env in ["http_proxy", "https_proxy"]:
+                 if isinstance(proxies, str):
+                     self.envs[proxy_env] = proxies
+                 elif "http_proxy" in secrets.keys():
+                     self.envs[proxy_env] = secrets["http_proxy"]
+                 elif os.getenv("http_proxy"):
+                     self.envs[proxy_env] = os.getenv("http_proxy")
+                 else:
+                     continue
+
+         self.proxy = (
+             self.envs.get("all_proxy")
+             or self.envs.get("http_proxy")
+             or self.envs.get("https_proxy")
+             or None
+         )
+         self.requests_proxies = {
+             "http": self.proxy,
+             "https": self.proxy,
+         }
+
+         if self.proxy:
+             logger.note(f"Using proxy: [{self.proxy}]")
+
+
+ enver = OSEnver()
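The module-level enver instance centralizes proxy configuration for GoogleSearcher and WebpageFetcher: set_envs() reads an optional DeepWEBS/secrets.json and the standard proxy environment variables, then exposes a requests-compatible mapping as enver.requests_proxies. A brief sketch, with a hypothetical local proxy address:

# Sketch only: configure a proxy explicitly ("http://127.0.0.1:7890" is a hypothetical address).
from DeepWEBS.utilsdw.enver import enver

enver.set_envs(proxies="http://127.0.0.1:7890")  # a string is applied to both http_proxy and https_proxy in enver's env copy
print(enver.requests_proxies)  # {"http": ..., "https": ...}; falls back to all_proxy/http_proxy from the environment otherwise
print(enver.proxy)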