webscout-3.3-py3-none-any.whl → webscout-3.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic; see the advisory details on the registry page for more information.

@@ -1,145 +0,0 @@
- import concurrent.futures
- import re
- from pathlib import Path
- from pprint import pprint
- from bs4 import BeautifulSoup
- from tiktoken import get_encoding as tiktoken_get_encoding
- from DeepWEBS.utilsdw.logger import logger
- from markdownify import markdownify
- from DeepWEBS.networks.network_configs import IGNORE_TAGS, IGNORE_CLASSES
- from termcolor import colored
-
-
- class WebpageContentExtractor:
-     def __init__(self):
-         self.tokenizer = tiktoken_get_encoding("cl100k_base")
-
-     def count_tokens(self, text):
-         tokens = self.tokenizer.encode(text)
-         token_count = len(tokens)
-         return token_count
-
-     def html_to_markdown(self, html_str, ignore_links=True):
-         if ignore_links:
-             markdown_str = markdownify(html_str, strip="a")
-         else:
-             markdown_str = markdownify(html_str)
-         markdown_str = re.sub(r"\n{3,}", "\n\n", markdown_str)
-
-         self.markdown_token_count = self.count_tokens(markdown_str)
-         logger.mesg(f'- Tokens: {colored(self.markdown_token_count,"light_green")}')
-
-         self.markdown_str = markdown_str
-
-         return self.markdown_str
-
-     def remove_elements_from_html(self, html_str):
-         soup = BeautifulSoup(html_str, "html.parser")
-         ignore_classes_with_parentheses = [f"({word})" for word in IGNORE_CLASSES]
-         ignore_classes_pattern = f'{"|".join(ignore_classes_with_parentheses)}'
-         removed_element_counts = 0
-         for element in soup.find_all():
-             class_str = ""
-             id_str = ""
-             try:
-                 class_attr = element.get("class", [])
-                 if class_attr:
-                     class_str = " ".join(list(class_attr))
-                 if id_str:
-                     class_str = f"{class_str} {id_str}"
-             except:
-                 pass
-
-             try:
-                 id_str = element.get("id", "")
-             except:
-                 pass
-
-             if (
-                 (not element.text.strip())
-                 or (element.name in IGNORE_TAGS)
-                 or (re.search(ignore_classes_pattern, class_str, flags=re.IGNORECASE))
-                 or (re.search(ignore_classes_pattern, id_str, flags=re.IGNORECASE))
-             ):
-                 element.decompose()
-                 removed_element_counts += 1
-
-         logger.mesg(
-             f"- Elements: "
-             f'{colored(len(soup.find_all()),"light_green")} / {colored(removed_element_counts,"light_red")}'
-         )
-
-         html_str = str(soup)
-         self.html_str = html_str
-
-         return self.html_str
-
-     def extract(self, html_path):
-         logger.note(f"Extracting content from: {html_path}")
-
-         if not Path(html_path).exists():
-             logger.warn(f"File not found: {html_path}")
-             return ""
-
-         encodings = ["utf-8", "latin-1"]
-         for encoding in encodings:
-             try:
-                 with open(html_path, "r", encoding=encoding, errors="ignore") as rf:
-                     html_str = rf.read()
-                 break
-             except UnicodeDecodeError:
-                 pass
-         else:
-             logger.warn(f"No matching encodings: {html_path}")
-             return ""
-
-         html_str = self.remove_elements_from_html(html_str)
-         markdown_str = self.html_to_markdown(html_str)
-         return markdown_str
-
-
- class BatchWebpageContentExtractor:
-     def __init__(self) -> None:
-         self.html_path_and_extracted_content_list = []
-         self.done_count = 0
-
-     def extract_single_html(self, html_path):
-         webpage_content_extractor = WebpageContentExtractor()
-         extracted_content = webpage_content_extractor.extract(html_path)
-         self.html_path_and_extracted_content_list.append(
-             {"html_path": html_path, "extracted_content": extracted_content}
-         )
-         self.done_count += 1
-         logger.success(
-             f"> [{self.done_count}/{self.total_count}] Extracted: {html_path}"
-         )
-
-     def extract(self, html_paths):
-         self.html_path = html_paths
-         self.total_count = len(self.html_path)
-         with concurrent.futures.ThreadPoolExecutor() as executor:
-             futures = [
-                 executor.submit(self.extract_single_html, html_path)
-                 for html_path in self.html_path
-             ]
-             for idx, future in enumerate(concurrent.futures.as_completed(futures)):
-                 result = future.result()
-
-         return self.html_path_and_extracted_content_list
-
-
- if __name__ == "__main__":
-     html_root = Path(__file__).parents[1] / "files" / "urls" / "python tutorials"
-     html_paths = [
-         html_root / html_filename
-         for html_filename in [
-             "docs.python.org_zh-cn_3_tutorial_interpreter.html",
-             "stackoverflow.com_questions_295135_turn-a-string-into-a-valid-filename.html",
-             "www.liaoxuefeng.com_wiki_1016959663602400_1017495723838528.html",
-         ]
-     ]
-     batch_webpage_content_extractor = BatchWebpageContentExtractor()
-     html_path_and_extracted_content_list = batch_webpage_content_extractor.extract(
-         html_paths
-     )
-     # pprint(html_path_and_extracted_content_list)
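A note on `remove_elements_from_html` in the deleted file above: the `if id_str:` branch runs before `id_str` has been read from the element, so the id is never folded into `class_str`; id-based filtering only works because of the separate `re.search(..., id_str, ...)` clause. A minimal, hypothetical sketch of reading both attributes before matching (not part of the package; names and sample HTML are illustrative):

    import re
    from bs4 import BeautifulSoup

    IGNORE_TAGS = ["script", "style", "button"]
    IGNORE_PATTERN = "(sidebar)|(footer)|(navbar)"  # composed as in the code above

    def should_remove(element) -> bool:
        # Read class and id first, so both take part in the same match.
        class_str = " ".join(element.get("class", []) or [])
        id_str = element.get("id", "") or ""
        combined = f"{class_str} {id_str}".strip()
        return (
            not element.text.strip()
            or element.name in IGNORE_TAGS
            or bool(re.search(IGNORE_PATTERN, combined, flags=re.IGNORECASE))
        )

    soup = BeautifulSoup('<div class="footer">x</div><p>keep me</p>', "html.parser")
    for el in list(soup.find_all()):
        if should_remove(el):
            el.decompose()
    print(soup)  # -> <p>keep me</p>
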
File without changes
@@ -1,109 +0,0 @@
- import platform
- import re
- from pathlib import Path
- from urllib.parse import quote, unquote
-
-
- # What characters are forbidden in Windows and Linux directory names?
- # https://stackoverflow.com/questions/1976007/what-characters-are-forbidden-in-windows-and-linux-directory-names
-
- INVALID_FILE_PATH_CHARS = [
-     "\\",
-     "/",
-     ":",
-     "*",
-     "?",
-     '"',
-     "<",
-     ">",
-     "|",
-     "\n",
-     "\t",
-     "\r",
-     *[chr(i) for i in range(32)],
- ]
-
- WINDOWS_INVALID_FILE_PATH_NAMES = [
-     "con",
-     "prn",
-     "aux",
-     "nul",
-     *[f"com{i+1}" for i in range(10)],
-     *[f"lpt{i+1}" for i in range(10)],
- ]
-
-
- class FilepathConverter:
-     def __init__(self, parent: str = None):
-         self.output_root = Path(__file__).parents[1] / "files"
-         self.parent = parent
-
-     def preprocess(self, input_string):
-         return input_string
-
-     def validate(self, input_string):
-         if not input_string:
-             return input_string
-         filename = input_string
-         for char in INVALID_FILE_PATH_CHARS:
-             filename = filename.replace(char, "_")
-         if platform.system() == "Windows":
-             filename_base = filename.split(".")[0]
-             if filename_base.lower() in WINDOWS_INVALID_FILE_PATH_NAMES:
-                 filename_base = filename_base + "_"
-                 filename = ".".join([filename_base, *filename.split(".")[1:]])
-         return filename
-
-     def append_extension(self, filename, accept_exts=[".html", ".htm"], ext=".html"):
-         if ext:
-             filename_ext = "." + filename.split(".")[-1]
-             if filename_ext.lower() not in accept_exts:
-                 filename += ext
-         return filename
-
-     def convert(self, input_string, parent=None):
-         filename = self.preprocess(input_string)
-         filename = self.validate(filename)
-         filename = self.append_extension(filename)
-
-         parent = parent or self.parent
-         parent = self.validate(parent)
-         if parent:
-             filepath = self.output_root / parent / filename
-         else:
-             filepath = self.output_root / filename
-
-         self.filename = filename
-         self.filepath = filepath
-
-         return self.filepath
-
-
- class UrlToFilepathConverter(FilepathConverter):
-     def __init__(self, parent: str = None):
-         super().__init__(parent)
-         self.output_root = self.output_root / "urls"
-
-     def preprocess(self, url):
-         filename = unquote(url.split("//")[1])
-         return filename
-
-
- class QueryToFilepathConverter(FilepathConverter):
-     def __init__(self, parent: str = None):
-         super().__init__(parent)
-         self.output_root = self.output_root / "queries"
-
-
- if __name__ == "__main__":
-     query = "python"
-     query_converter = QueryToFilepathConverter()
-     print(query_converter.convert(query))
-
-     # url = "https://trafilatura.readthedocs.io/en/latest/quickstart.html"
-     url = (
-         "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename"
-     )
-
-     url_converter = UrlToFilepathConverter(parent=query)
-     print(url_converter.convert(url))
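For reference, the converter above maps a URL to a sanitized local filename by dropping the scheme, replacing forbidden characters with underscores, and appending `.html` when the extension is not already accepted. A small usage sketch (the resulting path is shown relative to the module's `files/` root):

    from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter

    converter = UrlToFilepathConverter(parent="python")
    path = converter.convert("https://docs.python.org/3/tutorial/interpreter.html")
    # scheme is stripped, "/" becomes "_", and ".html" is already an accepted extension:
    # .../files/urls/python/docs.python.org_3_tutorial_interpreter.html
    print(path)
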
@@ -1,52 +0,0 @@
- import requests
- from pathlib import Path
- from typing import Optional
- import random
- from DeepWEBS.utilsdw.enver import enver
- from DeepWEBS.utilsdw.logger import logger
- from DeepWEBS.networks.filepath_converter import QueryToFilepathConverter
- from DeepWEBS.networks.network_configs import REQUESTS_HEADERS
-
- class GoogleSearcher:
-     def __init__(self):
-         self.url = "https://www.google.com/search"
-         self.enver = enver
-         self.enver.set_envs(proxies=True)
-         self.filepath_converter = QueryToFilepathConverter()
-
-     def send_request(self, query: str, result_num: int = 10, safe: bool = False) -> requests.Response:
-         params = {
-             "q": query,
-             "num": result_num,
-         }
-         response = requests.get(
-             self.url,
-             headers=REQUESTS_HEADERS,
-             params=params,
-             proxies=self.enver.requests_proxies,
-         )
-         response.raise_for_status()  # Raise an exception for non-2xx status codes
-         return response
-
-     def save_response(self, response: requests.Response, html_path: Path) -> None:
-         html_path.parent.mkdir(parents=True, exist_ok=True)
-         logger.note(f"Saving to: [{html_path}]")
-         with html_path.open("wb") as wf:
-             wf.write(response.content)
-
-     def search(self, query: str, result_num: int = 10, safe: bool = False, overwrite: bool = False) -> Path:
-         html_path = self.filepath_converter.convert(query)
-         logger.note(f"Searching: [{query}]")
-
-         if html_path.exists() and not overwrite:
-             logger.success(f"HTML existed: {html_path}")
-         else:
-             response = self.send_request(query, result_num, safe)
-             self.save_response(response, html_path)
-
-         return html_path
-
- if __name__ == "__main__":
-     searcher = GoogleSearcher()
-     html_path = searcher.search("python tutorials")
-     print(f"HTML file saved at: {html_path}")
@@ -1,30 +0,0 @@
- IGNORE_TAGS = ["script", "style", "button"]
- IGNORE_CLASSES = [
-     # common
-     "sidebar",
-     "footer",
-     "related",
-     "comment",
-     "topbar",
-     "offcanvas",
-     "navbar",
-     # 163.com
-     "post_(top)|(side)|(recommends)|(crumb)|(statement)|(next)|(jubao)",
-     "ntes\-.*nav",
-     "nav\-bottom",
-     # wikipedia.org
-     "language\-list",
-     "vector\-(header)|(column)|(sticky\-pinned)|(dropdown\-content)",
-     "navbox",
-     "catlinks",
- ]
-
- IGNORE_HOSTS = [
-     "weibo.com",
-     "hymson.com",
-     "yahoo.com",
- ]
-
- REQUESTS_HEADERS = {
-     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
- }
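The entries in IGNORE_CLASSES are regex fragments: the extractor in the first hunk wraps each one in parentheses and joins them with `|` into a single case-insensitive pattern that is matched against an element's class and id strings. A small sketch of that composition (list abbreviated):

    import re

    IGNORE_CLASSES = ["sidebar", "footer", "ntes\\-.*nav"]  # abbreviated from the list above
    pattern = "|".join(f"({word})" for word in IGNORE_CLASSES)
    # -> "(sidebar)|(footer)|(ntes\-.*nav)"

    print(bool(re.search(pattern, "ntes-main-nav", flags=re.IGNORECASE)))  # True
    print(bool(re.search(pattern, "content", flags=re.IGNORECASE)))        # False
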
@@ -1,95 +0,0 @@
- import concurrent.futures
- import random
- import requests
- import tldextract
- from pathlib import Path
- from typing import List, Tuple, Dict
-
- from DeepWEBS.utilsdw.enver import enver
- from DeepWEBS.utilsdw.logger import logger
- from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter
- from DeepWEBS.networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
-
- class WebpageFetcher:
-     def __init__(self):
-         self.enver = enver
-         self.enver.set_envs(proxies=True)
-         self.filepath_converter = UrlToFilepathConverter()
-
-     def is_ignored_host(self, url: str) -> bool:
-         host = tldextract.extract(url).registered_domain
-         return host in IGNORE_HOSTS
-
-     def send_request(self, url: str) -> requests.Response:
-         try:
-             user_agent = random.choice(REQUESTS_HEADERS["User-Agent"])
-             response = requests.get(
-                 url=url,
-                 headers={"User-Agent": user_agent},
-                 proxies=self.enver.requests_proxies,
-                 timeout=15,
-             )
-             response.raise_for_status()
-             return response
-         except requests.exceptions.RequestException as e:
-             logger.warn(f"Failed to fetch: [{url}] | {e}")
-             return None
-
-     def save_response(self, response: requests.Response, html_path: Path) -> None:
-         if response is None:
-             return
-
-         html_path.parent.mkdir(parents=True, exist_ok=True)
-         logger.success(f"Saving to: [{html_path}]")
-         with html_path.open("wb") as wf:
-             wf.write(response.content)
-
-     def fetch(self, url: str, overwrite: bool = False, output_parent: str = None) -> Path:
-         logger.note(f"Fetching: [{url}]")
-         html_path = self.filepath_converter.convert(url, parent=output_parent)
-
-         if self.is_ignored_host(url):
-             logger.warn(f"Ignored host: [{tldextract.extract(url).registered_domain}]")
-             return html_path
-
-         if html_path.exists() and not overwrite:
-             logger.success(f"HTML existed: [{html_path}]")
-         else:
-             response = self.send_request(url)
-             self.save_response(response, html_path)
-
-         return html_path
-
- class BatchWebpageFetcher:
-     def __init__(self):
-         self.done_count = 0
-         self.total_count = 0
-         self.url_and_html_path_list: List[Dict[str, str]] = []
-
-     def fetch_single_webpage(self, url: str, overwrite: bool = False, output_parent: str = None) -> Tuple[str, Path]:
-         webpage_fetcher = WebpageFetcher()
-         html_path = webpage_fetcher.fetch(url, overwrite, output_parent)
-         self.url_and_html_path_list.append({"url": url, "html_path": str(html_path)})
-         self.done_count += 1
-         logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
-         return url, html_path
-
-     def fetch(self, urls: List[str], overwrite: bool = False, output_parent: str = None) -> List[Dict[str, str]]:
-         self.urls = urls
-         self.total_count = len(self.urls)
-
-         with concurrent.futures.ProcessPoolExecutor() as executor:
-             futures = [
-                 executor.submit(WebpageFetcher().fetch, url, overwrite, output_parent)
-                 for url in urls
-             ]
-             concurrent.futures.wait(futures)
-
-         self.url_and_html_path_list = [
-             {"url": future.result().url, "html_path": str(future.result().html_path)}
-             for future in futures
-         ]
-
-         return self.url_and_html_path_list
-
-
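Two details worth flagging in the deleted fetcher above. First, `REQUESTS_HEADERS["User-Agent"]` is a single string, so `random.choice` on it returns one character rather than a full header value. Second, `BatchWebpageFetcher.fetch` reads `future.result().url`, but the submitted `WebpageFetcher.fetch` returns a plain `Path`, which has no `url` attribute. A user-agent rotation scheme would normally draw from a list of full strings, roughly as in this sketch (only the Edge entry comes from network_configs above; the second entry is illustrative):

    import random

    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
        "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0",  # illustrative
    ]
    headers = {"User-Agent": random.choice(USER_AGENTS)}  # a whole UA string, not one character
    print(headers)
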
File without changes
DeepWEBS/utilsdw/enver.py DELETED
@@ -1,78 +0,0 @@
- import json
- import os
- from pathlib import Path
- from typing import Dict, Optional
-
- from DeepWEBS.utilsdw.logger import OSLogger
-
-
- class OSEnver:
-     """Manages the OS environment variables."""
-
-     def __init__(self) -> None:
-         """Initializes the OSEnver object."""
-         self.envs_stack: list[Dict[str, str]] = []
-         self.envs: Dict[str, str] = os.environ.copy()
-
-     def store_envs(self) -> None:
-         """Stores a copy of the current environment variables on a stack."""
-         self.envs_stack.append(self.envs.copy())
-
-     def restore_envs(self) -> None:
-         """Restores environment variables from the top of the stack."""
-         self.envs = self.envs_stack.pop()
-
-     def set_envs(
-         self,
-         secrets: bool = True,
-         proxies: Optional[str] = None,
-         store_envs: bool = True,
-     ) -> None:
-         """Sets environment variables based on the contents of secrets.json.
-
-         Args:
-             secrets (bool): Whether to load secrets from secrets.json.
-             proxies (Optional[str]): Proxy URL to set as environment variable.
-             store_envs (bool): Whether to store a copy of the environment variables
-                 on the stack.
-         """
-         if store_envs:
-             self.store_envs()
-
-         if secrets:
-             secrets_path = Path(__file__).parents[1] / "secrets.json"
-             if secrets_path.exists():
-                 with open(secrets_path, "r") as rf:
-                     secrets = json.load(rf)
-             else:
-                 secrets = {}
-
-         if proxies:
-             for proxy_env in ["http_proxy", "https_proxy"]:
-                 if isinstance(proxies, str):
-                     self.envs[proxy_env] = proxies
-                 elif "http_proxy" in secrets.keys():
-                     self.envs[proxy_env] = secrets["http_proxy"]
-                 elif os.getenv("http_proxy"):
-                     self.envs[proxy_env] = os.getenv("http_proxy")
-                 else:
-                     continue
-
-         self.proxy = (
-             self.envs.get("all_proxy")
-             or self.envs.get("http_proxy")
-             or self.envs.get("https_proxy")
-             or None
-         )
-         self.requests_proxies = {
-             "http": self.proxy,
-             "https": self.proxy,
-         }
-
-         if self.proxy:
-             OSLogger().note(f"Using proxy: [{self.proxy}]")
-
-
- enver: OSEnver = OSEnver()
-
-
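The module-level `enver` singleton above is what GoogleSearcher and WebpageFetcher import. A minimal sketch of how its proxy resolution is consumed (the proxy URL is illustrative):

    import requests
    from DeepWEBS.utilsdw.enver import enver

    # Passing a string sets http_proxy/https_proxy directly; passing True falls back
    # to secrets.json or the existing environment, as in set_envs above.
    enver.set_envs(proxies="http://127.0.0.1:7890")
    response = requests.get(
        "https://www.example.com",
        proxies=enver.requests_proxies,  # {"http": <proxy>, "https": <proxy>}
        timeout=15,
    )
    print(response.status_code)
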