webscout 1.2.1__tar.gz → 1.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic.
- webscout-1.2.2/DeepWEBS/__init__.py +0 -0
- webscout-1.2.2/DeepWEBS/documents/__init__.py +0 -0
- webscout-1.2.2/DeepWEBS/documents/query_results_extractor.py +78 -0
- webscout-1.2.2/DeepWEBS/documents/webpage_content_extractor.py +145 -0
- webscout-1.2.2/DeepWEBS/networks/__init__.py +0 -0
- webscout-1.2.2/DeepWEBS/networks/filepath_converter.py +109 -0
- webscout-1.2.2/DeepWEBS/networks/google_searcher.py +48 -0
- webscout-1.2.2/DeepWEBS/networks/network_configs.py +30 -0
- webscout-1.2.2/DeepWEBS/networks/webpage_fetcher.py +107 -0
- webscout-1.2.2/DeepWEBS/utilsdw/__init__.py +0 -0
- webscout-1.2.2/DeepWEBS/utilsdw/enver.py +60 -0
- webscout-1.2.2/DeepWEBS/utilsdw/logger.py +269 -0
- {webscout-1.2.1/webscout.egg-info → webscout-1.2.2}/PKG-INFO +68 -166
- {webscout-1.2.1 → webscout-1.2.2}/README.md +58 -164
- {webscout-1.2.1 → webscout-1.2.2}/setup.py +10 -2
- webscout-1.2.2/webscout/DWEBS.py +179 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/__init__.py +1 -1
- webscout-1.2.2/webscout/version.py +2 -0
- {webscout-1.2.1 → webscout-1.2.2/webscout.egg-info}/PKG-INFO +68 -166
- {webscout-1.2.1 → webscout-1.2.2}/webscout.egg-info/SOURCES.txt +13 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout.egg-info/requires.txt +8 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout.egg-info/top_level.txt +1 -0
- webscout-1.2.1/webscout/version.py +0 -2
- {webscout-1.2.1 → webscout-1.2.2}/LICENSE.md +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/setup.cfg +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/AI.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/AIbase.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/AIutel.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/HelpingAI.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/LLM.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/__main__.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/cli.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/exceptions.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/models.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/utils.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/webscout_search.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout/webscout_search_async.py +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout.egg-info/dependency_links.txt +0 -0
- {webscout-1.2.1 → webscout-1.2.2}/webscout.egg-info/entry_points.txt +0 -0
webscout-1.2.2/DeepWEBS/__init__.py
File without changes

webscout-1.2.2/DeepWEBS/documents/__init__.py
File without changes

webscout-1.2.2/DeepWEBS/documents/query_results_extractor.py
@@ -0,0 +1,78 @@
from bs4 import BeautifulSoup
from pathlib import Path
from DeepWEBS.utilsdw.logger import logger


class QueryResultsExtractor:
    def __init__(self) -> None:
        self.query_results = []
        self.related_questions = []

    def load_html(self, html_path):
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()
        self.soup = BeautifulSoup(html, "html.parser")

    def extract_query_results(self):
        self.query = self.soup.find("textarea").text.strip()
        query_result_elements = self.soup.find_all("div", class_="g")
        for idx, result in enumerate(query_result_elements):
            site = result.find("cite").find_previous("span").text.strip()
            url = result.find("a")["href"]
            title = result.find("h3").text.strip()

            abstract_element_conditions = [
                {"data-sncf": "1"},
                {"class_": "ITZIwc"},
            ]
            for condition in abstract_element_conditions:
                abstract_element = result.find("div", condition)
                if abstract_element is not None:
                    abstract = abstract_element.text.strip()
                    break
            else:
                abstract = ""

            logger.mesg(
                f"{title}\n" f" - {site}\n" f" - {url}\n" f" - {abstract}\n" f"\n"
            )
            self.query_results.append(
                {
                    "title": title,
                    "site": site,
                    "url": url,
                    "abstract": abstract,
                    "index": idx,
                    "type": "web",
                }
            )
        logger.success(f"- {len(query_result_elements)} query results")

    def extract_related_questions(self):
        related_question_elements = self.soup.find_all(
            "div", class_="related-question-pair"
        )
        for question_element in related_question_elements:
            question = question_element.find("span").text.strip()
            print(question)
            self.related_questions.append(question)
        logger.success(f"- {len(self.related_questions)} related questions")

    def extract(self, html_path):
        self.load_html(html_path)
        self.extract_query_results()
        self.extract_related_questions()
        self.search_results = {
            "query": self.query,
            "query_results": self.query_results,
            "related_questions": self.related_questions,
        }
        return self.search_results


if __name__ == "__main__":
    html_path_root = Path(__file__).parents[1] / "files"
    html_filename = "python_tutorials"
    html_path = html_path_root / f"{html_filename}.html"
    extractor = QueryResultsExtractor()
    extractor.extract(html_path)
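
The extractor above returns a plain dict: "query", "query_results" (each entry carrying title, site, url, abstract, index, type) and "related_questions". A minimal consumption sketch, not part of the diff, assuming DeepWEBS is importable as a top-level package (as its own intra-package imports suggest) and that "google_results.html" is a hypothetical, previously saved results page:

from DeepWEBS.documents.query_results_extractor import QueryResultsExtractor

extractor = QueryResultsExtractor()
# "google_results.html" is a placeholder, e.g. a page saved by GoogleSearcher below
results = extractor.extract("google_results.html")
for item in results["query_results"]:
    # every entry follows the schema built in extract_query_results()
    print(f'{item["index"]}: {item["title"]} ({item["site"]}) -> {item["url"]}')
for question in results["related_questions"]:
    print(f"Related: {question}")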

webscout-1.2.2/DeepWEBS/documents/webpage_content_extractor.py
@@ -0,0 +1,145 @@
import concurrent.futures
import re
from pathlib import Path
from pprint import pprint
from bs4 import BeautifulSoup
from tiktoken import get_encoding as tiktoken_get_encoding
from DeepWEBS.utilsdw.logger import logger
from markdownify import markdownify
from DeepWEBS.networks.network_configs import IGNORE_TAGS, IGNORE_CLASSES
from termcolor import colored


class WebpageContentExtractor:
    def __init__(self):
        self.tokenizer = tiktoken_get_encoding("cl100k_base")

    def count_tokens(self, text):
        tokens = self.tokenizer.encode(text)
        token_count = len(tokens)
        return token_count

    def html_to_markdown(self, html_str, ignore_links=True):
        if ignore_links:
            markdown_str = markdownify(html_str, strip="a")
        else:
            markdown_str = markdownify(html_str)
        markdown_str = re.sub(r"\n{3,}", "\n\n", markdown_str)

        self.markdown_token_count = self.count_tokens(markdown_str)
        logger.mesg(f'- Tokens: {colored(self.markdown_token_count,"light_green")}')

        self.markdown_str = markdown_str

        return self.markdown_str

    def remove_elements_from_html(self, html_str):
        soup = BeautifulSoup(html_str, "html.parser")
        ignore_classes_with_parentheses = [f"({word})" for word in IGNORE_CLASSES]
        ignore_classes_pattern = f'{"|".join(ignore_classes_with_parentheses)}'
        removed_element_counts = 0
        for element in soup.find_all():
            class_str = ""
            id_str = ""
            try:
                class_attr = element.get("class", [])
                if class_attr:
                    class_str = " ".join(list(class_attr))
                if id_str:
                    class_str = f"{class_str} {id_str}"
            except:
                pass

            try:
                id_str = element.get("id", "")
            except:
                pass

            if (
                (not element.text.strip())
                or (element.name in IGNORE_TAGS)
                or (re.search(ignore_classes_pattern, class_str, flags=re.IGNORECASE))
                or (re.search(ignore_classes_pattern, id_str, flags=re.IGNORECASE))
            ):
                element.decompose()
                removed_element_counts += 1

        logger.mesg(
            f"- Elements: "
            f'{colored(len(soup.find_all()),"light_green")} / {colored(removed_element_counts,"light_red")}'
        )

        html_str = str(soup)
        self.html_str = html_str

        return self.html_str

    def extract(self, html_path):
        logger.note(f"Extracting content from: {html_path}")

        if not Path(html_path).exists():
            logger.warn(f"File not found: {html_path}")
            return ""

        encodings = ["utf-8", "latin-1"]
        for encoding in encodings:
            try:
                with open(html_path, "r", encoding=encoding, errors="ignore") as rf:
                    html_str = rf.read()
                    break
            except UnicodeDecodeError:
                pass
        else:
            logger.warn(f"No matching encodings: {html_path}")
            return ""

        html_str = self.remove_elements_from_html(html_str)
        markdown_str = self.html_to_markdown(html_str)
        return markdown_str


class BatchWebpageContentExtractor:
    def __init__(self) -> None:
        self.html_path_and_extracted_content_list = []
        self.done_count = 0

    def extract_single_html(self, html_path):
        webpage_content_extractor = WebpageContentExtractor()
        extracted_content = webpage_content_extractor.extract(html_path)
        self.html_path_and_extracted_content_list.append(
            {"html_path": html_path, "extracted_content": extracted_content}
        )
        self.done_count += 1
        logger.success(
            f"> [{self.done_count}/{self.total_count}] Extracted: {html_path}"
        )

    def extract(self, html_paths):
        self.html_path = html_paths
        self.total_count = len(self.html_path)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.extract_single_html, html_path)
                for html_path in self.html_path
            ]
            for idx, future in enumerate(concurrent.futures.as_completed(futures)):
                result = future.result()

        return self.html_path_and_extracted_content_list


if __name__ == "__main__":
    html_root = Path(__file__).parents[1] / "files" / "urls" / "python tutorials"
    html_paths = [
        html_root / html_filename
        for html_filename in [
            "docs.python.org_zh-cn_3_tutorial_interpreter.html",
            "stackoverflow.com_questions_295135_turn-a-string-into-a-valid-filename.html",
            "www.liaoxuefeng.com_wiki_1016959663602400_1017495723838528.html",
        ]
    ]
    batch_webpage_content_extractor = BatchWebpageContentExtractor()
    html_path_and_extracted_content_list = batch_webpage_content_extractor.extract(
        html_paths
    )
    # pprint(html_path_and_extracted_content_list)
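
WebpageContentExtractor strips ignored tags and classes, converts the remaining HTML to Markdown with markdownify, and records a cl100k_base token count. A small single-file sketch, not part of the diff; "saved_page.html" is a placeholder for any page saved by the fetcher further down:

from DeepWEBS.documents.webpage_content_extractor import WebpageContentExtractor

extractor = WebpageContentExtractor()
markdown_text = extractor.extract("saved_page.html")  # returns "" if the file is missing
if markdown_text:
    print(extractor.markdown_token_count)  # tokens in the generated Markdown
    print(markdown_text[:500])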

webscout-1.2.2/DeepWEBS/networks/__init__.py
File without changes

webscout-1.2.2/DeepWEBS/networks/filepath_converter.py
@@ -0,0 +1,109 @@
import platform
import re
from pathlib import Path
from urllib.parse import quote, unquote


# What characters are forbidden in Windows and Linux directory names?
# https://stackoverflow.com/questions/1976007/what-characters-are-forbidden-in-windows-and-linux-directory-names

INVALID_FILE_PATH_CHARS = [
    "\\",
    "/",
    ":",
    "*",
    "?",
    '"',
    "<",
    ">",
    "|",
    "\n",
    "\t",
    "\r",
    *[chr(i) for i in range(32)],
]

WINDOWS_INVALID_FILE_PATH_NAMES = [
    "con",
    "prn",
    "aux",
    "nul",
    *[f"com{i+1}" for i in range(10)],
    *[f"lpt{i+1}" for i in range(10)],
]


class FilepathConverter:
    def __init__(self, parent: str = None):
        self.output_root = Path(__file__).parents[1] / "files"
        self.parent = parent

    def preprocess(self, input_string):
        return input_string

    def validate(self, input_string):
        if not input_string:
            return input_string
        filename = input_string
        for char in INVALID_FILE_PATH_CHARS:
            filename = filename.replace(char, "_")
        if platform.system() == "Windows":
            filename_base = filename.split(".")[0]
            if filename_base.lower() in WINDOWS_INVALID_FILE_PATH_NAMES:
                filename_base = filename_base + "_"
            filename = ".".join([filename_base, *filename.split(".")[1:]])
        return filename

    def append_extension(self, filename, accept_exts=[".html", ".htm"], ext=".html"):
        if ext:
            filename_ext = "." + filename.split(".")[-1]
            if filename_ext.lower() not in accept_exts:
                filename += ext
        return filename

    def convert(self, input_string, parent=None):
        filename = self.preprocess(input_string)
        filename = self.validate(filename)
        filename = self.append_extension(filename)

        parent = parent or self.parent
        parent = self.validate(parent)
        if parent:
            filepath = self.output_root / parent / filename
        else:
            filepath = self.output_root / filename

        self.filename = filename
        self.filepath = filepath

        return self.filepath


class UrlToFilepathConverter(FilepathConverter):
    def __init__(self, parent: str = None):
        super().__init__(parent)
        self.output_root = self.output_root / "urls"

    def preprocess(self, url):
        filename = unquote(url.split("//")[1])
        return filename


class QueryToFilepathConverter(FilepathConverter):
    def __init__(self, parent: str = None):
        super().__init__(parent)
        self.output_root = self.output_root / "queries"


if __name__ == "__main__":
    query = "python"
    query_converter = QueryToFilepathConverter()
    print(query_converter.convert(query))

    # url = "https://trafilatura.readthedocs.io/en/latest/quickstart.html"
    url = (
        "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename"
    )

    url_converter = UrlToFilepathConverter(parent=query)
    print(url_converter.convert(url))
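
Tracing the __main__ demo above by hand (a worked illustration, not captured output): the query "python" has no accepted extension, so ".html" is appended and the path lands under files/queries/; for the URL, the scheme is dropped, "/" is replaced with "_" as an invalid path character, ".html" is appended, and the result is nested under files/urls/<parent>/ — matching the saved filenames used in the webpage_content_extractor demo earlier.

from DeepWEBS.networks.filepath_converter import (
    QueryToFilepathConverter,
    UrlToFilepathConverter,
)

# -> .../files/queries/python.html
print(QueryToFilepathConverter().convert("python"))

# -> .../files/urls/python/stackoverflow.com_questions_295135_turn-a-string-into-a-valid-filename.html
print(
    UrlToFilepathConverter(parent="python").convert(
        "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename"
    )
)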

webscout-1.2.2/DeepWEBS/networks/google_searcher.py
@@ -0,0 +1,48 @@
import requests
from pathlib import Path
from DeepWEBS.utilsdw.enver import enver
from DeepWEBS.utilsdw.logger import logger
from DeepWEBS.networks.filepath_converter import QueryToFilepathConverter
from DeepWEBS.networks.network_configs import REQUESTS_HEADERS


class GoogleSearcher:
    def __init__(self):
        self.url = "https://www.google.com/search"
        self.enver = enver
        self.enver.set_envs(proxies=True)
        self.filepath_converter = QueryToFilepathConverter()

    def send_request(self, result_num=10, safe=False):
        self.request_response = requests.get(
            url=self.url,
            headers=REQUESTS_HEADERS,
            params={
                "q": self.query,
                "num": result_num,
            },
            proxies=self.enver.requests_proxies,
        )

    def save_response(self):
        if not self.html_path.exists():
            self.html_path.parent.mkdir(parents=True, exist_ok=True)
        logger.note(f"Saving to: [{self.html_path}]")
        with open(self.html_path, "wb") as wf:
            wf.write(self.request_response.content)

    def search(self, query, result_num=10, safe=False, overwrite=False):
        self.query = query
        self.html_path = self.filepath_converter.convert(self.query)
        logger.note(f"Searching: [{self.query}]")
        if self.html_path.exists() and not overwrite:
            logger.success(f"HTML existed: {self.html_path}")
        else:
            self.send_request(result_num=result_num, safe=safe)
            self.save_response()
        return self.html_path


if __name__ == "__main__":
    searcher = GoogleSearcher()
    searcher.search("python tutorials")
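
GoogleSearcher only downloads and caches the raw results page; parsing is left to QueryResultsExtractor from the first hunk. A sketch of chaining the two, not part of the diff — the new top-level webscout/DWEBS.py (+179 lines) is not shown in this section, so this is an illustration rather than the package's own wiring:

from DeepWEBS.networks.google_searcher import GoogleSearcher
from DeepWEBS.documents.query_results_extractor import QueryResultsExtractor

searcher = GoogleSearcher()
# Saves (or reuses) the cached results page under files/queries/
html_path = searcher.search("python tutorials", result_num=10, overwrite=False)

extractor = QueryResultsExtractor()
results = extractor.extract(html_path)
print(len(results["query_results"]), "results for", results["query"])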

webscout-1.2.2/DeepWEBS/networks/network_configs.py
@@ -0,0 +1,30 @@
IGNORE_TAGS = ["script", "style", "button"]
IGNORE_CLASSES = [
    # common
    "sidebar",
    "footer",
    "related",
    "comment",
    "topbar",
    "offcanvas",
    "navbar",
    # 163.com
    "post_(top)|(side)|(recommends)|(crumb)|(statement)|(next)|(jubao)",
    "ntes\-.*nav",
    "nav\-bottom",
    # wikipedia.org
    "language\-list",
    "vector\-(header)|(column)|(sticky\-pinned)|(dropdown\-content)",
    "navbox",
    "catlinks",
]

IGNORE_HOSTS = [
    "weibo.com",
    "hymson.com",
    "yahoo.com",
]

REQUESTS_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
}

webscout-1.2.2/DeepWEBS/networks/webpage_fetcher.py
@@ -0,0 +1,107 @@
import concurrent.futures
import requests
import tldextract
from pathlib import Path
from DeepWEBS.utilsdw.enver import enver
from DeepWEBS.utilsdw.logger import logger
from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter
from DeepWEBS.networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS


class WebpageFetcher:
    def __init__(self):
        self.enver = enver
        self.enver.set_envs(proxies=True)
        self.filepath_converter = UrlToFilepathConverter()

    def is_ignored_host(self, url):
        self.host = tldextract.extract(url).registered_domain
        if self.host in IGNORE_HOSTS:
            return True
        else:
            return False

    def send_request(self):
        try:
            self.request_response = requests.get(
                url=self.url,
                headers=REQUESTS_HEADERS,
                proxies=self.enver.requests_proxies,
                timeout=15,
            )
        except:
            logger.warn(f"Failed to fetch: [{self.url}]")
            self.request_response = None

    def save_response(self):
        if not self.html_path.exists():
            self.html_path.parent.mkdir(parents=True, exist_ok=True)
        logger.success(f"Saving to: [{self.html_path}]")

        if self.request_response is None:
            return
        else:
            with open(self.html_path, "wb") as wf:
                wf.write(self.request_response.content)

    def fetch(self, url, overwrite=False, output_parent=None):
        self.url = url
        logger.note(f"Fetching: [{self.url}]")
        self.html_path = self.filepath_converter.convert(self.url, parent=output_parent)

        if self.is_ignored_host(self.url):
            logger.warn(f"Ignore host: [{self.host}]")
            return self.html_path

        if self.html_path.exists() and not overwrite:
            logger.success(f"HTML existed: [{self.html_path}]")
        else:
            self.send_request()
            self.save_response()
        return self.html_path


class BatchWebpageFetcher:
    def __init__(self):
        self.done_count = 0
        self.total_count = 0
        self.url_and_html_path_list = []

    def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
        webpage_fetcher = WebpageFetcher()
        html_path = webpage_fetcher.fetch(
            url=url, overwrite=overwrite, output_parent=output_parent
        )
        self.url_and_html_path_list.append({"url": url, "html_path": html_path})
        self.done_count += 1
        logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")

    def fetch(self, urls, overwrite=False, output_parent=None):
        self.urls = urls
        self.total_count = len(self.urls)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(
                    self.fecth_single_webpage,
                    url=url,
                    overwrite=overwrite,
                    output_parent=output_parent,
                )
                for url in urls
            ]

            for idx, future in enumerate(concurrent.futures.as_completed(futures)):
                result = future.result()
        return self.url_and_html_path_list


if __name__ == "__main__":
    urls = [
        "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
        "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
        "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
    ]
    batch_webpage_fetcher = BatchWebpageFetcher()
    batch_webpage_fetcher.fetch(
        urls=urls, overwrite=True, output_parent="python tutorials"
    )
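
BatchWebpageFetcher and the earlier BatchWebpageContentExtractor are natural companions: fetch a list of URLs into local HTML files, then turn each saved page into Markdown. A pipeline sketch, not part of the diff; the URLs and output_parent label are illustrative:

from DeepWEBS.networks.webpage_fetcher import BatchWebpageFetcher
from DeepWEBS.documents.webpage_content_extractor import BatchWebpageContentExtractor

urls = [
    "https://docs.python.org/3/tutorial/interpreter.html",
    "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
]

# Fetch pages concurrently into files/urls/python tutorials/
fetcher = BatchWebpageFetcher()
url_and_html_paths = fetcher.fetch(urls, overwrite=False, output_parent="python tutorials")

# Extract Markdown from the pages that were actually saved
extractor = BatchWebpageContentExtractor()
html_paths = [item["html_path"] for item in url_and_html_paths if item["html_path"].exists()]
for item in extractor.extract(html_paths):
    print(item["html_path"], "->", len(item["extracted_content"]), "chars")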

webscout-1.2.2/DeepWEBS/utilsdw/__init__.py
File without changes

webscout-1.2.2/DeepWEBS/utilsdw/enver.py
@@ -0,0 +1,60 @@
import json
import os

from pathlib import Path
from DeepWEBS.utilsdw.logger import logger


class OSEnver:
    def __init__(self):
        self.envs_stack = []
        self.envs = os.environ.copy()

    def store_envs(self):
        self.envs_stack.append(self.envs)

    def restore_envs(self):
        self.envs = self.envs_stack.pop()

    def set_envs(self, secrets=True, proxies=None, store_envs=True):
        # caller_info = inspect.stack()[1]
        # logger.back(f"OS Envs is set by: {caller_info.filename}")

        if store_envs:
            self.store_envs()

        if secrets:
            secrets_path = Path(__file__).parents[1] / "secrets.json"
            if secrets_path.exists():
                with open(secrets_path, "r") as rf:
                    secrets = json.load(rf)
            else:
                secrets = {}

        if proxies:
            for proxy_env in ["http_proxy", "https_proxy"]:
                if isinstance(proxies, str):
                    self.envs[proxy_env] = proxies
                elif "http_proxy" in secrets.keys():
                    self.envs[proxy_env] = secrets["http_proxy"]
                elif os.getenv("http_proxy"):
                    self.envs[proxy_env] = os.getenv("http_proxy")
                else:
                    continue

            self.proxy = (
                self.envs.get("all_proxy")
                or self.envs.get("http_proxy")
                or self.envs.get("https_proxy")
                or None
            )
            self.requests_proxies = {
                "http": self.proxy,
                "https": self.proxy,
            }

            if self.proxy:
                logger.note(f"Using proxy: [{self.proxy}]")


enver = OSEnver()