webscout 3.3-py3-none-any.whl → 3.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of webscout has been flagged as possibly problematic by the registry scanner.
- webscout/AIutel.py +1 -0
- webscout/DWEBS.py +772 -176
- webscout/Local/_version.py +1 -1
- webscout/Provider/Deepinfra.py +479 -0
- webscout/Provider/__init__.py +5 -0
- webscout/__init__.py +4 -2
- webscout/cli.py +17 -15
- webscout/exceptions.py +1 -1
- webscout/version.py +1 -1
- webscout/webai.py +15 -0
- webscout/webscout_search.py +48 -39
- webscout/webscout_search_async.py +11 -10
- webscout/websx_search.py +370 -0
- {webscout-3.3.dist-info → webscout-3.5.dist-info}/METADATA +149 -217
- {webscout-3.3.dist-info → webscout-3.5.dist-info}/RECORD +19 -29
- {webscout-3.3.dist-info → webscout-3.5.dist-info}/top_level.txt +0 -1
- DeepWEBS/__init__.py +0 -0
- DeepWEBS/documents/__init__.py +0 -0
- DeepWEBS/documents/query_results_extractor.py +0 -99
- DeepWEBS/documents/webpage_content_extractor.py +0 -145
- DeepWEBS/networks/__init__.py +0 -0
- DeepWEBS/networks/filepath_converter.py +0 -109
- DeepWEBS/networks/google_searcher.py +0 -52
- DeepWEBS/networks/network_configs.py +0 -30
- DeepWEBS/networks/webpage_fetcher.py +0 -95
- DeepWEBS/utilsdw/__init__.py +0 -0
- DeepWEBS/utilsdw/enver.py +0 -78
- DeepWEBS/utilsdw/logger.py +0 -269
- {webscout-3.3.dist-info → webscout-3.5.dist-info}/LICENSE.md +0 -0
- {webscout-3.3.dist-info → webscout-3.5.dist-info}/WHEEL +0 -0
- {webscout-3.3.dist-info → webscout-3.5.dist-info}/entry_points.txt +0 -0
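The file list shows every DeepWEBS/* module being deleted while webscout/DWEBS.py grows by roughly the same amount, and top_level.txt loses one entry, which suggests the standalone DeepWEBS top-level package has been folded into the single webscout/DWEBS.py module shown in the diff below. A hedged sketch of what that means for imports; the old paths come from the deleted file names, and the new path is an assumption based on where the classes now live, not documented migration guidance:

# Before (webscout 3.3): classes lived in the standalone DeepWEBS package, now deleted:
#   from DeepWEBS.networks.google_searcher import GoogleSearcher
#   from DeepWEBS.documents.query_results_extractor import QueryResultsExtractor
# After (webscout 3.5): the same classes are defined at module level in webscout/DWEBS.py,
# so the natural (assumed) import path becomes:
from webscout.DWEBS import GoogleSearcher, QueryResultsExtractor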
webscout/DWEBS.py
CHANGED
@@ -1,197 +1,793 @@
+from bs4 import BeautifulSoup
+from pathlib import Path
+import platform
+import re
+import concurrent.futures
+import requests
+import tldextract
+import json
+import os
+import shutil
+import subprocess
+import datetime
+import functools
+import inspect
+import logging
 
-from
-from
+from urllib.parse import quote, unquote
+from tiktoken import get_encoding as tiktoken_get_encoding
+from markdownify import markdownify
+from termcolor import colored
+class QueryResultsExtractor:
+    def __init__(self) -> None:
+        self.query_results = []
+        self.related_questions = []
 
-
-
-
-
-
-
-
+    def load_html(self, html_path):
+        try:
+            with open(html_path, "r", encoding="utf-8") as f:
+                html = f.read()
+            self.soup = BeautifulSoup(html, "html.parser")
+        except FileNotFoundError:
+            logger.error(f"File not found: {html_path}")
+        except Exception as e:
+            logger.error(f"Error loading HTML: {e}")
 
-
+    def extract_query_results(self):
+        try:
+            self.query = self.soup.find("textarea").text.strip()
+            query_result_elements = self.soup.find_all("div", class_="g")
+            for idx, result in enumerate(query_result_elements):
+                try:
+                    site = result.find("cite").find_previous("span").text.strip()
+                    url = result.find("a")["href"]
+                    title = result.find("h3").text.strip()
+                    abstract_element_conditions = [
+                        {"data-sncf": "1"},
+                        {"class_": "ITZIwc"},
+                    ]
+                    for condition in abstract_element_conditions:
+                        abstract_element = result.find("div", condition)
+                        if abstract_element is not None:
+                            abstract = abstract_element.text.strip()
+                            break
+                    else:
+                        abstract = ""
+                    logger.mesg(
+                        f"{title}\n"
+                        f" - {site}\n"
+                        f" - {url}\n"
+                        f" - {abstract}\n"
+                        f"\n"
+                    )
+                    self.query_results.append(
+                        {
+                            "title": title,
+                            "site": site,
+                            "url": url,
+                            "abstract": abstract,
+                            "index": idx,
+                            "type": "web",
+                        }
+                    )
+                except Exception as e:
+                    logger.error(f"Error extracting query result: {e}")
+            logger.success(f"- {len(query_result_elements)} query results")
+        except Exception as e:
+            logger.error(f"Error extracting query results: {e}")
+
+    def extract_related_questions(self):
+        try:
+            related_question_elements = self.soup.find_all(
+                "div", class_="related-question-pair"
+            )
+            for question_element in related_question_elements:
+                try:
+                    question = question_element.find("span").text.strip()
+                    print(question)
+                    self.related_questions.append(question)
+                except Exception as e:
+                    logger.error(f"Error extracting related question: {e}")
+            logger.success(f"- {len(self.related_questions)} related questions")
+        except Exception as e:
+            logger.error(f"Error extracting related questions: {e}")
+
+    def extract(self, html_path):
+        self.load_html(html_path)
+        self.extract_query_results()
+        self.extract_related_questions()
+        self.search_results = {
+            "query": self.query,
+            "query_results": self.query_results,
+            "related_questions": self.related_questions,
+        }
+        return self.search_results
+
+
+
+
+class WebpageContentExtractor:
     def __init__(self):
-
+        self.tokenizer = tiktoken_get_encoding("cl100k_base")
 
-
-
-
-
-        )
-        result_num: int = Field(
-            default=10,
-            description="(int) Number of search results",
-        )
-        safe: bool = Field(
-            default=False,
-            description="(bool) Enable SafeSearch",
-        )
-        types: list = Field(
-            default=["web"],
-            description="(list[str]) Types of search results: `web`, `image`, `videos`, `news`",
-        )
-        extract_webpage: bool = Field(
-            default=False,
-            description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
-        )
-        overwrite_query_html: bool = Field(
-            default=False,
-            description="(bool) Overwrite HTML file of query results",
-        )
-        overwrite_webpage_html: bool = Field(
-            default=False,
-            description="(bool) Overwrite HTML files of webpages from query results",
-        )
+    def count_tokens(self, text):
+        tokens = self.tokenizer.encode(text)
+        token_count = len(tokens)
+        return token_count
 
-    def
-
-
-
-
-
-                continue
-            try:
-                query_html_path = google_searcher.search(
-                    query=query,
-                    result_num=item.result_num,
-                    safe=item.safe,
-                    overwrite=item.overwrite_query_html,
-                )
-            except Exception as e:
-                logger.error(f"Failed to search for query '{query}': {e}")
-                continue
+    def html_to_markdown(self, html_str, ignore_links=True):
+        if ignore_links:
+            markdown_str = markdownify(html_str, strip="a")
+        else:
+            markdown_str = markdownify(html_str)
+        markdown_str = re.sub(r"\n{3,}", "\n\n", markdown_str)
 
-
-
-
-
-
-
-            queries_search_results.append(query_search_results)
-        logger.note(queries_search_results)
-
-        if item.extract_webpage:
-            queries_search_results = self.extract_webpages(
-                queries_search_results,
-                overwrite_webpage_html=item.overwrite_webpage_html,
-            )
-        return queries_search_results
+        self.markdown_token_count = self.count_tokens(markdown_str)
+        logger.mesg(f'- Tokens: {colored(self.markdown_token_count,"light_green")}')
+
+        self.markdown_str = markdown_str
+
+        return self.markdown_str
 
-    def
-
+    def remove_elements_from_html(self, html_str):
+        soup = BeautifulSoup(html_str, "html.parser")
+        ignore_classes_with_parentheses = [f"({word})" for word in IGNORE_CLASSES]
+        ignore_classes_pattern = f'{"|".join(ignore_classes_with_parentheses)}'
+        removed_element_counts = 0
+        for element in soup.find_all():
+            class_str = ""
+            id_str = ""
             try:
-
-
-
-
-
-
-
-
-                    overwrite=overwrite_webpage_html,
-                    output_parent=query_search_results["query"],
-                )
-            except Exception as e:
-                logger.error(f"Failed to fetch webpages for query '{query_search_results['query']}': {e}")
-                continue
-
-            # Extract webpage contents from htmls
-            html_paths = [
-                str(url_and_html_path["html_path"])
-                for url_and_html_path in url_and_html_path_list
-            ]
-            batch_webpage_content_extractor = BatchWebpageContentExtractor()
+                class_attr = element.get("class", [])
+                if class_attr:
+                    class_str = " ".join(list(class_attr))
+                if id_str:
+                    class_str = f"{class_str} {id_str}"
+            except:
+                pass
+
             try:
-
-
-
-
-
-
-
-
-
-                str(url_and_html_path["html_path"]): url_and_html_path["url"]
-                for url_and_html_path in url_and_html_path_list
-            }
-            url_to_extracted_content_dict = {
-                html_path_to_url_dict[
-                    html_path_and_extracted_content["html_path"]
-                ]: html_path_and_extracted_content["extracted_content"]
-                for html_path_and_extracted_content in html_path_and_extracted_content_list
-            }
-
-            # Write extracted contents (as 'text' field) to query_search_results
-            for query_result_idx, query_result in enumerate(
-                query_search_results["query_results"]
+                id_str = element.get("id", "")
+            except:
+                pass
+
+            if (
+                (not element.text.strip())
+                or (element.name in IGNORE_TAGS)
+                or (re.search(ignore_classes_pattern, class_str, flags=re.IGNORECASE))
+                or (re.search(ignore_classes_pattern, id_str, flags=re.IGNORECASE))
             ):
-
-
-
-
-
-
-        return queries_search_results
-
-
-class ArgParser(argparse.ArgumentParser):
-    def __init__(self, *args, **kwargs):
-        super(ArgParser, self).__init__(*args, **kwargs)
-
-        self.add_argument(
-            "-q",
-            "--queries",
-            type=str,
-            nargs="+",
-            required=True,
-            help="Queries to search",
+                element.decompose()
+                removed_element_counts += 1
+
+        logger.mesg(
+            f"- Elements: "
+            f'{colored(len(soup.find_all()),"light_green")} / {colored(removed_element_counts,"light_red")}'
         )
-
-
-
-
-
-
+
+        html_str = str(soup)
+        self.html_str = html_str
+
+        return self.html_str
+
+    def extract(self, html_path):
+        logger.note(f"Extracting content from: {html_path}")
+
+        if not Path(html_path).exists():
+            logger.warn(f"File not found: {html_path}")
+            return ""
+
+        encodings = ["utf-8", "latin-1"]
+        for encoding in encodings:
+            try:
+                with open(html_path, "r", encoding=encoding, errors="ignore") as rf:
+                    html_str = rf.read()
+                    break
+            except UnicodeDecodeError:
+                pass
+        else:
+            logger.warn(f"No matching encodings: {html_path}")
+            return ""
+
+        html_str = self.remove_elements_from_html(html_str)
+        markdown_str = self.html_to_markdown(html_str)
+        return markdown_str
+
+
+class BatchWebpageContentExtractor:
+    def __init__(self) -> None:
+        self.html_path_and_extracted_content_list = []
+        self.done_count = 0
+
+    def extract_single_html(self, html_path):
+        webpage_content_extractor = WebpageContentExtractor()
+        extracted_content = webpage_content_extractor.extract(html_path)
+        self.html_path_and_extracted_content_list.append(
+            {"html_path": html_path, "extracted_content": extracted_content}
        )
-        self.
-
-            "
-            default=False,
-            action="store_true",
-            help="Enable SafeSearch",
+        self.done_count += 1
+        logger.success(
+            f"> [{self.done_count}/{self.total_count}] Extracted: {html_path}"
        )
-
-
-
-
-
-
-
-
+
+    def extract(self, html_paths):
+        self.html_path = html_paths
+        self.total_count = len(self.html_path)
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(self.extract_single_html, html_path)
+                for html_path in self.html_path
+            ]
+            for idx, future in enumerate(concurrent.futures.as_completed(futures)):
+                result = future.result()
+
+        return self.html_path_and_extracted_content_list
+
+
+
+
+
+# What characters are forbidden in Windows and Linux directory names?
+# https://stackoverflow.com/questions/1976007/what-characters-are-forbidden-in-windows-and-linux-directory-names
+
+INVALID_FILE_PATH_CHARS = [
+    "\\",
+    "/",
+    ":",
+    "*",
+    "?",
+    '"',
+    "<",
+    ">",
+    "|",
+    "\n",
+    "\t",
+    "\r",
+    *[chr(i) for i in range(32)],
+]
+
+WINDOWS_INVALID_FILE_PATH_NAMES = [
+    "con",
+    "prn",
+    "aux",
+    "nul",
+    *[f"com{i+1}" for i in range(10)],
+    *[f"lpt{i+1}" for i in range(10)],
+]
+
+
+class FilepathConverter:
+    def __init__(self, parent: str = None):
+        self.output_root = Path(__file__).parents[1] / "files"
+        self.parent = parent
+
+    def preprocess(self, input_string):
+        return input_string
+
+    def validate(self, input_string):
+        if not input_string:
+            return input_string
+        filename = input_string
+        for char in INVALID_FILE_PATH_CHARS:
+            filename = filename.replace(char, "_")
+        if platform.system() == "Windows":
+            filename_base = filename.split(".")[0]
+            if filename_base.lower() in WINDOWS_INVALID_FILE_PATH_NAMES:
+                filename_base = filename_base + "_"
+            filename = ".".join([filename_base, *filename.split(".")[1:]])
+        return filename
+
+    def append_extension(self, filename, accept_exts=[".html", ".htm"], ext=".html"):
+        if ext:
+            filename_ext = "." + filename.split(".")[-1]
+            if filename_ext.lower() not in accept_exts:
+                filename += ext
+        return filename
+
+    def convert(self, input_string, parent=None):
+        filename = self.preprocess(input_string)
+        filename = self.validate(filename)
+        filename = self.append_extension(filename)
+
+        parent = parent or self.parent
+        parent = self.validate(parent)
+        if parent:
+            filepath = self.output_root / parent / filename
+        else:
+            filepath = self.output_root / filename
+
+        self.filename = filename
+        self.filepath = filepath
+
+        return self.filepath
+
+
+class UrlToFilepathConverter(FilepathConverter):
+    def __init__(self, parent: str = None):
+        super().__init__(parent)
+        self.output_root = self.output_root / "urls"
+
+    def preprocess(self, url):
+        filename = unquote(url.split("//")[1])
+        return filename
+
+
+class QueryToFilepathConverter(FilepathConverter):
+    def __init__(self, parent: str = None):
+        super().__init__(parent)
+        self.output_root = self.output_root / "queries"
+
+
+def add_fillers(text, filler="=", fill_side="both"):
+    terminal_width = shutil.get_terminal_size().columns
+    text = text.strip()
+    text_width = len(text)
+    if text_width >= terminal_width:
+        return text
+
+    if fill_side[0].lower() == "b":
+        leading_fill_str = filler * ((terminal_width - text_width) // 2 - 1) + " "
+        trailing_fill_str = " " + filler * (
+            terminal_width - text_width - len(leading_fill_str) - 1
+        )
+    elif fill_side[0].lower() == "l":
+        leading_fill_str = filler * (terminal_width - text_width - 1) + " "
+        trailing_fill_str = ""
+    elif fill_side[0].lower() == "r":
+        leading_fill_str = ""
+        trailing_fill_str = " " + filler * (terminal_width - text_width - 1)
+    else:
+        raise ValueError("Invalid fill_side")
+
+    filled_str = f"{leading_fill_str}{text}{trailing_fill_str}"
+    return filled_str
+
+
+class OSLogger(logging.Logger):
+    LOG_METHODS = {
+        "err": ("error", "red"),
+        "warn": ("warning", "light_red"),
+        "note": ("info", "light_magenta"),
+        "mesg": ("info", "light_cyan"),
+        "file": ("info", "light_blue"),
+        "line": ("info", "white"),
+        "success": ("info", "light_green"),
+        "fail": ("info", "light_red"),
+        "back": ("debug", "light_cyan"),
+    }
+    INDENT_METHODS = [
+        "indent",
+        "set_indent",
+        "reset_indent",
+        "store_indent",
+        "restore_indent",
+        "log_indent",
+    ]
+    LEVEL_METHODS = [
+        "set_level",
+        "store_level",
+        "restore_level",
+        "quiet",
+        "enter_quiet",
+        "exit_quiet",
+    ]
+    LEVEL_NAMES = {
+        "critical": logging.CRITICAL,
+        "error": logging.ERROR,
+        "warning": logging.WARNING,
+        "info": logging.INFO,
+        "debug": logging.DEBUG,
+    }
+
+    def __init__(self, name=None, prefix=False):
+        if not name:
+            frame = inspect.stack()[1]
+            module = inspect.getmodule(frame[0])
+            name = module.__name__
+
+        super().__init__(name)
+        self.setLevel(logging.INFO)
+
+        if prefix:
+            formatter_prefix = "[%(asctime)s] - [%(name)s] - [%(levelname)s]\n"
+        else:
+            formatter_prefix = ""
+
+        self.formatter = logging.Formatter(formatter_prefix + "%(message)s")
+
+        stream_handler = logging.StreamHandler()
+        stream_handler.setLevel(logging.INFO)
+        stream_handler.setFormatter(self.formatter)
+        self.addHandler(stream_handler)
+
+        self.log_indent = 0
+        self.log_indents = []
+
+        self.log_level = "info"
+        self.log_levels = []
+
+    def indent(self, indent=2):
+        self.log_indent += indent
+
+    def set_indent(self, indent=2):
+        self.log_indent = indent
+
+    def reset_indent(self):
+        self.log_indent = 0
+
+    def store_indent(self):
+        self.log_indents.append(self.log_indent)
+
+    def restore_indent(self):
+        self.log_indent = self.log_indents.pop(-1)
+
+    def set_level(self, level):
+        self.log_level = level
+        self.setLevel(self.LEVEL_NAMES[level])
+
+    def store_level(self):
+        self.log_levels.append(self.log_level)
+
+    def restore_level(self):
+        self.log_level = self.log_levels.pop(-1)
+        self.set_level(self.log_level)
+
+    def quiet(self):
+        self.set_level("critical")
+
+    def enter_quiet(self, quiet=False):
+        if quiet:
+            self.store_level()
+            self.quiet()
+
+    def exit_quiet(self, quiet=False):
+        if quiet:
+            self.restore_level()
+
+    def log(
+        self,
+        level,
+        color,
+        msg,
+        indent=0,
+        fill=False,
+        fill_side="both",
+        end="\n",
+        *args,
+        **kwargs,
+    ):
+        if type(msg) == str:
+            msg_str = msg
+        else:
+            msg_str = repr(msg)
+            quotes = ["'", '"']
+            if msg_str[0] in quotes and msg_str[-1] in quotes:
+                msg_str = msg_str[1:-1]
+
+        indent_str = " " * (self.log_indent + indent)
+        indented_msg = "\n".join([indent_str + line for line in msg_str.split("\n")])
+
+        if fill:
+            indented_msg = add_fillers(indented_msg, fill_side=fill_side)
+
+        handler = self.handlers[0]
+        handler.terminator = end
+
+        getattr(self, level)(colored(indented_msg, color), *args, **kwargs)
+
+    def route_log(self, method, msg, *args, **kwargs):
+        level, method = method
+        functools.partial(self.log, level, method, msg)(*args, **kwargs)
+
+    def err(self, msg: str = "", *args, **kwargs):
+        self.route_log(("error", "red"), msg, *args, **kwargs)
+
+    def warn(self, msg: str = "", *args, **kwargs):
+        self.route_log(("warning", "light_red"), msg, *args, **kwargs)
+
+    def note(self, msg: str = "", *args, **kwargs):
+        self.route_log(("info", "light_magenta"), msg, *args, **kwargs)
+
+    def mesg(self, msg: str = "", *args, **kwargs):
+        self.route_log(("info", "light_cyan"), msg, *args, **kwargs)
+
+    def file(self, msg: str = "", *args, **kwargs):
+        self.route_log(("info", "light_blue"), msg, *args, **kwargs)
+
+    def line(self, msg: str = "", *args, **kwargs):
+        self.route_log(("info", "white"), msg, *args, **kwargs)
+
+    def success(self, msg: str = "", *args, **kwargs):
+        self.route_log(("info", "light_green"), msg, *args, **kwargs)
+
+    def fail(self, msg: str = "", *args, **kwargs):
+        self.route_log(("info", "light_red"), msg, *args, **kwargs)
+
+    def back(self, msg: str = "", *args, **kwargs):
+        self.route_log(("debug", "light_cyan"), msg, *args, **kwargs)
+
+
+logger = OSLogger()
+
+
+def shell_cmd(cmd, getoutput=False, showcmd=True, env=None):
+    if showcmd:
+        logger.info(colored(f"\n$ [{os.getcwd()}]", "light_blue"))
+        logger.info(colored(f" $ {cmd}\n", "light_cyan"))
+    if getoutput:
+        output = subprocess.getoutput(cmd, env=env)
+        return output
+    else:
+        subprocess.run(cmd, shell=True, env=env)
+
+
+class Runtimer:
+    def __enter__(self):
+        self.t1, _ = self.start_time()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.t2, _ = self.end_time()
+        self.elapsed_time(self.t2 - self.t1)
+
+    def start_time(self):
+        t1 = datetime.datetime.now()
+        self.logger_time("start", t1)
+        return t1, self.time2str(t1)
+
+    def end_time(self):
+        t2 = datetime.datetime.now()
+        self.logger_time("end", t2)
+        return t2, self.time2str(t2)
+
+    def elapsed_time(self, dt=None):
+        if dt is None:
+            dt = self.t2 - self.t1
+        self.logger_time("elapsed", dt)
+        return dt, self.time2str(dt)
+
+    def logger_time(self, time_type, t):
+        time_types = {
+            "start": "Start",
+            "end": "End",
+            "elapsed": "Elapsed",
+        }
+        time_str = add_fillers(
+            colored(
+                f"{time_types[time_type]} time: [ {self.time2str(t)} ]",
+                "light_magenta",
+            ),
+            fill_side="both",
        )
-
-
-
-
-
-
+        logger.line(time_str)
+
+    # Convert time to string
+    def time2str(self, t):
+        datetime_str_format = "%Y-%m-%d %H:%M:%S"
+        if isinstance(t, datetime.datetime):
+            return t.strftime(datetime_str_format)
+        elif isinstance(t, datetime.timedelta):
+            hours = t.seconds // 3600
+            hour_str = f"{hours} hr" if hours > 0 else ""
+            minutes = (t.seconds // 60) % 60
+            minute_str = f"{minutes:>2} min" if minutes > 0 else ""
+            seconds = t.seconds % 60
+            second_str = f"{seconds:>2} s"
+            time_str = " ".join([hour_str, minute_str, second_str]).strip()
+            return time_str
+        else:
+            return str(t)
+
+
+class OSEnver:
+    def __init__(self):
+        self.envs_stack = []
+        self.envs = os.environ.copy()
+
+    def store_envs(self):
+        self.envs_stack.append(self.envs)
+
+    def restore_envs(self):
+        self.envs = self.envs_stack.pop()
+
+    def set_envs(self, secrets=True, proxies=None, store_envs=True):
+        # caller_info = inspect.stack()[1]
+        # logger.back(f"OS Envs is set by: {caller_info.filename}")
+
+        if store_envs:
+            self.store_envs()
+
+        if secrets:
+            secrets_path = Path(__file__).parents[1] / "secrets.json"
+            if secrets_path.exists():
+                with open(secrets_path, "r") as rf:
+                    secrets = json.load(rf)
+            else:
+                secrets = {}
+
+        if proxies:
+            for proxy_env in ["http_proxy", "https_proxy"]:
+                if isinstance(proxies, str):
+                    self.envs[proxy_env] = proxies
+                elif "http_proxy" in secrets.keys():
+                    self.envs[proxy_env] = secrets["http_proxy"]
+                elif os.getenv("http_proxy"):
+                    self.envs[proxy_env] = os.getenv("http_proxy")
+                else:
+                    continue
+
+        self.proxy = (
+            self.envs.get("all_proxy")
+            or self.envs.get("http_proxy")
+            or self.envs.get("https_proxy")
+            or None
        )
-        self.
-            "
-            "
-
-
-
+        self.requests_proxies = {
+            "http": self.proxy,
+            "https": self.proxy,
+        }
+
+        if self.proxy:
+            logger.note(f"Using proxy: [{self.proxy}]")
+
+
+enver = OSEnver()
+
+class GoogleSearcher:
+    def __init__(self):
+        self.url = "https://www.google.com/search"
+        self.enver = enver
+        self.enver.set_envs(proxies=True)
+        self.filepath_converter = QueryToFilepathConverter()
+
+    def send_request(self, result_num=10, safe=False):
+        self.request_response = requests.get(
+            url=self.url,
+            headers=REQUESTS_HEADERS,
+            params={
+                "q": self.query,
+                "num": result_num,
+            },
+            proxies=self.enver.requests_proxies,
        )
-
-
-
-
-
-
+
+    def save_response(self):
+        if not self.html_path.exists():
+            self.html_path.parent.mkdir(parents=True, exist_ok=True)
+        logger.note(f"Saving to: [{self.html_path}]")
+        with open(self.html_path, "wb") as wf:
+            wf.write(self.request_response.content)
+
+    def search(self, query, result_num=10, safe=False, overwrite=False):
+        self.query = query
+        self.html_path = self.filepath_converter.convert(self.query)
+        logger.note(f"Searching: [{self.query}]")
+        if self.html_path.exists() and not overwrite:
+            logger.success(f"HTML existed: {self.html_path}")
+        else:
+            self.send_request(result_num=result_num, safe=safe)
+            self.save_response()
+        return self.html_path
+
+
+IGNORE_TAGS = ["script", "style", "button"]
+IGNORE_CLASSES = [
+    # common
+    "sidebar",
+    "footer",
+    "related",
+    "comment",
+    "topbar",
+    "offcanvas",
+    "navbar",
+    # 163.com
+    "post_(top)|(side)|(recommends)|(crumb)|(statement)|(next)|(jubao)",
+    "ntes\-.*nav",
+    "nav\-bottom",
+    # wikipedia.org
+    "language\-list",
+    "vector\-(header)|(column)|(sticky\-pinned)|(dropdown\-content)",
+    "navbox",
+    "catlinks",
+]
+
+IGNORE_HOSTS = [
+    "weibo.com",
+    "hymson.com",
+    "yahoo.com",
+]
+
+REQUESTS_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
+}
+
+
+
+
+class WebpageFetcher:
+    def __init__(self):
+        self.enver = enver
+        self.enver.set_envs(proxies=True)
+        self.filepath_converter = UrlToFilepathConverter()
+
+    def is_ignored_host(self, url):
+        self.host = tldextract.extract(url).registered_domain
+        if self.host in IGNORE_HOSTS:
+            return True
+        else:
+            return False
+
+    def send_request(self):
+        try:
+            self.request_response = requests.get(
+                url=self.url,
+                headers=REQUESTS_HEADERS,
+                proxies=self.enver.requests_proxies,
+                timeout=15,
+            )
+        except:
+            logger.warn(f"Failed to fetch: [{self.url}]")
+            self.request_response = None
+
+    def save_response(self):
+        if not self.html_path.exists():
+            self.html_path.parent.mkdir(parents=True, exist_ok=True)
+        logger.success(f"Saving to: [{self.html_path}]")
+
+        if self.request_response is None:
+            return
+        else:
+            with open(self.html_path, "wb") as wf:
+                wf.write(self.request_response.content)
+
+    def fetch(self, url, overwrite=False, output_parent=None):
+        self.url = url
+        logger.note(f"Fetching: [{self.url}]")
+        self.html_path = self.filepath_converter.convert(self.url, parent=output_parent)
+
+        if self.is_ignored_host(self.url):
+            logger.warn(f"Ignore host: [{self.host}]")
+            return self.html_path
+
+        if self.html_path.exists() and not overwrite:
+            logger.success(f"HTML existed: [{self.html_path}]")
+        else:
+            self.send_request()
+            self.save_response()
+        return self.html_path
+
+
+class BatchWebpageFetcher:
+    def __init__(self):
+        self.done_count = 0
+        self.total_count = 0
+        self.url_and_html_path_list = []
+
+    def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
+        webpage_fetcher = WebpageFetcher()
+        html_path = webpage_fetcher.fetch(
+            url=url, overwrite=overwrite, output_parent=output_parent
        )
+        self.url_and_html_path_list.append({"url": url, "html_path": html_path})
+        self.done_count += 1
+        logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
+
+    def fetch(self, urls, overwrite=False, output_parent=None):
+        self.urls = urls
+        self.total_count = len(self.urls)
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(
+                    self.fecth_single_webpage,
+                    url=url,
+                    overwrite=overwrite,
+                    output_parent=output_parent,
+                )
+                for url in urls
+            ]
 
-
+            for idx, future in enumerate(concurrent.futures.as_completed(futures)):
+                result = future.result()
+            return self.url_and_html_path_list
 
 
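Taken together, the classes added to webscout/DWEBS.py form a small search-and-scrape pipeline: GoogleSearcher caches a Google results page to disk, QueryResultsExtractor parses titles, URLs and abstracts out of it, BatchWebpageFetcher downloads the hit pages concurrently, and BatchWebpageContentExtractor converts each page to markdown. A minimal usage sketch, based only on the method signatures visible in the diff above; the import path and the wiring between steps are assumptions, not an official example from the package:

from webscout.DWEBS import (
    GoogleSearcher,
    QueryResultsExtractor,
    BatchWebpageFetcher,
    BatchWebpageContentExtractor,
)

query = "python web scraping"

# Fetch (or reuse) the cached Google results HTML for the query.
html_path = GoogleSearcher().search(query=query, result_num=10, overwrite=False)

# Parse titles, sites, URLs and abstracts out of the cached HTML.
results = QueryResultsExtractor().extract(html_path)
urls = [item["url"] for item in results["query_results"]]

# Download the result pages concurrently, then convert each one to markdown.
url_and_html_paths = BatchWebpageFetcher().fetch(urls, output_parent=query)
html_paths = [str(entry["html_path"]) for entry in url_and_html_paths]
contents = BatchWebpageContentExtractor().extract(html_paths)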