webscout 4.8-py3-none-any.whl → 5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic.
- webscout/Agents/functioncall.py +58 -102
- webscout/Bing_search.py +124 -0
- webscout/DWEBS.py +141 -777
- webscout/Provider/Cloudflare.py +286 -0
- webscout/Provider/DiscordRocks.py +5 -4
- webscout/Provider/Farfalle.py +3 -3
- webscout/Provider/Llama3.py +3 -3
- webscout/Provider/PI.py +208 -0
- webscout/Provider/Youchat.py +247 -0
- webscout/Provider/__init__.py +16 -2
- webscout/Provider/felo_search.py +238 -0
- webscout/Provider/julius.py +263 -0
- webscout/Provider/turboseek.py +237 -0
- webscout/Provider/xdash.py +202 -0
- webscout/Provider/yep.py +258 -0
- webscout/__init__.py +1 -59
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/METADATA +25 -74
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/RECORD +22 -14
- webscout/GoogleS.py +0 -342
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/LICENSE.md +0 -0
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/WHEEL +0 -0
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/entry_points.txt +0 -0
- {webscout-4.8.dist-info → webscout-5.0.dist-info}/top_level.txt +0 -0
webscout/DWEBS.py
CHANGED
@@ -1,793 +1,157 @@
 from bs4 import BeautifulSoup
-from pathlib import Path
-import platform
-import re
-import concurrent.futures
 import requests
-import ...
-import ...
-import ...
-import shutil
-import subprocess
-import datetime
-import functools
-import inspect
-import logging
-
-from urllib.parse import quote, unquote
-from tiktoken import get_encoding as tiktoken_get_encoding
-from markdownify import markdownify
+from typing import Dict, List, Optional, Union
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from urllib.parse import quote
 from termcolor import colored
-
-
-        self.query_results = []
-        self.related_questions = []
-
-    def load_html(self, html_path):
-        try:
-            with open(html_path, "r", encoding="utf-8") as f:
-                html = f.read()
-            self.soup = BeautifulSoup(html, "html.parser")
-        except FileNotFoundError:
-            logger.error(f"File not found: {html_path}")
-        except Exception as e:
-            logger.error(f"Error loading HTML: {e}")
-
-    def extract_query_results(self):
-        try:
-            self.query = self.soup.find("textarea").text.strip()
-            query_result_elements = self.soup.find_all("div", class_="g")
-            for idx, result in enumerate(query_result_elements):
-                try:
-                    site = result.find("cite").find_previous("span").text.strip()
-                    url = result.find("a")["href"]
-                    title = result.find("h3").text.strip()
-                    abstract_element_conditions = [
-                        {"data-sncf": "1"},
-                        {"class_": "ITZIwc"},
-                    ]
-                    for condition in abstract_element_conditions:
-                        abstract_element = result.find("div", condition)
-                        if abstract_element is not None:
-                            abstract = abstract_element.text.strip()
-                            break
-                    else:
-                        abstract = ""
-                    logger.mesg(
-                        f"{title}\n"
-                        f" - {site}\n"
-                        f" - {url}\n"
-                        f" - {abstract}\n"
-                        f"\n"
-                    )
-                    self.query_results.append(
-                        {
-                            "title": title,
-                            "site": site,
-                            "url": url,
-                            "abstract": abstract,
-                            "index": idx,
-                            "type": "web",
-                        }
-                    )
-                except Exception as e:
-                    logger.error(f"Error extracting query result: {e}")
-            logger.success(f"- {len(query_result_elements)} query results")
-        except Exception as e:
-            logger.error(f"Error extracting query results: {e}")
-
-    def extract_related_questions(self):
-        try:
-            related_question_elements = self.soup.find_all(
-                "div", class_="related-question-pair"
-            )
-            for question_element in related_question_elements:
-                try:
-                    question = question_element.find("span").text.strip()
-                    print(question)
-                    self.related_questions.append(question)
-                except Exception as e:
-                    logger.error(f"Error extracting related question: {e}")
-            logger.success(f"- {len(self.related_questions)} related questions")
-        except Exception as e:
-            logger.error(f"Error extracting related questions: {e}")
-
-    def extract(self, html_path):
-        self.load_html(html_path)
-        self.extract_query_results()
-        self.extract_related_questions()
-        self.search_results = {
-            "query": self.query,
-            "query_results": self.query_results,
-            "related_questions": self.related_questions,
-        }
-        return self.search_results
-
-
-
-
-class WebpageContentExtractor:
-    def __init__(self):
-        self.tokenizer = tiktoken_get_encoding("cl100k_base")
-
-    def count_tokens(self, text):
-        tokens = self.tokenizer.encode(text)
-        token_count = len(tokens)
-        return token_count
-
-    def html_to_markdown(self, html_str, ignore_links=True):
-        if ignore_links:
-            markdown_str = markdownify(html_str, strip="a")
-        else:
-            markdown_str = markdownify(html_str)
-        markdown_str = re.sub(r"\n{3,}", "\n\n", markdown_str)
-
-        self.markdown_token_count = self.count_tokens(markdown_str)
-        logger.mesg(f'- Tokens: {colored(self.markdown_token_count,"light_green")}')
-
-        self.markdown_str = markdown_str
-
-        return self.markdown_str
-
-    def remove_elements_from_html(self, html_str):
-        soup = BeautifulSoup(html_str, "html.parser")
-        ignore_classes_with_parentheses = [f"({word})" for word in IGNORE_CLASSES]
-        ignore_classes_pattern = f'{"|".join(ignore_classes_with_parentheses)}'
-        removed_element_counts = 0
-        for element in soup.find_all():
-            class_str = ""
-            id_str = ""
-            try:
-                class_attr = element.get("class", [])
-                if class_attr:
-                    class_str = " ".join(list(class_attr))
-                if id_str:
-                    class_str = f"{class_str} {id_str}"
-            except:
-                pass
-
-            try:
-                id_str = element.get("id", "")
-            except:
-                pass
-
-            if (
-                (not element.text.strip())
-                or (element.name in IGNORE_TAGS)
-                or (re.search(ignore_classes_pattern, class_str, flags=re.IGNORECASE))
-                or (re.search(ignore_classes_pattern, id_str, flags=re.IGNORECASE))
-            ):
-                element.decompose()
-                removed_element_counts += 1
-
-        logger.mesg(
-            f"- Elements: "
-            f'{colored(len(soup.find_all()),"light_green")} / {colored(removed_element_counts,"light_red")}'
-        )
-
-        html_str = str(soup)
-        self.html_str = html_str
-
-        return self.html_str
-
-    def extract(self, html_path):
-        logger.note(f"Extracting content from: {html_path}")
-
-        if not Path(html_path).exists():
-            logger.warn(f"File not found: {html_path}")
-            return ""
-
-        encodings = ["utf-8", "latin-1"]
-        for encoding in encodings:
-            try:
-                with open(html_path, "r", encoding=encoding, errors="ignore") as rf:
-                    html_str = rf.read()
-                    break
-            except UnicodeDecodeError:
-                pass
-        else:
-            logger.warn(f"No matching encodings: {html_path}")
-            return ""
-
-        html_str = self.remove_elements_from_html(html_str)
-        markdown_str = self.html_to_markdown(html_str)
-        return markdown_str
-
-
-class BatchWebpageContentExtractor:
-    def __init__(self) -> None:
-        self.html_path_and_extracted_content_list = []
-        self.done_count = 0
-
-    def extract_single_html(self, html_path):
-        webpage_content_extractor = WebpageContentExtractor()
-        extracted_content = webpage_content_extractor.extract(html_path)
-        self.html_path_and_extracted_content_list.append(
-            {"html_path": html_path, "extracted_content": extracted_content}
-        )
-        self.done_count += 1
-        logger.success(
-            f"> [{self.done_count}/{self.total_count}] Extracted: {html_path}"
-        )
-
-    def extract(self, html_paths):
-        self.html_path = html_paths
-        self.total_count = len(self.html_path)
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(self.extract_single_html, html_path)
-                for html_path in self.html_path
-            ]
-            for idx, future in enumerate(concurrent.futures.as_completed(futures)):
-                result = future.result()
-
-        return self.html_path_and_extracted_content_list
-
-
-
-
-
-# What characters are forbidden in Windows and Linux directory names?
-# https://stackoverflow.com/questions/1976007/what-characters-are-forbidden-in-windows-and-linux-directory-names
-
-INVALID_FILE_PATH_CHARS = [
-    "\\",
-    "/",
-    ":",
-    "*",
-    "?",
-    '"',
-    "<",
-    ">",
-    "|",
-    "\n",
-    "\t",
-    "\r",
-    *[chr(i) for i in range(32)],
-]
-
-WINDOWS_INVALID_FILE_PATH_NAMES = [
-    "con",
-    "prn",
-    "aux",
-    "nul",
-    *[f"com{i+1}" for i in range(10)],
-    *[f"lpt{i+1}" for i in range(10)],
-]
-
-
-class FilepathConverter:
-    def __init__(self, parent: str = None):
-        self.output_root = Path(__file__).parents[1] / "files"
-        self.parent = parent
-
-    def preprocess(self, input_string):
-        return input_string
-
-    def validate(self, input_string):
-        if not input_string:
-            return input_string
-        filename = input_string
-        for char in INVALID_FILE_PATH_CHARS:
-            filename = filename.replace(char, "_")
-        if platform.system() == "Windows":
-            filename_base = filename.split(".")[0]
-            if filename_base.lower() in WINDOWS_INVALID_FILE_PATH_NAMES:
-                filename_base = filename_base + "_"
-            filename = ".".join([filename_base, *filename.split(".")[1:]])
-        return filename
-
-    def append_extension(self, filename, accept_exts=[".html", ".htm"], ext=".html"):
-        if ext:
-            filename_ext = "." + filename.split(".")[-1]
-            if filename_ext.lower() not in accept_exts:
-                filename += ext
-        return filename
-
-    def convert(self, input_string, parent=None):
-        filename = self.preprocess(input_string)
-        filename = self.validate(filename)
-        filename = self.append_extension(filename)
-
-        parent = parent or self.parent
-        parent = self.validate(parent)
-        if parent:
-            filepath = self.output_root / parent / filename
-        else:
-            filepath = self.output_root / filename
-
-        self.filename = filename
-        self.filepath = filepath
-
-        return self.filepath
-
-
-class UrlToFilepathConverter(FilepathConverter):
-    def __init__(self, parent: str = None):
-        super().__init__(parent)
-        self.output_root = self.output_root / "urls"
-
-    def preprocess(self, url):
-        filename = unquote(url.split("//")[1])
-        return filename
-
-
-class QueryToFilepathConverter(FilepathConverter):
-    def __init__(self, parent: str = None):
-        super().__init__(parent)
-        self.output_root = self.output_root / "queries"
-
-
-def add_fillers(text, filler="=", fill_side="both"):
-    terminal_width = shutil.get_terminal_size().columns
-    text = text.strip()
-    text_width = len(text)
-    if text_width >= terminal_width:
-        return text
-
-    if fill_side[0].lower() == "b":
-        leading_fill_str = filler * ((terminal_width - text_width) // 2 - 1) + " "
-        trailing_fill_str = " " + filler * (
-            terminal_width - text_width - len(leading_fill_str) - 1
-        )
-    elif fill_side[0].lower() == "l":
-        leading_fill_str = filler * (terminal_width - text_width - 1) + " "
-        trailing_fill_str = ""
-    elif fill_side[0].lower() == "r":
-        leading_fill_str = ""
-        trailing_fill_str = " " + filler * (terminal_width - text_width - 1)
-    else:
-        raise ValueError("Invalid fill_side")
-
-    filled_str = f"{leading_fill_str}{text}{trailing_fill_str}"
-    return filled_str
-
-
-class OSLogger(logging.Logger):
-    LOG_METHODS = {
-        "err": ("error", "red"),
-        "warn": ("warning", "light_red"),
-        "note": ("info", "light_magenta"),
-        "mesg": ("info", "light_cyan"),
-        "file": ("info", "light_blue"),
-        "line": ("info", "white"),
-        "success": ("info", "light_green"),
-        "fail": ("info", "light_red"),
-        "back": ("debug", "light_cyan"),
-    }
-    INDENT_METHODS = [
-        "indent",
-        "set_indent",
-        "reset_indent",
-        "store_indent",
-        "restore_indent",
-        "log_indent",
-    ]
-    LEVEL_METHODS = [
-        "set_level",
-        "store_level",
-        "restore_level",
-        "quiet",
-        "enter_quiet",
-        "exit_quiet",
-    ]
-    LEVEL_NAMES = {
-        "critical": logging.CRITICAL,
-        "error": logging.ERROR,
-        "warning": logging.WARNING,
-        "info": logging.INFO,
-        "debug": logging.DEBUG,
-    }
-
-    def __init__(self, name=None, prefix=False):
-        if not name:
-            frame = inspect.stack()[1]
-            module = inspect.getmodule(frame[0])
-            name = module.__name__
-
-        super().__init__(name)
-        self.setLevel(logging.INFO)
-
-        if prefix:
-            formatter_prefix = "[%(asctime)s] - [%(name)s] - [%(levelname)s]\n"
-        else:
-            formatter_prefix = ""
-
-        self.formatter = logging.Formatter(formatter_prefix + "%(message)s")
-
-        stream_handler = logging.StreamHandler()
-        stream_handler.setLevel(logging.INFO)
-        stream_handler.setFormatter(self.formatter)
-        self.addHandler(stream_handler)
-
-        self.log_indent = 0
-        self.log_indents = []
-
-        self.log_level = "info"
-        self.log_levels = []
-
-    def indent(self, indent=2):
-        self.log_indent += indent
+import time
+import random

-
-
+class GoogleS:
+    """Google search class to get search results from google.com."""

-
-        self.log_indent = 0
+    _executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=10)

-    def ...
-        self.log_indents.append(self.log_indent)
-
-    def restore_indent(self):
-        self.log_indent = self.log_indents.pop(-1)
-
-    def set_level(self, level):
-        self.log_level = level
-        self.setLevel(self.LEVEL_NAMES[level])
-
-    def store_level(self):
-        self.log_levels.append(self.log_level)
-
-    def restore_level(self):
-        self.log_level = self.log_levels.pop(-1)
-        self.set_level(self.log_level)
-
-    def quiet(self):
-        self.set_level("critical")
-
-    def enter_quiet(self, quiet=False):
-        if quiet:
-            self.store_level()
-            self.quiet()
-
-    def exit_quiet(self, quiet=False):
-        if quiet:
-            self.restore_level()
-
-    def log(
+    def __init__(
         self,
-        ...
-        **kwargs,
-    ):
-        if type(msg) == str:
-            msg_str = msg
-        else:
-            msg_str = repr(msg)
-            quotes = ["'", '"']
-            if msg_str[0] in quotes and msg_str[-1] in quotes:
-                msg_str = msg_str[1:-1]
-
-        indent_str = " " * (self.log_indent + indent)
-        indented_msg = "\n".join([indent_str + line for line in msg_str.split("\n")])
-
-        if fill:
-            indented_msg = add_fillers(indented_msg, fill_side=fill_side)
-
-        handler = self.handlers[0]
-        handler.terminator = end
-
-        getattr(self, level)(colored(indented_msg, color), *args, **kwargs)
-
-    def route_log(self, method, msg, *args, **kwargs):
-        level, method = method
-        functools.partial(self.log, level, method, msg)(*args, **kwargs)
-
-    def err(self, msg: str = "", *args, **kwargs):
-        self.route_log(("error", "red"), msg, *args, **kwargs)
-
-    def warn(self, msg: str = "", *args, **kwargs):
-        self.route_log(("warning", "light_red"), msg, *args, **kwargs)
-
-    def note(self, msg: str = "", *args, **kwargs):
-        self.route_log(("info", "light_magenta"), msg, *args, **kwargs)
-
-    def mesg(self, msg: str = "", *args, **kwargs):
-        self.route_log(("info", "light_cyan"), msg, *args, **kwargs)
-
-    def file(self, msg: str = "", *args, **kwargs):
-        self.route_log(("info", "light_blue"), msg, *args, **kwargs)
-
-    def line(self, msg: str = "", *args, **kwargs):
-        self.route_log(("info", "white"), msg, *args, **kwargs)
-
-    def success(self, msg: str = "", *args, **kwargs):
-        self.route_log(("info", "light_green"), msg, *args, **kwargs)
-
-    def fail(self, msg: str = "", *args, **kwargs):
-        self.route_log(("info", "light_red"), msg, *args, **kwargs)
-
-    def back(self, msg: str = "", *args, **kwargs):
-        self.route_log(("debug", "light_cyan"), msg, *args, **kwargs)
-
-
-logger = OSLogger()
-
-
-def shell_cmd(cmd, getoutput=False, showcmd=True, env=None):
-    if showcmd:
-        logger.info(colored(f"\n$ [{os.getcwd()}]", "light_blue"))
-        logger.info(colored(f" $ {cmd}\n", "light_cyan"))
-    if getoutput:
-        output = subprocess.getoutput(cmd, env=env)
-        return output
-    else:
-        subprocess.run(cmd, shell=True, env=env)
-
-
-class Runtimer:
-    def __enter__(self):
-        self.t1, _ = self.start_time()
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.t2, _ = self.end_time()
-        self.elapsed_time(self.t2 - self.t1)
-
-    def start_time(self):
-        t1 = datetime.datetime.now()
-        self.logger_time("start", t1)
-        return t1, self.time2str(t1)
-
-    def end_time(self):
-        t2 = datetime.datetime.now()
-        self.logger_time("end", t2)
-        return t2, self.time2str(t2)
-
-    def elapsed_time(self, dt=None):
-        if dt is None:
-            dt = self.t2 - self.t1
-        self.logger_time("elapsed", dt)
-        return dt, self.time2str(dt)
-
-    def logger_time(self, time_type, t):
-        time_types = {
-            "start": "Start",
-            "end": "End",
-            "elapsed": "Elapsed",
-        }
-        time_str = add_fillers(
-            colored(
-                f"{time_types[time_type]} time: [ {self.time2str(t)} ]",
-                "light_magenta",
-            ),
-            fill_side="both",
-        )
-        logger.line(time_str)
-
-    # Convert time to string
-    def time2str(self, t):
-        datetime_str_format = "%Y-%m-%d %H:%M:%S"
-        if isinstance(t, datetime.datetime):
-            return t.strftime(datetime_str_format)
-        elif isinstance(t, datetime.timedelta):
-            hours = t.seconds // 3600
-            hour_str = f"{hours} hr" if hours > 0 else ""
-            minutes = (t.seconds // 60) % 60
-            minute_str = f"{minutes:>2} min" if minutes > 0 else ""
-            seconds = t.seconds % 60
-            second_str = f"{seconds:>2} s"
-            time_str = " ".join([hour_str, minute_str, second_str]).strip()
-            return time_str
-        else:
-            return str(t)
-
-
-class OSEnver:
-    def __init__(self):
-        self.envs_stack = []
-        self.envs = os.environ.copy()
-
-    def store_envs(self):
-        self.envs_stack.append(self.envs)
-
-    def restore_envs(self):
-        self.envs = self.envs_stack.pop()
-
-    def set_envs(self, secrets=True, proxies=None, store_envs=True):
-        # caller_info = inspect.stack()[1]
-        # logger.back(f"OS Envs is set by: {caller_info.filename}")
-
-        if store_envs:
-            self.store_envs()
-
-        if secrets:
-            secrets_path = Path(__file__).parents[1] / "secrets.json"
-            if secrets_path.exists():
-                with open(secrets_path, "r") as rf:
-                    secrets = json.load(rf)
-            else:
-                secrets = {}
-
-        if proxies:
-            for proxy_env in ["http_proxy", "https_proxy"]:
-                if isinstance(proxies, str):
-                    self.envs[proxy_env] = proxies
-                elif "http_proxy" in secrets.keys():
-                    self.envs[proxy_env] = secrets["http_proxy"]
-                elif os.getenv("http_proxy"):
-                    self.envs[proxy_env] = os.getenv("http_proxy")
-                else:
-                    continue
-
-        self.proxy = (
-            self.envs.get("all_proxy")
-            or self.envs.get("http_proxy")
-            or self.envs.get("https_proxy")
-            or None
-        )
-        self.requests_proxies = {
-            "http": self.proxy,
-            "https": self.proxy,
+        headers: Optional[Dict[str, str]] = None,
+        proxy: Optional[str] = None,
+        timeout: Optional[int] = 10,
+    ) -> None:
+        """Initialize the GoogleS object."""
+        self.proxy: Optional[str] = proxy
+        self.headers = headers if headers else {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"
         }
+        self.headers["Referer"] = "https://www.google.com/"
+        self.client = requests.Session()
+        self.client.headers.update(self.headers)
+        self.client.proxies.update({"http": self.proxy, "https": self.proxy})
+        self.timeout = timeout

-
-
-
-
-enver = OSEnver()
-
-class GoogleSearcher:
-    def __init__(self):
-        self.url = "https://www.google.com/search"
-        self.enver = enver
-        self.enver.set_envs(proxies=True)
-        self.filepath_converter = QueryToFilepathConverter()
-
-    def send_request(self, result_num=10, safe=False):
-        self.request_response = requests.get(
-            url=self.url,
-            headers=REQUESTS_HEADERS,
-            params={
-                "q": self.query,
-                "num": result_num,
-            },
-            proxies=self.enver.requests_proxies,
-        )
-
-    def save_response(self):
-        if not self.html_path.exists():
-            self.html_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.note(f"Saving to: [{self.html_path}]")
-        with open(self.html_path, "wb") as wf:
-            wf.write(self.request_response.content)
-
-    def search(self, query, result_num=10, safe=False, overwrite=False):
-        self.query = query
-        self.html_path = self.filepath_converter.convert(self.query)
-        logger.note(f"Searching: [{self.query}]")
-        if self.html_path.exists() and not overwrite:
-            logger.success(f"HTML existed: {self.html_path}")
-        else:
-            self.send_request(result_num=result_num, safe=safe)
-            self.save_response()
-        return self.html_path
-
-
-IGNORE_TAGS = ["script", "style", "button"]
-IGNORE_CLASSES = [
-    # common
-    "sidebar",
-    "footer",
-    "related",
-    "comment",
-    "topbar",
-    "offcanvas",
-    "navbar",
-    # 163.com
-    "post_(top)|(side)|(recommends)|(crumb)|(statement)|(next)|(jubao)",
-    "ntes\-.*nav",
-    "nav\-bottom",
-    # wikipedia.org
-    "language\-list",
-    "vector\-(header)|(column)|(sticky\-pinned)|(dropdown\-content)",
-    "navbox",
-    "catlinks",
-]
-
-IGNORE_HOSTS = [
-    "weibo.com",
-    "hymson.com",
-    "yahoo.com",
-]
-
-REQUESTS_HEADERS = {
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
-}
-
-
-
-
-class WebpageFetcher:
-    def __init__(self):
-        self.enver = enver
-        self.enver.set_envs(proxies=True)
-        self.filepath_converter = UrlToFilepathConverter()
+    def __enter__(self) -> "GoogleS":
+        return self

-    def ...
-        self. ...
-        if self.host in IGNORE_HOSTS:
-            return True
-        else:
-            return False
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.client.close()

-    def ...
+    def _get_url(
+        self,
+        method: str,
+        url: str,
+        params: Optional[Dict[str, str]] = None,
+        data: Optional[Union[Dict[str, str], bytes]] = None,
+    ) -> bytes:
         try:
-            self. ...
-            ...
-            if ...
-            ...
+            resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
+        except Exception as ex:
+            raise Exception(f"{url} {type(ex).__name__}: {ex}") from ex
+        if resp.status_code == 200:
+            return resp.content
+        raise Exception(f"{resp.url} returned status code {resp.status_code}. {params=} {data=}")
+
+    def extract_text_from_webpage(self, html_content, max_characters=None):
+        """Extracts visible text from HTML content using BeautifulSoup."""
+        soup = BeautifulSoup(html_content, "html.parser")
+        # Remove unwanted tags
+        for tag in soup(["script", "style", "header", "footer", "nav"]):
+            tag.extract()
+        # Get the remaining visible text
+        visible_text = soup.get_text(strip=True)
+        if max_characters:
+            visible_text = visible_text[:max_characters]
+        return visible_text
+
+    def search(
+        self,
+        keywords: str,
+        region: str = "us-en",
+        lang: str = "en",
+        safe: str = "off",
+        timelimit: Optional[str] = None,
+        max_results: Optional[int] = None,
+        extract_webpage_text: bool = False,
+        max_extract_characters: Optional[int] = 100,
+    ) -> List[Dict[str, str]]:
+        """Google text search."""
+        assert keywords, "keywords is mandatory"
+
+        results = []
+        futures = []
+        start = 0
+        while len(results) < max_results:
+            params = {
+                "q": keywords,
+                "num": 10,  # Number of results per page
+                "hl": lang,
+                "start": start,
+                "safe": safe,
+                "gl": region,
+            }
+            if timelimit:
+                params["tbs"] = f"qdr:{timelimit}"
+
+            futures.append(self._executor.submit(self._get_url, "GET", "https://www.google.com/search", params=params))
+            start += 10
+
+            for future in as_completed(futures):
+                try:
+                    resp_content = future.result()
+                    soup = BeautifulSoup(resp_content, "html.parser")
+                    result_block = soup.find_all("div", class_="g")
+
+                    if not result_block:
+                        break
+
+                    for result in result_block:
+                        try:
+                            link = result.find("a", href=True)
+                            title = result.find("h3")
+                            description_box = result.find(
+                                "div", {"style": "-webkit-line-clamp:2"}
+                            )
+
+                            if link and title and description_box:
+                                url = link["href"]
+                                title = title.text
+                                description = description_box.text
+
+                                visible_text = ""
+                                if extract_webpage_text:
+                                    try:
+                                        page_content = self._get_url("GET", url)
+                                        visible_text = self.extract_text_from_webpage(
+                                            page_content, max_characters=max_extract_characters
+                                        )
+                                    except Exception as e:
+                                        print(f"Error extracting text from {url}: {e}")
+
+                                results.append(
+                                    {
+                                        "title": title,
+                                        "href": url,
+                                        "abstract": description,
+                                        "index": len(results),
+                                        "type": "web",
+                                        "visible_text": visible_text,
+                                    }
+                                )
+
+                                if len(results) >= max_results:
+                                    return results
+
+                        except Exception as e:
+                            print(f"Error extracting result: {e}")

-
-
-        self.total_count = len(self.urls)
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(
-                    self.fecth_single_webpage,
-                    url=url,
-                    overwrite=overwrite,
-                    output_parent=output_parent,
-                )
-                for url in urls
-            ]
+                except Exception as e:
+                    print(f"Error fetching URL: {e}")

-
-            result = future.result()
-        return self.url_and_html_path_list
+        return results


+if __name__ == "__main__":
+    from rich import print
+    searcher = GoogleS()
+    results = searcher.search("HelpingAI-9B", max_results=20, extract_webpage_text=True, max_extract_characters=200)
+    for result in results:
+        print(result)
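As a quick orientation to the rewritten DWEBS.py, here is a minimal usage sketch of the new GoogleS class, modeled on the __main__ block in the diff above. The import path (from webscout.DWEBS import GoogleS) is an assumption based on the file location shown in this diff, and max_results should be passed explicitly since search() compares the collected result count against it.

# Minimal sketch, assuming GoogleS is importable from webscout.DWEBS as laid out in this diff.
from webscout.DWEBS import GoogleS

# __enter__/__exit__ are defined on GoogleS, so it can be used as a context manager
# that closes its underlying requests.Session when the block exits.
with GoogleS(timeout=10) as searcher:
    results = searcher.search(
        "HelpingAI-9B",
        max_results=20,             # search() paginates 10 results per page until this count is reached
        extract_webpage_text=True,  # also fetch each hit and keep a trimmed text snippet
        max_extract_characters=200,
    )

for result in results:
    print(result["title"], result["href"])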