webscout 3.4-py3-none-any.whl → 3.6-py3-none-any.whl

This diff compares the contents of two package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in those registries.
webscout/DWEBS.py CHANGED
@@ -1,197 +1,793 @@
1
+ from bs4 import BeautifulSoup
2
+ from pathlib import Path
3
+ import platform
4
+ import re
5
+ import concurrent.futures
6
+ import requests
7
+ import tldextract
8
+ import json
9
+ import os
10
+ import shutil
11
+ import subprocess
12
+ import datetime
13
+ import functools
14
+ import inspect
15
+ import logging
1
16
 
2
- from pydantic import BaseModel, Field
3
- from typing import Union
17
+ from urllib.parse import quote, unquote
18
+ from tiktoken import get_encoding as tiktoken_get_encoding
19
+ from markdownify import markdownify
20
+ from termcolor import colored
21
+ class QueryResultsExtractor:
22
+ def __init__(self) -> None:
23
+ self.query_results = []
24
+ self.related_questions = []
4
25
 
5
- from DeepWEBS.utilsdw.logger import logger
6
- from DeepWEBS.networks.google_searcher import GoogleSearcher
7
- from DeepWEBS.networks.webpage_fetcher import BatchWebpageFetcher
8
- from DeepWEBS.documents.query_results_extractor import QueryResultsExtractor
9
- from DeepWEBS.documents.webpage_content_extractor import BatchWebpageContentExtractor
10
- from DeepWEBS.utilsdw.logger import logger
11
- import argparse
26
+ def load_html(self, html_path):
27
+ try:
28
+ with open(html_path, "r", encoding="utf-8") as f:
29
+ html = f.read()
30
+ self.soup = BeautifulSoup(html, "html.parser")
31
+ except FileNotFoundError:
32
+ logger.error(f"File not found: {html_path}")
33
+ except Exception as e:
34
+ logger.error(f"Error loading HTML: {e}")
12
35
 
13
- class DeepWEBS:
36
+ def extract_query_results(self):
37
+ try:
38
+ self.query = self.soup.find("textarea").text.strip()
39
+ query_result_elements = self.soup.find_all("div", class_="g")
40
+ for idx, result in enumerate(query_result_elements):
41
+ try:
42
+ site = result.find("cite").find_previous("span").text.strip()
43
+ url = result.find("a")["href"]
44
+ title = result.find("h3").text.strip()
45
+ abstract_element_conditions = [
46
+ {"data-sncf": "1"},
47
+ {"class_": "ITZIwc"},
48
+ ]
49
+ for condition in abstract_element_conditions:
50
+ abstract_element = result.find("div", condition)
51
+ if abstract_element is not None:
52
+ abstract = abstract_element.text.strip()
53
+ break
54
+ else:
55
+ abstract = ""
56
+ logger.mesg(
57
+ f"{title}\n"
58
+ f" - {site}\n"
59
+ f" - {url}\n"
60
+ f" - {abstract}\n"
61
+ f"\n"
62
+ )
63
+ self.query_results.append(
64
+ {
65
+ "title": title,
66
+ "site": site,
67
+ "url": url,
68
+ "abstract": abstract,
69
+ "index": idx,
70
+ "type": "web",
71
+ }
72
+ )
73
+ except Exception as e:
74
+ logger.error(f"Error extracting query result: {e}")
75
+ logger.success(f"- {len(query_result_elements)} query results")
76
+ except Exception as e:
77
+ logger.error(f"Error extracting query results: {e}")
78
+
79
+ def extract_related_questions(self):
80
+ try:
81
+ related_question_elements = self.soup.find_all(
82
+ "div", class_="related-question-pair"
83
+ )
84
+ for question_element in related_question_elements:
85
+ try:
86
+ question = question_element.find("span").text.strip()
87
+ print(question)
88
+ self.related_questions.append(question)
89
+ except Exception as e:
90
+ logger.error(f"Error extracting related question: {e}")
91
+ logger.success(f"- {len(self.related_questions)} related questions")
92
+ except Exception as e:
93
+ logger.error(f"Error extracting related questions: {e}")
94
+
95
+ def extract(self, html_path):
96
+ self.load_html(html_path)
97
+ self.extract_query_results()
98
+ self.extract_related_questions()
99
+ self.search_results = {
100
+ "query": self.query,
101
+ "query_results": self.query_results,
102
+ "related_questions": self.related_questions,
103
+ }
104
+ return self.search_results
105
+
106
+
107
+
108
+
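Illustrative usage sketch for QueryResultsExtractor: it parses a previously saved Google results page and returns the query, the per-result dicts, and any related questions. The import path and the "google_results.html" filename are assumptions for this example.

from webscout.DWEBS import QueryResultsExtractor  # assumed import path

extractor = QueryResultsExtractor()
# "google_results.html" is a hypothetical, previously saved Google results page.
search_results = extractor.extract("google_results.html")
print(search_results["query"])
for item in search_results["query_results"]:
    print(item["index"], item["title"], item["url"])
print(search_results["related_questions"])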
109
+ class WebpageContentExtractor:
14
110
  def __init__(self):
15
- pass
111
+ self.tokenizer = tiktoken_get_encoding("cl100k_base")
16
112
 
17
- class DeepSearch(BaseModel):
18
- queries: list = Field(
19
- default=[""],
20
- description="(list[str]) Queries to search",
21
- )
22
- result_num: int = Field(
23
- default=10,
24
- description="(int) Number of search results",
25
- )
26
- safe: bool = Field(
27
- default=False,
28
- description="(bool) Enable SafeSearch",
29
- )
30
- types: list = Field(
31
- default=["web"],
32
- description="(list[str]) Types of search results: `web`, `image`, `videos`, `news`",
33
- )
34
- extract_webpage: bool = Field(
35
- default=False,
36
- description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
37
- )
38
- overwrite_query_html: bool = Field(
39
- default=False,
40
- description="(bool) Overwrite HTML file of query results",
41
- )
42
- overwrite_webpage_html: bool = Field(
43
- default=False,
44
- description="(bool) Overwrite HTML files of webpages from query results",
45
- )
113
+ def count_tokens(self, text):
114
+ tokens = self.tokenizer.encode(text)
115
+ token_count = len(tokens)
116
+ return token_count
46
117
 
47
- def queries_to_search_results(self, item: DeepSearch):
48
- google_searcher = GoogleSearcher()
49
- queries_search_results = []
50
- for query in item.queries:
51
- query_results_extractor = QueryResultsExtractor()
52
- if not query.strip():
53
- continue
54
- try:
55
- query_html_path = google_searcher.search(
56
- query=query,
57
- result_num=item.result_num,
58
- safe=item.safe,
59
- overwrite=item.overwrite_query_html,
60
- )
61
- except Exception as e:
62
- logger.error(f"Failed to search for query '{query}': {e}")
63
- continue
118
+ def html_to_markdown(self, html_str, ignore_links=True):
119
+ if ignore_links:
120
+ markdown_str = markdownify(html_str, strip="a")
121
+ else:
122
+ markdown_str = markdownify(html_str)
123
+ markdown_str = re.sub(r"\n{3,}", "\n\n", markdown_str)
64
124
 
65
- try:
66
- query_search_results = query_results_extractor.extract(query_html_path)
67
- except Exception as e:
68
- logger.error(f"Failed to extract search results for query '{query}': {e}")
69
- continue
70
-
71
- queries_search_results.append(query_search_results)
72
- logger.note(queries_search_results)
73
-
74
- if item.extract_webpage:
75
- queries_search_results = self.extract_webpages(
76
- queries_search_results,
77
- overwrite_webpage_html=item.overwrite_webpage_html,
78
- )
79
- return queries_search_results
125
+ self.markdown_token_count = self.count_tokens(markdown_str)
126
+ logger.mesg(f'- Tokens: {colored(self.markdown_token_count,"light_green")}')
127
+
128
+ self.markdown_str = markdown_str
129
+
130
+ return self.markdown_str
80
131
 
81
- def extract_webpages(self, queries_search_results, overwrite_webpage_html=False):
82
- for query_idx, query_search_results in enumerate(queries_search_results):
132
+ def remove_elements_from_html(self, html_str):
133
+ soup = BeautifulSoup(html_str, "html.parser")
134
+ ignore_classes_with_parentheses = [f"({word})" for word in IGNORE_CLASSES]
135
+ ignore_classes_pattern = f'{"|".join(ignore_classes_with_parentheses)}'
136
+ removed_element_counts = 0
137
+ for element in soup.find_all():
138
+ class_str = ""
139
+ id_str = ""
83
140
  try:
84
- # Fetch webpages with urls
85
- batch_webpage_fetcher = BatchWebpageFetcher()
86
- urls = [
87
- query_result["url"]
88
- for query_result in query_search_results["query_results"]
89
- ]
90
- url_and_html_path_list = batch_webpage_fetcher.fetch(
91
- urls,
92
- overwrite=overwrite_webpage_html,
93
- output_parent=query_search_results["query"],
94
- )
95
- except Exception as e:
96
- logger.error(f"Failed to fetch webpages for query '{query_search_results['query']}': {e}")
97
- continue
98
-
99
- # Extract webpage contents from htmls
100
- html_paths = [
101
- str(url_and_html_path["html_path"])
102
- for url_and_html_path in url_and_html_path_list
103
- ]
104
- batch_webpage_content_extractor = BatchWebpageContentExtractor()
141
+ class_attr = element.get("class", [])
142
+ if class_attr:
143
+ class_str = " ".join(list(class_attr))
144
+ if id_str:
145
+ class_str = f"{class_str} {id_str}"
146
+ except:
147
+ pass
148
+
105
149
  try:
106
- html_path_and_extracted_content_list = (
107
- batch_webpage_content_extractor.extract(html_paths)
108
- )
109
- except Exception as e:
110
- logger.error(f"Failed to extract webpage contents for query '{query_search_results['query']}': {e}")
111
- continue
112
-
113
- # Build the map of url to extracted_content
114
- html_path_to_url_dict = {
115
- str(url_and_html_path["html_path"]): url_and_html_path["url"]
116
- for url_and_html_path in url_and_html_path_list
117
- }
118
- url_to_extracted_content_dict = {
119
- html_path_to_url_dict[
120
- html_path_and_extracted_content["html_path"]
121
- ]: html_path_and_extracted_content["extracted_content"]
122
- for html_path_and_extracted_content in html_path_and_extracted_content_list
123
- }
124
-
125
- # Write extracted contents (as 'text' field) to query_search_results
126
- for query_result_idx, query_result in enumerate(
127
- query_search_results["query_results"]
150
+ id_str = element.get("id", "")
151
+ except:
152
+ pass
153
+
154
+ if (
155
+ (not element.text.strip())
156
+ or (element.name in IGNORE_TAGS)
157
+ or (re.search(ignore_classes_pattern, class_str, flags=re.IGNORECASE))
158
+ or (re.search(ignore_classes_pattern, id_str, flags=re.IGNORECASE))
128
159
  ):
129
- url = query_result["url"]
130
- extracted_content = url_to_extracted_content_dict.get(url, "")
131
- queries_search_results[query_idx]["query_results"][query_result_idx][
132
- "text"
133
- ] = extracted_content
134
-
135
- return queries_search_results
136
-
137
-
138
- class ArgParser(argparse.ArgumentParser):
139
- def __init__(self, *args, **kwargs):
140
- super(ArgParser, self).__init__(*args, **kwargs)
141
-
142
- self.add_argument(
143
- "-q",
144
- "--queries",
145
- type=str,
146
- nargs="+",
147
- required=True,
148
- help="Queries to search",
160
+ element.decompose()
161
+ removed_element_counts += 1
162
+
163
+ logger.mesg(
164
+ f"- Elements: "
165
+ f'{colored(len(soup.find_all()),"light_green")} / {colored(removed_element_counts,"light_red")}'
149
166
  )
150
- self.add_argument(
151
- "-n",
152
- "--result_num",
153
- type=int,
154
- default=10,
155
- help="Number of search results",
167
+
168
+ html_str = str(soup)
169
+ self.html_str = html_str
170
+
171
+ return self.html_str
172
+
173
+ def extract(self, html_path):
174
+ logger.note(f"Extracting content from: {html_path}")
175
+
176
+ if not Path(html_path).exists():
177
+ logger.warn(f"File not found: {html_path}")
178
+ return ""
179
+
180
+ encodings = ["utf-8", "latin-1"]
181
+ for encoding in encodings:
182
+ try:
183
+ with open(html_path, "r", encoding=encoding, errors="ignore") as rf:
184
+ html_str = rf.read()
185
+ break
186
+ except UnicodeDecodeError:
187
+ pass
188
+ else:
189
+ logger.warn(f"No matching encodings: {html_path}")
190
+ return ""
191
+
192
+ html_str = self.remove_elements_from_html(html_str)
193
+ markdown_str = self.html_to_markdown(html_str)
194
+ return markdown_str
195
+
196
+
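Illustrative sketch for WebpageContentExtractor, which strips ignored tags and classes from a saved HTML file, converts the remainder to markdown, and logs a tiktoken token count. The path is hypothetical and assumed to exist.

from webscout.DWEBS import WebpageContentExtractor  # assumed import path

content_extractor = WebpageContentExtractor()
markdown_text = content_extractor.extract("saved_page.html")  # hypothetical saved webpage
print(content_extractor.markdown_token_count)
print(markdown_text[:300])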
197
+ class BatchWebpageContentExtractor:
198
+ def __init__(self) -> None:
199
+ self.html_path_and_extracted_content_list = []
200
+ self.done_count = 0
201
+
202
+ def extract_single_html(self, html_path):
203
+ webpage_content_extractor = WebpageContentExtractor()
204
+ extracted_content = webpage_content_extractor.extract(html_path)
205
+ self.html_path_and_extracted_content_list.append(
206
+ {"html_path": html_path, "extracted_content": extracted_content}
156
207
  )
157
- self.add_argument(
158
- "-s",
159
- "--safe",
160
- default=False,
161
- action="store_true",
162
- help="Enable SafeSearch",
208
+ self.done_count += 1
209
+ logger.success(
210
+ f"> [{self.done_count}/{self.total_count}] Extracted: {html_path}"
163
211
  )
164
- self.add_argument(
165
- "-t",
166
- "--types",
167
- type=str,
168
- nargs="+",
169
- default=["web"],
170
- choices=["web", "image", "videos", "news"],
171
- help="Types of search results",
212
+
213
+ def extract(self, html_paths):
214
+ self.html_path = html_paths
215
+ self.total_count = len(self.html_path)
216
+ with concurrent.futures.ThreadPoolExecutor() as executor:
217
+ futures = [
218
+ executor.submit(self.extract_single_html, html_path)
219
+ for html_path in self.html_path
220
+ ]
221
+ for idx, future in enumerate(concurrent.futures.as_completed(futures)):
222
+ result = future.result()
223
+
224
+ return self.html_path_and_extracted_content_list
225
+
226
+
227
+
228
+
229
+
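A sketch of the batch variant, which extracts several saved pages concurrently via a thread pool. Because results are appended as worker threads finish, the returned list is not guaranteed to preserve input order. Paths are hypothetical.

from webscout.DWEBS import BatchWebpageContentExtractor  # assumed import path

batch_extractor = BatchWebpageContentExtractor()
pairs = batch_extractor.extract(["page_1.html", "page_2.html"])  # hypothetical saved pages
for pair in pairs:
    print(pair["html_path"], len(pair["extracted_content"]))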
230
+ # What characters are forbidden in Windows and Linux directory names?
231
+ # https://stackoverflow.com/questions/1976007/what-characters-are-forbidden-in-windows-and-linux-directory-names
232
+
233
+ INVALID_FILE_PATH_CHARS = [
234
+ "\\",
235
+ "/",
236
+ ":",
237
+ "*",
238
+ "?",
239
+ '"',
240
+ "<",
241
+ ">",
242
+ "|",
243
+ "\n",
244
+ "\t",
245
+ "\r",
246
+ *[chr(i) for i in range(32)],
247
+ ]
248
+
249
+ WINDOWS_INVALID_FILE_PATH_NAMES = [
250
+ "con",
251
+ "prn",
252
+ "aux",
253
+ "nul",
254
+ *[f"com{i+1}" for i in range(10)],
255
+ *[f"lpt{i+1}" for i in range(10)],
256
+ ]
257
+
258
+
259
+ class FilepathConverter:
260
+ def __init__(self, parent: str = None):
261
+ self.output_root = Path(__file__).parents[1] / "files"
262
+ self.parent = parent
263
+
264
+ def preprocess(self, input_string):
265
+ return input_string
266
+
267
+ def validate(self, input_string):
268
+ if not input_string:
269
+ return input_string
270
+ filename = input_string
271
+ for char in INVALID_FILE_PATH_CHARS:
272
+ filename = filename.replace(char, "_")
273
+ if platform.system() == "Windows":
274
+ filename_base = filename.split(".")[0]
275
+ if filename_base.lower() in WINDOWS_INVALID_FILE_PATH_NAMES:
276
+ filename_base = filename_base + "_"
277
+ filename = ".".join([filename_base, *filename.split(".")[1:]])
278
+ return filename
279
+
280
+ def append_extension(self, filename, accept_exts=[".html", ".htm"], ext=".html"):
281
+ if ext:
282
+ filename_ext = "." + filename.split(".")[-1]
283
+ if filename_ext.lower() not in accept_exts:
284
+ filename += ext
285
+ return filename
286
+
287
+ def convert(self, input_string, parent=None):
288
+ filename = self.preprocess(input_string)
289
+ filename = self.validate(filename)
290
+ filename = self.append_extension(filename)
291
+
292
+ parent = parent or self.parent
293
+ parent = self.validate(parent)
294
+ if parent:
295
+ filepath = self.output_root / parent / filename
296
+ else:
297
+ filepath = self.output_root / filename
298
+
299
+ self.filename = filename
300
+ self.filepath = filepath
301
+
302
+ return self.filepath
303
+
304
+
305
+ class UrlToFilepathConverter(FilepathConverter):
306
+ def __init__(self, parent: str = None):
307
+ super().__init__(parent)
308
+ self.output_root = self.output_root / "urls"
309
+
310
+ def preprocess(self, url):
311
+ filename = unquote(url.split("//")[1])
312
+ return filename
313
+
314
+
315
+ class QueryToFilepathConverter(FilepathConverter):
316
+ def __init__(self, parent: str = None):
317
+ super().__init__(parent)
318
+ self.output_root = self.output_root / "queries"
319
+
320
+
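A sketch of the two filepath converters: query pages are cached under files/queries/ and fetched webpages under files/urls/<parent>/, with invalid path characters replaced and an .html extension appended. The query string and URL below are only examples.

from webscout.DWEBS import QueryToFilepathConverter, UrlToFilepathConverter  # assumed import path

query_converter = QueryToFilepathConverter()
print(query_converter.convert("python datetime timezone"))
# -> .../files/queries/python datetime timezone.html

url_converter = UrlToFilepathConverter(parent="python datetime timezone")
print(url_converter.convert("https://docs.python.org/3/library/datetime.html"))
# -> .../files/urls/python datetime timezone/docs.python.org_3_library_datetime.html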
321
+ def add_fillers(text, filler="=", fill_side="both"):
322
+ terminal_width = shutil.get_terminal_size().columns
323
+ text = text.strip()
324
+ text_width = len(text)
325
+ if text_width >= terminal_width:
326
+ return text
327
+
328
+ if fill_side[0].lower() == "b":
329
+ leading_fill_str = filler * ((terminal_width - text_width) // 2 - 1) + " "
330
+ trailing_fill_str = " " + filler * (
331
+ terminal_width - text_width - len(leading_fill_str) - 1
332
+ )
333
+ elif fill_side[0].lower() == "l":
334
+ leading_fill_str = filler * (terminal_width - text_width - 1) + " "
335
+ trailing_fill_str = ""
336
+ elif fill_side[0].lower() == "r":
337
+ leading_fill_str = ""
338
+ trailing_fill_str = " " + filler * (terminal_width - text_width - 1)
339
+ else:
340
+ raise ValueError("Invalid fill_side")
341
+
342
+ filled_str = f"{leading_fill_str}{text}{trailing_fill_str}"
343
+ return filled_str
344
+
345
+
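add_fillers pads a line with a filler character out to the current terminal width; a quick sketch (output widths depend on the terminal):

from webscout.DWEBS import add_fillers  # assumed import path

print(add_fillers("Elapsed time", filler="=", fill_side="both"))   # ==== Elapsed time ====
print(add_fillers("done", filler="-", fill_side="right"))          # done ------------------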
346
+ class OSLogger(logging.Logger):
347
+ LOG_METHODS = {
348
+ "err": ("error", "red"),
349
+ "warn": ("warning", "light_red"),
350
+ "note": ("info", "light_magenta"),
351
+ "mesg": ("info", "light_cyan"),
352
+ "file": ("info", "light_blue"),
353
+ "line": ("info", "white"),
354
+ "success": ("info", "light_green"),
355
+ "fail": ("info", "light_red"),
356
+ "back": ("debug", "light_cyan"),
357
+ }
358
+ INDENT_METHODS = [
359
+ "indent",
360
+ "set_indent",
361
+ "reset_indent",
362
+ "store_indent",
363
+ "restore_indent",
364
+ "log_indent",
365
+ ]
366
+ LEVEL_METHODS = [
367
+ "set_level",
368
+ "store_level",
369
+ "restore_level",
370
+ "quiet",
371
+ "enter_quiet",
372
+ "exit_quiet",
373
+ ]
374
+ LEVEL_NAMES = {
375
+ "critical": logging.CRITICAL,
376
+ "error": logging.ERROR,
377
+ "warning": logging.WARNING,
378
+ "info": logging.INFO,
379
+ "debug": logging.DEBUG,
380
+ }
381
+
382
+ def __init__(self, name=None, prefix=False):
383
+ if not name:
384
+ frame = inspect.stack()[1]
385
+ module = inspect.getmodule(frame[0])
386
+ name = module.__name__
387
+
388
+ super().__init__(name)
389
+ self.setLevel(logging.INFO)
390
+
391
+ if prefix:
392
+ formatter_prefix = "[%(asctime)s] - [%(name)s] - [%(levelname)s]\n"
393
+ else:
394
+ formatter_prefix = ""
395
+
396
+ self.formatter = logging.Formatter(formatter_prefix + "%(message)s")
397
+
398
+ stream_handler = logging.StreamHandler()
399
+ stream_handler.setLevel(logging.INFO)
400
+ stream_handler.setFormatter(self.formatter)
401
+ self.addHandler(stream_handler)
402
+
403
+ self.log_indent = 0
404
+ self.log_indents = []
405
+
406
+ self.log_level = "info"
407
+ self.log_levels = []
408
+
409
+ def indent(self, indent=2):
410
+ self.log_indent += indent
411
+
412
+ def set_indent(self, indent=2):
413
+ self.log_indent = indent
414
+
415
+ def reset_indent(self):
416
+ self.log_indent = 0
417
+
418
+ def store_indent(self):
419
+ self.log_indents.append(self.log_indent)
420
+
421
+ def restore_indent(self):
422
+ self.log_indent = self.log_indents.pop(-1)
423
+
424
+ def set_level(self, level):
425
+ self.log_level = level
426
+ self.setLevel(self.LEVEL_NAMES[level])
427
+
428
+ def store_level(self):
429
+ self.log_levels.append(self.log_level)
430
+
431
+ def restore_level(self):
432
+ self.log_level = self.log_levels.pop(-1)
433
+ self.set_level(self.log_level)
434
+
435
+ def quiet(self):
436
+ self.set_level("critical")
437
+
438
+ def enter_quiet(self, quiet=False):
439
+ if quiet:
440
+ self.store_level()
441
+ self.quiet()
442
+
443
+ def exit_quiet(self, quiet=False):
444
+ if quiet:
445
+ self.restore_level()
446
+
447
+ def log(
448
+ self,
449
+ level,
450
+ color,
451
+ msg,
452
+ indent=0,
453
+ fill=False,
454
+ fill_side="both",
455
+ end="\n",
456
+ *args,
457
+ **kwargs,
458
+ ):
459
+ if type(msg) == str:
460
+ msg_str = msg
461
+ else:
462
+ msg_str = repr(msg)
463
+ quotes = ["'", '"']
464
+ if msg_str[0] in quotes and msg_str[-1] in quotes:
465
+ msg_str = msg_str[1:-1]
466
+
467
+ indent_str = " " * (self.log_indent + indent)
468
+ indented_msg = "\n".join([indent_str + line for line in msg_str.split("\n")])
469
+
470
+ if fill:
471
+ indented_msg = add_fillers(indented_msg, fill_side=fill_side)
472
+
473
+ handler = self.handlers[0]
474
+ handler.terminator = end
475
+
476
+ getattr(self, level)(colored(indented_msg, color), *args, **kwargs)
477
+
478
+ def route_log(self, method, msg, *args, **kwargs):
479
+ level, method = method
480
+ functools.partial(self.log, level, method, msg)(*args, **kwargs)
481
+
482
+ def err(self, msg: str = "", *args, **kwargs):
483
+ self.route_log(("error", "red"), msg, *args, **kwargs)
484
+
485
+ def warn(self, msg: str = "", *args, **kwargs):
486
+ self.route_log(("warning", "light_red"), msg, *args, **kwargs)
487
+
488
+ def note(self, msg: str = "", *args, **kwargs):
489
+ self.route_log(("info", "light_magenta"), msg, *args, **kwargs)
490
+
491
+ def mesg(self, msg: str = "", *args, **kwargs):
492
+ self.route_log(("info", "light_cyan"), msg, *args, **kwargs)
493
+
494
+ def file(self, msg: str = "", *args, **kwargs):
495
+ self.route_log(("info", "light_blue"), msg, *args, **kwargs)
496
+
497
+ def line(self, msg: str = "", *args, **kwargs):
498
+ self.route_log(("info", "white"), msg, *args, **kwargs)
499
+
500
+ def success(self, msg: str = "", *args, **kwargs):
501
+ self.route_log(("info", "light_green"), msg, *args, **kwargs)
502
+
503
+ def fail(self, msg: str = "", *args, **kwargs):
504
+ self.route_log(("info", "light_red"), msg, *args, **kwargs)
505
+
506
+ def back(self, msg: str = "", *args, **kwargs):
507
+ self.route_log(("debug", "light_cyan"), msg, *args, **kwargs)
508
+
509
+
510
+ logger = OSLogger()
511
+
512
+
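The module-level logger exposes the colored convenience methods (note, mesg, success, warn, ...) plus the indent and quiet-level helpers used throughout this file; a short sketch of how they compose:

from webscout.DWEBS import logger  # assumed import path

logger.note("Searching: [python datetime timezone]")   # light_magenta, INFO level
logger.indent(2)                                        # indent subsequent messages by two spaces
logger.success("- 10 query results")
logger.reset_indent()

logger.enter_quiet(quiet=True)                          # temporarily raise the level to CRITICAL
logger.mesg("this message is suppressed")
logger.exit_quiet(quiet=True)                           # restore the previous level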
513
+ def shell_cmd(cmd, getoutput=False, showcmd=True, env=None):
514
+ if showcmd:
515
+ logger.info(colored(f"\n$ [{os.getcwd()}]", "light_blue"))
516
+ logger.info(colored(f" $ {cmd}\n", "light_cyan"))
517
+ if getoutput:
518
+ output = subprocess.getoutput(cmd, env=env)
519
+ return output
520
+ else:
521
+ subprocess.run(cmd, shell=True, env=env)
522
+
523
+
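A sketch of shell_cmd on its default path, which echoes the working directory and command before running it through subprocess.run(shell=True). Note that the getoutput=True branch passes env= to subprocess.getoutput, which the standard library's getoutput does not accept.

from webscout.DWEBS import shell_cmd  # assumed import path

shell_cmd("echo hello")   # prints the cwd and the command, then runs it via subprocess.run(shell=True)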
524
+ class Runtimer:
525
+ def __enter__(self):
526
+ self.t1, _ = self.start_time()
527
+ return self
528
+
529
+ def __exit__(self, exc_type, exc_value, traceback):
530
+ self.t2, _ = self.end_time()
531
+ self.elapsed_time(self.t2 - self.t1)
532
+
533
+ def start_time(self):
534
+ t1 = datetime.datetime.now()
535
+ self.logger_time("start", t1)
536
+ return t1, self.time2str(t1)
537
+
538
+ def end_time(self):
539
+ t2 = datetime.datetime.now()
540
+ self.logger_time("end", t2)
541
+ return t2, self.time2str(t2)
542
+
543
+ def elapsed_time(self, dt=None):
544
+ if dt is None:
545
+ dt = self.t2 - self.t1
546
+ self.logger_time("elapsed", dt)
547
+ return dt, self.time2str(dt)
548
+
549
+ def logger_time(self, time_type, t):
550
+ time_types = {
551
+ "start": "Start",
552
+ "end": "End",
553
+ "elapsed": "Elapsed",
554
+ }
555
+ time_str = add_fillers(
556
+ colored(
557
+ f"{time_types[time_type]} time: [ {self.time2str(t)} ]",
558
+ "light_magenta",
559
+ ),
560
+ fill_side="both",
172
561
  )
173
- self.add_argument(
174
- "-e",
175
- "--extract_webpage",
176
- default=False,
177
- action="store_true",
178
- help="Enable extracting main text contents from webpage",
562
+ logger.line(time_str)
563
+
564
+ # Convert time to string
565
+ def time2str(self, t):
566
+ datetime_str_format = "%Y-%m-%d %H:%M:%S"
567
+ if isinstance(t, datetime.datetime):
568
+ return t.strftime(datetime_str_format)
569
+ elif isinstance(t, datetime.timedelta):
570
+ hours = t.seconds // 3600
571
+ hour_str = f"{hours} hr" if hours > 0 else ""
572
+ minutes = (t.seconds // 60) % 60
573
+ minute_str = f"{minutes:>2} min" if minutes > 0 else ""
574
+ seconds = t.seconds % 60
575
+ second_str = f"{seconds:>2} s"
576
+ time_str = " ".join([hour_str, minute_str, second_str]).strip()
577
+ return time_str
578
+ else:
579
+ return str(t)
580
+
581
+
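Runtimer is a context manager that logs start, end, and elapsed times around a block; a minimal sketch:

import time
from webscout.DWEBS import Runtimer  # assumed import path

with Runtimer():          # logs the start time, then the end and elapsed times on exit
    time.sleep(1.5)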
582
+ class OSEnver:
583
+ def __init__(self):
584
+ self.envs_stack = []
585
+ self.envs = os.environ.copy()
586
+
587
+ def store_envs(self):
588
+ self.envs_stack.append(self.envs)
589
+
590
+ def restore_envs(self):
591
+ self.envs = self.envs_stack.pop()
592
+
593
+ def set_envs(self, secrets=True, proxies=None, store_envs=True):
594
+ # caller_info = inspect.stack()[1]
595
+ # logger.back(f"OS Envs is set by: {caller_info.filename}")
596
+
597
+ if store_envs:
598
+ self.store_envs()
599
+
600
+ if secrets:
601
+ secrets_path = Path(__file__).parents[1] / "secrets.json"
602
+ if secrets_path.exists():
603
+ with open(secrets_path, "r") as rf:
604
+ secrets = json.load(rf)
605
+ else:
606
+ secrets = {}
607
+
608
+ if proxies:
609
+ for proxy_env in ["http_proxy", "https_proxy"]:
610
+ if isinstance(proxies, str):
611
+ self.envs[proxy_env] = proxies
612
+ elif "http_proxy" in secrets.keys():
613
+ self.envs[proxy_env] = secrets["http_proxy"]
614
+ elif os.getenv("http_proxy"):
615
+ self.envs[proxy_env] = os.getenv("http_proxy")
616
+ else:
617
+ continue
618
+
619
+ self.proxy = (
620
+ self.envs.get("all_proxy")
621
+ or self.envs.get("http_proxy")
622
+ or self.envs.get("https_proxy")
623
+ or None
179
624
  )
180
- self.add_argument(
181
- "-o",
182
- "--overwrite_query_html",
183
- default=False,
184
- action="store_true",
185
- help="Overwrite HTML file of query results",
625
+ self.requests_proxies = {
626
+ "http": self.proxy,
627
+ "https": self.proxy,
628
+ }
629
+
630
+ if self.proxy:
631
+ logger.note(f"Using proxy: [{self.proxy}]")
632
+
633
+
634
+ enver = OSEnver()
635
+
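The shared enver instance picks up a proxy from a secrets.json file (resolved relative to the installed module) or from the http_proxy environment variable, and exposes it in the shape requests expects; a sketch:

from webscout.DWEBS import enver  # assumed import path

enver.set_envs(proxies=True)       # picks up a proxy from secrets.json or $http_proxy when available
print(enver.proxy)                 # e.g. "http://127.0.0.1:7890", or None when unset
print(enver.requests_proxies)      # {"http": ..., "https": ...} as passed to requests.get(proxies=...)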
636
+ class GoogleSearcher:
637
+ def __init__(self):
638
+ self.url = "https://www.google.com/search"
639
+ self.enver = enver
640
+ self.enver.set_envs(proxies=True)
641
+ self.filepath_converter = QueryToFilepathConverter()
642
+
643
+ def send_request(self, result_num=10, safe=False):
644
+ self.request_response = requests.get(
645
+ url=self.url,
646
+ headers=REQUESTS_HEADERS,
647
+ params={
648
+ "q": self.query,
649
+ "num": result_num,
650
+ },
651
+ proxies=self.enver.requests_proxies,
186
652
  )
187
- self.add_argument(
188
- "-w",
189
- "--overwrite_webpage_html",
190
- default=False,
191
- action="store_true",
192
- help="Overwrite HTML files of webpages from query results",
653
+
654
+ def save_response(self):
655
+ if not self.html_path.exists():
656
+ self.html_path.parent.mkdir(parents=True, exist_ok=True)
657
+ logger.note(f"Saving to: [{self.html_path}]")
658
+ with open(self.html_path, "wb") as wf:
659
+ wf.write(self.request_response.content)
660
+
661
+ def search(self, query, result_num=10, safe=False, overwrite=False):
662
+ self.query = query
663
+ self.html_path = self.filepath_converter.convert(self.query)
664
+ logger.note(f"Searching: [{self.query}]")
665
+ if self.html_path.exists() and not overwrite:
666
+ logger.success(f"HTML existed: {self.html_path}")
667
+ else:
668
+ self.send_request(result_num=result_num, safe=safe)
669
+ self.save_response()
670
+ return self.html_path
671
+
672
+
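A sketch of GoogleSearcher, which issues the search request (through any configured proxy) and caches the raw results HTML under files/queries/, reusing the cached file on later calls unless overwrite=True. The query string is only an example, and live requests remain subject to Google's blocking and consent pages.

from webscout.DWEBS import GoogleSearcher  # assumed import path

searcher = GoogleSearcher()
html_path = searcher.search("python datetime timezone", result_num=10, overwrite=False)
print(html_path)   # cached results page; feed this to QueryResultsExtractor.extract()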
673
+ IGNORE_TAGS = ["script", "style", "button"]
674
+ IGNORE_CLASSES = [
675
+ # common
676
+ "sidebar",
677
+ "footer",
678
+ "related",
679
+ "comment",
680
+ "topbar",
681
+ "offcanvas",
682
+ "navbar",
683
+ # 163.com
684
+ "post_(top)|(side)|(recommends)|(crumb)|(statement)|(next)|(jubao)",
685
+ "ntes\-.*nav",
686
+ "nav\-bottom",
687
+ # wikipedia.org
688
+ "language\-list",
689
+ "vector\-(header)|(column)|(sticky\-pinned)|(dropdown\-content)",
690
+ "navbox",
691
+ "catlinks",
692
+ ]
693
+
694
+ IGNORE_HOSTS = [
695
+ "weibo.com",
696
+ "hymson.com",
697
+ "yahoo.com",
698
+ ]
699
+
700
+ REQUESTS_HEADERS = {
701
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
702
+ }
703
+
704
+
705
+
706
+
707
+ class WebpageFetcher:
708
+ def __init__(self):
709
+ self.enver = enver
710
+ self.enver.set_envs(proxies=True)
711
+ self.filepath_converter = UrlToFilepathConverter()
712
+
713
+ def is_ignored_host(self, url):
714
+ self.host = tldextract.extract(url).registered_domain
715
+ if self.host in IGNORE_HOSTS:
716
+ return True
717
+ else:
718
+ return False
719
+
720
+ def send_request(self):
721
+ try:
722
+ self.request_response = requests.get(
723
+ url=self.url,
724
+ headers=REQUESTS_HEADERS,
725
+ proxies=self.enver.requests_proxies,
726
+ timeout=15,
727
+ )
728
+ except:
729
+ logger.warn(f"Failed to fetch: [{self.url}]")
730
+ self.request_response = None
731
+
732
+ def save_response(self):
733
+ if not self.html_path.exists():
734
+ self.html_path.parent.mkdir(parents=True, exist_ok=True)
735
+ logger.success(f"Saving to: [{self.html_path}]")
736
+
737
+ if self.request_response is None:
738
+ return
739
+ else:
740
+ with open(self.html_path, "wb") as wf:
741
+ wf.write(self.request_response.content)
742
+
743
+ def fetch(self, url, overwrite=False, output_parent=None):
744
+ self.url = url
745
+ logger.note(f"Fetching: [{self.url}]")
746
+ self.html_path = self.filepath_converter.convert(self.url, parent=output_parent)
747
+
748
+ if self.is_ignored_host(self.url):
749
+ logger.warn(f"Ignore host: [{self.host}]")
750
+ return self.html_path
751
+
752
+ if self.html_path.exists() and not overwrite:
753
+ logger.success(f"HTML existed: [{self.html_path}]")
754
+ else:
755
+ self.send_request()
756
+ self.save_response()
757
+ return self.html_path
758
+
759
+
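A sketch of fetching a single result URL with WebpageFetcher: ignored hosts are skipped, and the response body is cached under files/urls/<output_parent>/. The URL and parent below are examples.

from webscout.DWEBS import WebpageFetcher  # assumed import path

fetcher = WebpageFetcher()
html_path = fetcher.fetch(
    "https://docs.python.org/3/library/datetime.html",   # example URL
    overwrite=False,
    output_parent="python datetime timezone",            # subdirectory used for the cache
)
print(html_path)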
760
+ class BatchWebpageFetcher:
761
+ def __init__(self):
762
+ self.done_count = 0
763
+ self.total_count = 0
764
+ self.url_and_html_path_list = []
765
+
766
+ def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
767
+ webpage_fetcher = WebpageFetcher()
768
+ html_path = webpage_fetcher.fetch(
769
+ url=url, overwrite=overwrite, output_parent=output_parent
193
770
  )
771
+ self.url_and_html_path_list.append({"url": url, "html_path": html_path})
772
+ self.done_count += 1
773
+ logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
774
+
775
+ def fetch(self, urls, overwrite=False, output_parent=None):
776
+ self.urls = urls
777
+ self.total_count = len(self.urls)
778
+ with concurrent.futures.ThreadPoolExecutor() as executor:
779
+ futures = [
780
+ executor.submit(
781
+ self.fecth_single_webpage,
782
+ url=url,
783
+ overwrite=overwrite,
784
+ output_parent=output_parent,
785
+ )
786
+ for url in urls
787
+ ]
194
788
 
195
- self.args = self.parse_args()
789
+ for idx, future in enumerate(concurrent.futures.as_completed(futures)):
790
+ result = future.result()
791
+ return self.url_and_html_path_list
196
792
 
197
793
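Finally, a sketch of BatchWebpageFetcher, which downloads a list of result URLs concurrently and returns url/html_path pairs in completion order (results are appended as threads finish). The URLs are examples.

from webscout.DWEBS import BatchWebpageFetcher  # assumed import path

urls = [
    "https://docs.python.org/3/library/datetime.html",
    "https://docs.python.org/3/library/zoneinfo.html",
]
batch_fetcher = BatchWebpageFetcher()
url_and_html_paths = batch_fetcher.fetch(urls, overwrite=False, output_parent="python datetime timezone")
for pair in url_and_html_paths:
    print(pair["url"], "->", pair["html_path"])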