webscout 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic. Click here for more details.

@@ -0,0 +1,269 @@
1
+ import datetime
2
+ import functools
3
+ import inspect
4
+ import logging
5
+ import os
6
+ import shutil
7
+ import subprocess
8
+ from termcolor import colored
9
+
10
+
11
+ def add_fillers(text, filler="=", fill_side="both"):
12
+ terminal_width = shutil.get_terminal_size().columns
13
+ text = text.strip()
14
+ text_width = len(text)
15
+ if text_width >= terminal_width:
16
+ return text
17
+
18
+ if fill_side[0].lower() == "b":
19
+ leading_fill_str = filler * ((terminal_width - text_width) // 2 - 1) + " "
20
+ trailing_fill_str = " " + filler * (
21
+ terminal_width - text_width - len(leading_fill_str) - 1
22
+ )
23
+ elif fill_side[0].lower() == "l":
24
+ leading_fill_str = filler * (terminal_width - text_width - 1) + " "
25
+ trailing_fill_str = ""
26
+ elif fill_side[0].lower() == "r":
27
+ leading_fill_str = ""
28
+ trailing_fill_str = " " + filler * (terminal_width - text_width - 1)
29
+ else:
30
+ raise ValueError("Invalid fill_side")
31
+
32
+ filled_str = f"{leading_fill_str}{text}{trailing_fill_str}"
33
+ return filled_str
34
+
35
+
36
+ class OSLogger(logging.Logger):
37
+ LOG_METHODS = {
38
+ "err": ("error", "red"),
39
+ "warn": ("warning", "light_red"),
40
+ "note": ("info", "light_magenta"),
41
+ "mesg": ("info", "light_cyan"),
42
+ "file": ("info", "light_blue"),
43
+ "line": ("info", "white"),
44
+ "success": ("info", "light_green"),
45
+ "fail": ("info", "light_red"),
46
+ "back": ("debug", "light_cyan"),
47
+ }
48
+ INDENT_METHODS = [
49
+ "indent",
50
+ "set_indent",
51
+ "reset_indent",
52
+ "store_indent",
53
+ "restore_indent",
54
+ "log_indent",
55
+ ]
56
+ LEVEL_METHODS = [
57
+ "set_level",
58
+ "store_level",
59
+ "restore_level",
60
+ "quiet",
61
+ "enter_quiet",
62
+ "exit_quiet",
63
+ ]
64
+ LEVEL_NAMES = {
65
+ "critical": logging.CRITICAL,
66
+ "error": logging.ERROR,
67
+ "warning": logging.WARNING,
68
+ "info": logging.INFO,
69
+ "debug": logging.DEBUG,
70
+ }
71
+
72
+ def __init__(self, name=None, prefix=False):
73
+ if not name:
74
+ frame = inspect.stack()[1]
75
+ module = inspect.getmodule(frame[0])
76
+ name = module.__name__
77
+
78
+ super().__init__(name)
79
+ self.setLevel(logging.INFO)
80
+
81
+ if prefix:
82
+ formatter_prefix = "[%(asctime)s] - [%(name)s] - [%(levelname)s]\n"
83
+ else:
84
+ formatter_prefix = ""
85
+
86
+ self.formatter = logging.Formatter(formatter_prefix + "%(message)s")
87
+
88
+ stream_handler = logging.StreamHandler()
89
+ stream_handler.setLevel(logging.INFO)
90
+ stream_handler.setFormatter(self.formatter)
91
+ self.addHandler(stream_handler)
92
+
93
+ self.log_indent = 0
94
+ self.log_indents = []
95
+
96
+ self.log_level = "info"
97
+ self.log_levels = []
98
+
99
+ def indent(self, indent=2):
100
+ self.log_indent += indent
101
+
102
+ def set_indent(self, indent=2):
103
+ self.log_indent = indent
104
+
105
+ def reset_indent(self):
106
+ self.log_indent = 0
107
+
108
+ def store_indent(self):
109
+ self.log_indents.append(self.log_indent)
110
+
111
+ def restore_indent(self):
112
+ self.log_indent = self.log_indents.pop(-1)
113
+
114
+ def set_level(self, level):
115
+ self.log_level = level
116
+ self.setLevel(self.LEVEL_NAMES[level])
117
+
118
+ def store_level(self):
119
+ self.log_levels.append(self.log_level)
120
+
121
+ def restore_level(self):
122
+ self.log_level = self.log_levels.pop(-1)
123
+ self.set_level(self.log_level)
124
+
125
+ def quiet(self):
126
+ self.set_level("critical")
127
+
128
+ def enter_quiet(self, quiet=False):
129
+ if quiet:
130
+ self.store_level()
131
+ self.quiet()
132
+
133
+ def exit_quiet(self, quiet=False):
134
+ if quiet:
135
+ self.restore_level()
136
+
137
+ def log(
138
+ self,
139
+ level,
140
+ color,
141
+ msg,
142
+ indent=0,
143
+ fill=False,
144
+ fill_side="both",
145
+ end="\n",
146
+ *args,
147
+ **kwargs,
148
+ ):
149
+ if type(msg) == str:
150
+ msg_str = msg
151
+ else:
152
+ msg_str = repr(msg)
153
+ quotes = ["'", '"']
154
+ if msg_str[0] in quotes and msg_str[-1] in quotes:
155
+ msg_str = msg_str[1:-1]
156
+
157
+ indent_str = " " * (self.log_indent + indent)
158
+ indented_msg = "\n".join([indent_str + line for line in msg_str.split("\n")])
159
+
160
+ if fill:
161
+ indented_msg = add_fillers(indented_msg, fill_side=fill_side)
162
+
163
+ handler = self.handlers[0]
164
+ handler.terminator = end
165
+
166
+ getattr(self, level)(colored(indented_msg, color), *args, **kwargs)
167
+
168
+ def route_log(self, method, msg, *args, **kwargs):
169
+ level, method = method
170
+ functools.partial(self.log, level, method, msg)(*args, **kwargs)
171
+
172
+ def err(self, msg: str = "", *args, **kwargs):
173
+ self.route_log(("error", "red"), msg, *args, **kwargs)
174
+
175
+ def warn(self, msg: str = "", *args, **kwargs):
176
+ self.route_log(("warning", "light_red"), msg, *args, **kwargs)
177
+
178
+ def note(self, msg: str = "", *args, **kwargs):
179
+ self.route_log(("info", "light_magenta"), msg, *args, **kwargs)
180
+
181
+ def mesg(self, msg: str = "", *args, **kwargs):
182
+ self.route_log(("info", "light_cyan"), msg, *args, **kwargs)
183
+
184
+ def file(self, msg: str = "", *args, **kwargs):
185
+ self.route_log(("info", "light_blue"), msg, *args, **kwargs)
186
+
187
+ def line(self, msg: str = "", *args, **kwargs):
188
+ self.route_log(("info", "white"), msg, *args, **kwargs)
189
+
190
+ def success(self, msg: str = "", *args, **kwargs):
191
+ self.route_log(("info", "light_green"), msg, *args, **kwargs)
192
+
193
+ def fail(self, msg: str = "", *args, **kwargs):
194
+ self.route_log(("info", "light_red"), msg, *args, **kwargs)
195
+
196
+ def back(self, msg: str = "", *args, **kwargs):
197
+ self.route_log(("debug", "light_cyan"), msg, *args, **kwargs)
198
+
199
+
200
+ logger = OSLogger()
201
+
202
+
203
+ def shell_cmd(cmd, getoutput=False, showcmd=True, env=None):
204
+ if showcmd:
205
+ logger.info(colored(f"\n$ [{os.getcwd()}]", "light_blue"))
206
+ logger.info(colored(f" $ {cmd}\n", "light_cyan"))
207
+ if getoutput:
208
+ output = subprocess.getoutput(cmd, env=env)
209
+ return output
210
+ else:
211
+ subprocess.run(cmd, shell=True, env=env)
212
+
213
+
214
+ class Runtimer:
215
+ def __enter__(self):
216
+ self.t1, _ = self.start_time()
217
+ return self
218
+
219
+ def __exit__(self, exc_type, exc_value, traceback):
220
+ self.t2, _ = self.end_time()
221
+ self.elapsed_time(self.t2 - self.t1)
222
+
223
+ def start_time(self):
224
+ t1 = datetime.datetime.now()
225
+ self.logger_time("start", t1)
226
+ return t1, self.time2str(t1)
227
+
228
+ def end_time(self):
229
+ t2 = datetime.datetime.now()
230
+ self.logger_time("end", t2)
231
+ return t2, self.time2str(t2)
232
+
233
+ def elapsed_time(self, dt=None):
234
+ if dt is None:
235
+ dt = self.t2 - self.t1
236
+ self.logger_time("elapsed", dt)
237
+ return dt, self.time2str(dt)
238
+
239
+ def logger_time(self, time_type, t):
240
+ time_types = {
241
+ "start": "Start",
242
+ "end": "End",
243
+ "elapsed": "Elapsed",
244
+ }
245
+ time_str = add_fillers(
246
+ colored(
247
+ f"{time_types[time_type]} time: [ {self.time2str(t)} ]",
248
+ "light_magenta",
249
+ ),
250
+ fill_side="both",
251
+ )
252
+ logger.line(time_str)
253
+
254
+ # Convert time to string
255
+ def time2str(self, t):
256
+ datetime_str_format = "%Y-%m-%d %H:%M:%S"
257
+ if isinstance(t, datetime.datetime):
258
+ return t.strftime(datetime_str_format)
259
+ elif isinstance(t, datetime.timedelta):
260
+ hours = t.seconds // 3600
261
+ hour_str = f"{hours} hr" if hours > 0 else ""
262
+ minutes = (t.seconds // 60) % 60
263
+ minute_str = f"{minutes:>2} min" if minutes > 0 else ""
264
+ seconds = t.seconds % 60
265
+ second_str = f"{seconds:>2} s"
266
+ time_str = " ".join([hour_str, minute_str, second_str]).strip()
267
+ return time_str
268
+ else:
269
+ return str(t)
webscout/DWEBS.py ADDED
@@ -0,0 +1,179 @@
1
+
2
+ from pydantic import BaseModel, Field
3
+ from typing import Union
4
+
5
+ from DeepWEBS.utilsdw.logger import logger
6
+ from DeepWEBS.networks.google_searcher import GoogleSearcher
7
+ from DeepWEBS.networks.webpage_fetcher import BatchWebpageFetcher
8
+ from DeepWEBS.documents.query_results_extractor import QueryResultsExtractor
9
+ from DeepWEBS.documents.webpage_content_extractor import BatchWebpageContentExtractor
10
+ from DeepWEBS.utilsdw.logger import logger
11
+ import argparse
12
+
13
+ class DeepWEBS:
14
+ def __init__(self):
15
+ pass
16
+
17
+ class DeepSearch(BaseModel):
18
+ queries: list = Field(
19
+ default=[""],
20
+ description="(list[str]) Queries to search",
21
+ )
22
+ result_num: int = Field(
23
+ default=10,
24
+ description="(int) Number of search results",
25
+ )
26
+ safe: bool = Field(
27
+ default=False,
28
+ description="(bool) Enable SafeSearch",
29
+ )
30
+ types: list = Field(
31
+ default=["web"],
32
+ description="(list[str]) Types of search results: `web`, `image`, `videos`, `news`",
33
+ )
34
+ extract_webpage: bool = Field(
35
+ default=False,
36
 + description="(bool) Enable extracting main text contents from webpage, will add `text` field in each `query_result` dict",
37
+ )
38
+ overwrite_query_html: bool = Field(
39
+ default=False,
40
+ description="(bool) Overwrite HTML file of query results",
41
+ )
42
+ overwrite_webpage_html: bool = Field(
43
+ default=False,
44
+ description="(bool) Overwrite HTML files of webpages from query results",
45
+ )
46
+
47
+ def queries_to_search_results(self, item: DeepSearch):
48
+ google_searcher = GoogleSearcher()
49
+ queries_search_results = []
50
+ for query in item.queries:
51
+ query_results_extractor = QueryResultsExtractor()
52
+ if not query.strip():
53
+ continue
54
+ query_html_path = google_searcher.search(
55
+ query=query,
56
+ result_num=item.result_num,
57
+ safe=item.safe,
58
+ overwrite=item.overwrite_query_html,
59
+ )
60
+ query_search_results = query_results_extractor.extract(query_html_path)
61
+ queries_search_results.append(query_search_results)
62
+ logger.note(queries_search_results)
63
+
64
+ if item.extract_webpage:
65
+ queries_search_results = self.extract_webpages(
66
+ queries_search_results,
67
+ overwrite_webpage_html=item.overwrite_webpage_html,
68
+ )
69
+ return queries_search_results
70
+
71
+ def extract_webpages(self, queries_search_results, overwrite_webpage_html=False):
72
+ for query_idx, query_search_results in enumerate(queries_search_results):
73
+ # Fetch webpages with urls
74
+ batch_webpage_fetcher = BatchWebpageFetcher()
75
+ urls = [
76
+ query_result["url"]
77
+ for query_result in query_search_results["query_results"]
78
+ ]
79
+ url_and_html_path_list = batch_webpage_fetcher.fetch(
80
+ urls,
81
+ overwrite=overwrite_webpage_html,
82
+ output_parent=query_search_results["query"],
83
+ )
84
+
85
+ # Extract webpage contents from htmls
86
+ html_paths = [
87
+ str(url_and_html_path["html_path"])
88
+ for url_and_html_path in url_and_html_path_list
89
+ ]
90
+ batch_webpage_content_extractor = BatchWebpageContentExtractor()
91
+ html_path_and_extracted_content_list = (
92
+ batch_webpage_content_extractor.extract(html_paths)
93
+ )
94
+
95
+ # Build the map of url to extracted_content
96
+ html_path_to_url_dict = {
97
+ str(url_and_html_path["html_path"]): url_and_html_path["url"]
98
+ for url_and_html_path in url_and_html_path_list
99
+ }
100
+ url_to_extracted_content_dict = {
101
+ html_path_to_url_dict[
102
+ html_path_and_extracted_content["html_path"]
103
+ ]: html_path_and_extracted_content["extracted_content"]
104
+ for html_path_and_extracted_content in html_path_and_extracted_content_list
105
+ }
106
+
107
+ # Write extracted contents (as 'text' field) to query_search_results
108
+ for query_result_idx, query_result in enumerate(
109
+ query_search_results["query_results"]
110
+ ):
111
+ url = query_result["url"]
112
+ extracted_content = url_to_extracted_content_dict[url]
113
+ queries_search_results[query_idx]["query_results"][query_result_idx][
114
+ "text"
115
+ ] = extracted_content
116
+
117
+ return queries_search_results
118
+
119
+
120
+ class ArgParser(argparse.ArgumentParser):
121
+ def __init__(self, *args, **kwargs):
122
+ super(ArgParser, self).__init__(*args, **kwargs)
123
+
124
+ self.add_argument(
125
+ "-q",
126
+ "--queries",
127
+ type=str,
128
+ nargs="+",
129
+ required=True,
130
+ help="Queries to search",
131
+ )
132
+ self.add_argument(
133
+ "-n",
134
+ "--result_num",
135
+ type=int,
136
+ default=10,
137
+ help="Number of search results",
138
+ )
139
+ self.add_argument(
140
+ "-s",
141
+ "--safe",
142
+ default=False,
143
+ action="store_true",
144
+ help="Enable SafeSearch",
145
+ )
146
+ self.add_argument(
147
+ "-t",
148
+ "--types",
149
+ type=str,
150
+ nargs="+",
151
+ default=["web"],
152
+ choices=["web", "image", "videos", "news"],
153
+ help="Types of search results",
154
+ )
155
+ self.add_argument(
156
+ "-e",
157
+ "--extract_webpage",
158
+ default=False,
159
+ action="store_true",
160
+ help="Enable extracting main text contents from webpage",
161
+ )
162
+ self.add_argument(
163
+ "-o",
164
+ "--overwrite_query_html",
165
+ default=False,
166
+ action="store_true",
167
+ help="Overwrite HTML file of query results",
168
+ )
169
+ self.add_argument(
170
+ "-w",
171
+ "--overwrite_webpage_html",
172
+ default=False,
173
+ action="store_true",
174
+ help="Overwrite HTML files of webpages from query results",
175
+ )
176
+
177
+ self.args = self.parse_args()
178
+
179
+
webscout/__init__.py CHANGED
@@ -8,7 +8,7 @@ import logging
8
8
  from .webscout_search import WEBS
9
9
  from .webscout_search_async import AsyncWEBS
10
10
  from .version import __version__
11
-
11
+ from .DWEBS import DeepWEBS
12
12
  __all__ = ["WEBS", "AsyncWEBS", "__version__", "cli"]
13
13
 
14
14
  logging.getLogger("webscout").addHandler(logging.NullHandler())
webscout/version.py CHANGED
@@ -1,2 +1,2 @@
1
- __version__ = "1.2.0"
1
+ __version__ = "1.2.2"
2
2
 
@@ -1,14 +1,15 @@
1
1
  import asyncio
2
2
  from concurrent.futures import Future
3
3
  from threading import Thread
4
+ import sys
4
5
  from types import TracebackType
5
6
  from typing import Any, Awaitable, Dict, Optional, Type, Union
6
- # Attempt to set the event loop policy to WindowsSelectorEventLoopPolicy
7
- try:
8
- asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
9
- except AttributeError:
10
- # Fall back to ProactorEventLoopPolicy if WindowsSelectorEventLoopPolicy is not available
11
- asyncio.set_event_loop_policy(asyncio.ProactorEventLoopPolicy())
7
+ if sys.platform == 'win32':
8
+ try:
9
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
10
+ except AttributeError:
11
+ # If WindowsSelectorEventLoopPolicy is not available, do nothing
12
+ pass
12
13
  from .webscout_search_async import AsyncWEBS
13
14
 
14
15