webscout-4.7-py3-none-any.whl → webscout-4.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic.

Files changed (53)
  1. webscout/Agents/functioncall.py +97 -37
  2. webscout/Bard.py +365 -0
  3. webscout/Bing_search.py +124 -0
  4. webscout/DWEBS.py +141 -777
  5. webscout/Local/_version.py +1 -1
  6. webscout/Provider/Andi.py +7 -1
  7. webscout/Provider/BasedGPT.py +11 -5
  8. webscout/Provider/Berlin4h.py +11 -5
  9. webscout/Provider/Blackboxai.py +10 -4
  10. webscout/Provider/Cloudflare.py +286 -0
  11. webscout/Provider/Cohere.py +11 -5
  12. webscout/Provider/DARKAI.py +25 -7
  13. webscout/Provider/Deepinfra.py +2 -1
  14. webscout/Provider/Deepseek.py +25 -9
  15. webscout/Provider/DiscordRocks.py +389 -0
  16. webscout/Provider/Farfalle.py +227 -0
  17. webscout/Provider/Gemini.py +1 -1
  18. webscout/Provider/Groq.py +244 -110
  19. webscout/Provider/Llama.py +13 -5
  20. webscout/Provider/Llama3.py +15 -2
  21. webscout/Provider/OLLAMA.py +8 -7
  22. webscout/Provider/{Geminiflash.py → PI.py} +96 -40
  23. webscout/Provider/Perplexity.py +422 -52
  24. webscout/Provider/Phind.py +6 -5
  25. webscout/Provider/PizzaGPT.py +7 -1
  26. webscout/Provider/Youchat.py +98 -76
  27. webscout/Provider/__init__.py +26 -31
  28. webscout/Provider/ai4chat.py +193 -0
  29. webscout/Provider/{VTLchat.py → felo_search.py} +62 -76
  30. webscout/Provider/julius.py +263 -0
  31. webscout/Provider/koala.py +11 -5
  32. webscout/Provider/liaobots.py +268 -0
  33. webscout/Provider/meta.py +2 -1
  34. webscout/Provider/{ChatGPTUK.py → turboseek.py} +79 -56
  35. webscout/Provider/{FreeGemini.py → xdash.py} +51 -18
  36. webscout/Provider/yep.py +258 -0
  37. webscout/__init__.py +1 -59
  38. webscout/version.py +1 -1
  39. webscout/webai.py +2 -64
  40. webscout/webscout_search.py +1 -1
  41. {webscout-4.7.dist-info → webscout-4.9.dist-info}/METADATA +249 -323
  42. webscout-4.9.dist-info/RECORD +83 -0
  43. webscout/GoogleS.py +0 -342
  44. webscout/Provider/Geminipro.py +0 -152
  45. webscout/Provider/Leo.py +0 -469
  46. webscout/Provider/OpenGPT.py +0 -867
  47. webscout/Provider/Xjai.py +0 -230
  48. webscout/Provider/Yepchat.py +0 -478
  49. webscout-4.7.dist-info/RECORD +0 -80
  50. {webscout-4.7.dist-info → webscout-4.9.dist-info}/LICENSE.md +0 -0
  51. {webscout-4.7.dist-info → webscout-4.9.dist-info}/WHEEL +0 -0
  52. {webscout-4.7.dist-info → webscout-4.9.dist-info}/entry_points.txt +0 -0
  53. {webscout-4.7.dist-info → webscout-4.9.dist-info}/top_level.txt +0 -0
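
The headline change is that webscout/GoogleS.py is deleted and webscout/DWEBS.py is rewritten from a Google-search-and-scraping toolkit into a single GoogleS search class (see the DWEBS.py diff below). A minimal migration sketch, assuming the class is exposed from the rewritten module (the exact 4.9 import path is not confirmed by this diff):

    # Hypothetical import path; adjust to wherever webscout 4.9 exports GoogleS.
    from webscout.DWEBS import GoogleS

    searcher = GoogleS()
    # search() signature and result keys taken from the new DWEBS.py shown below
    results = searcher.search("python web scraping", max_results=10)
    for r in results:
        print(r["title"], r["href"])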
webscout/DWEBS.py CHANGED
@@ -1,793 +1,157 @@
  from bs4 import BeautifulSoup
- from pathlib import Path
- import platform
- import re
- import concurrent.futures
  import requests
- import tldextract
- import json
- import os
- import shutil
- import subprocess
- import datetime
- import functools
- import inspect
- import logging
-
- from urllib.parse import quote, unquote
- from tiktoken import get_encoding as tiktoken_get_encoding
- from markdownify import markdownify
+ from typing import Dict, List, Optional, Union
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from urllib.parse import quote
  from termcolor import colored
21
- class QueryResultsExtractor:
22
- def __init__(self) -> None:
23
- self.query_results = []
24
- self.related_questions = []
25
-
26
- def load_html(self, html_path):
27
- try:
28
- with open(html_path, "r", encoding="utf-8") as f:
29
- html = f.read()
30
- self.soup = BeautifulSoup(html, "html.parser")
31
- except FileNotFoundError:
32
- logger.error(f"File not found: {html_path}")
33
- except Exception as e:
34
- logger.error(f"Error loading HTML: {e}")
35
-
36
- def extract_query_results(self):
37
- try:
38
- self.query = self.soup.find("textarea").text.strip()
39
- query_result_elements = self.soup.find_all("div", class_="g")
40
- for idx, result in enumerate(query_result_elements):
41
- try:
42
- site = result.find("cite").find_previous("span").text.strip()
43
- url = result.find("a")["href"]
44
- title = result.find("h3").text.strip()
45
- abstract_element_conditions = [
46
- {"data-sncf": "1"},
47
- {"class_": "ITZIwc"},
48
- ]
49
- for condition in abstract_element_conditions:
50
- abstract_element = result.find("div", condition)
51
- if abstract_element is not None:
52
- abstract = abstract_element.text.strip()
53
- break
54
- else:
55
- abstract = ""
56
- logger.mesg(
57
- f"{title}\n"
58
- f" - {site}\n"
59
- f" - {url}\n"
60
- f" - {abstract}\n"
61
- f"\n"
62
- )
63
- self.query_results.append(
64
- {
65
- "title": title,
66
- "site": site,
67
- "url": url,
68
- "abstract": abstract,
69
- "index": idx,
70
- "type": "web",
71
- }
72
- )
73
- except Exception as e:
74
- logger.error(f"Error extracting query result: {e}")
75
- logger.success(f"- {len(query_result_elements)} query results")
76
- except Exception as e:
77
- logger.error(f"Error extracting query results: {e}")
78
-
79
- def extract_related_questions(self):
80
- try:
81
- related_question_elements = self.soup.find_all(
82
- "div", class_="related-question-pair"
83
- )
84
- for question_element in related_question_elements:
85
- try:
86
- question = question_element.find("span").text.strip()
87
- print(question)
88
- self.related_questions.append(question)
89
- except Exception as e:
90
- logger.error(f"Error extracting related question: {e}")
91
- logger.success(f"- {len(self.related_questions)} related questions")
92
- except Exception as e:
93
- logger.error(f"Error extracting related questions: {e}")
94
-
95
- def extract(self, html_path):
96
- self.load_html(html_path)
97
- self.extract_query_results()
98
- self.extract_related_questions()
99
- self.search_results = {
100
- "query": self.query,
101
- "query_results": self.query_results,
102
- "related_questions": self.related_questions,
103
- }
104
- return self.search_results
105
-
106
-
107
-
108
-
109
- class WebpageContentExtractor:
110
- def __init__(self):
111
- self.tokenizer = tiktoken_get_encoding("cl100k_base")
112
-
113
- def count_tokens(self, text):
114
- tokens = self.tokenizer.encode(text)
115
- token_count = len(tokens)
116
- return token_count
117
-
118
- def html_to_markdown(self, html_str, ignore_links=True):
119
- if ignore_links:
120
- markdown_str = markdownify(html_str, strip="a")
121
- else:
122
- markdown_str = markdownify(html_str)
123
- markdown_str = re.sub(r"\n{3,}", "\n\n", markdown_str)
124
-
125
- self.markdown_token_count = self.count_tokens(markdown_str)
126
- logger.mesg(f'- Tokens: {colored(self.markdown_token_count,"light_green")}')
127
-
128
- self.markdown_str = markdown_str
129
-
130
- return self.markdown_str
131
-
132
- def remove_elements_from_html(self, html_str):
133
- soup = BeautifulSoup(html_str, "html.parser")
134
- ignore_classes_with_parentheses = [f"({word})" for word in IGNORE_CLASSES]
135
- ignore_classes_pattern = f'{"|".join(ignore_classes_with_parentheses)}'
136
- removed_element_counts = 0
137
- for element in soup.find_all():
138
- class_str = ""
139
- id_str = ""
140
- try:
141
- class_attr = element.get("class", [])
142
- if class_attr:
143
- class_str = " ".join(list(class_attr))
144
- if id_str:
145
- class_str = f"{class_str} {id_str}"
146
- except:
147
- pass
148
-
149
- try:
150
- id_str = element.get("id", "")
151
- except:
152
- pass
153
-
154
- if (
155
- (not element.text.strip())
156
- or (element.name in IGNORE_TAGS)
157
- or (re.search(ignore_classes_pattern, class_str, flags=re.IGNORECASE))
158
- or (re.search(ignore_classes_pattern, id_str, flags=re.IGNORECASE))
159
- ):
160
- element.decompose()
161
- removed_element_counts += 1
162
-
163
- logger.mesg(
164
- f"- Elements: "
165
- f'{colored(len(soup.find_all()),"light_green")} / {colored(removed_element_counts,"light_red")}'
166
- )
167
-
168
- html_str = str(soup)
169
- self.html_str = html_str
170
-
171
- return self.html_str
172
-
173
- def extract(self, html_path):
174
- logger.note(f"Extracting content from: {html_path}")
175
-
176
- if not Path(html_path).exists():
177
- logger.warn(f"File not found: {html_path}")
178
- return ""
179
-
180
- encodings = ["utf-8", "latin-1"]
181
- for encoding in encodings:
182
- try:
183
- with open(html_path, "r", encoding=encoding, errors="ignore") as rf:
184
- html_str = rf.read()
185
- break
186
- except UnicodeDecodeError:
187
- pass
188
- else:
189
- logger.warn(f"No matching encodings: {html_path}")
190
- return ""
191
-
192
- html_str = self.remove_elements_from_html(html_str)
193
- markdown_str = self.html_to_markdown(html_str)
194
- return markdown_str
195
-
196
-
197
- class BatchWebpageContentExtractor:
198
- def __init__(self) -> None:
199
- self.html_path_and_extracted_content_list = []
200
- self.done_count = 0
201
-
202
- def extract_single_html(self, html_path):
203
- webpage_content_extractor = WebpageContentExtractor()
204
- extracted_content = webpage_content_extractor.extract(html_path)
205
- self.html_path_and_extracted_content_list.append(
206
- {"html_path": html_path, "extracted_content": extracted_content}
207
- )
208
- self.done_count += 1
209
- logger.success(
210
- f"> [{self.done_count}/{self.total_count}] Extracted: {html_path}"
211
- )
212
-
213
- def extract(self, html_paths):
214
- self.html_path = html_paths
215
- self.total_count = len(self.html_path)
216
- with concurrent.futures.ThreadPoolExecutor() as executor:
217
- futures = [
218
- executor.submit(self.extract_single_html, html_path)
219
- for html_path in self.html_path
220
- ]
221
- for idx, future in enumerate(concurrent.futures.as_completed(futures)):
222
- result = future.result()
223
-
224
- return self.html_path_and_extracted_content_list
225
-
226
-
227
-
228
-
229
-
230
- # What characters are forbidden in Windows and Linux directory names?
231
- # https://stackoverflow.com/questions/1976007/what-characters-are-forbidden-in-windows-and-linux-directory-names
232
-
233
- INVALID_FILE_PATH_CHARS = [
234
- "\\",
235
- "/",
236
- ":",
237
- "*",
238
- "?",
239
- '"',
240
- "<",
241
- ">",
242
- "|",
243
- "\n",
244
- "\t",
245
- "\r",
246
- *[chr(i) for i in range(32)],
247
- ]
248
-
249
- WINDOWS_INVALID_FILE_PATH_NAMES = [
250
- "con",
251
- "prn",
252
- "aux",
253
- "nul",
254
- *[f"com{i+1}" for i in range(10)],
255
- *[f"lpt{i+1}" for i in range(10)],
256
- ]
257
-
258
-
259
- class FilepathConverter:
260
- def __init__(self, parent: str = None):
261
- self.output_root = Path(__file__).parents[1] / "files"
262
- self.parent = parent
263
-
264
- def preprocess(self, input_string):
265
- return input_string
266
-
267
- def validate(self, input_string):
268
- if not input_string:
269
- return input_string
270
- filename = input_string
271
- for char in INVALID_FILE_PATH_CHARS:
272
- filename = filename.replace(char, "_")
273
- if platform.system() == "Windows":
274
- filename_base = filename.split(".")[0]
275
- if filename_base.lower() in WINDOWS_INVALID_FILE_PATH_NAMES:
276
- filename_base = filename_base + "_"
277
- filename = ".".join([filename_base, *filename.split(".")[1:]])
278
- return filename
279
-
280
- def append_extension(self, filename, accept_exts=[".html", ".htm"], ext=".html"):
281
- if ext:
282
- filename_ext = "." + filename.split(".")[-1]
283
- if filename_ext.lower() not in accept_exts:
284
- filename += ext
285
- return filename
286
-
287
- def convert(self, input_string, parent=None):
288
- filename = self.preprocess(input_string)
289
- filename = self.validate(filename)
290
- filename = self.append_extension(filename)
291
-
292
- parent = parent or self.parent
293
- parent = self.validate(parent)
294
- if parent:
295
- filepath = self.output_root / parent / filename
296
- else:
297
- filepath = self.output_root / filename
298
-
299
- self.filename = filename
300
- self.filepath = filepath
301
-
302
- return self.filepath
303
-
304
-
305
- class UrlToFilepathConverter(FilepathConverter):
306
- def __init__(self, parent: str = None):
307
- super().__init__(parent)
308
- self.output_root = self.output_root / "urls"
309
-
310
- def preprocess(self, url):
311
- filename = unquote(url.split("//")[1])
312
- return filename
313
-
314
-
315
- class QueryToFilepathConverter(FilepathConverter):
316
- def __init__(self, parent: str = None):
317
- super().__init__(parent)
318
- self.output_root = self.output_root / "queries"
319
-
320
-
321
- def add_fillers(text, filler="=", fill_side="both"):
322
- terminal_width = shutil.get_terminal_size().columns
323
- text = text.strip()
324
- text_width = len(text)
325
- if text_width >= terminal_width:
326
- return text
327
-
328
- if fill_side[0].lower() == "b":
329
- leading_fill_str = filler * ((terminal_width - text_width) // 2 - 1) + " "
330
- trailing_fill_str = " " + filler * (
331
- terminal_width - text_width - len(leading_fill_str) - 1
332
- )
333
- elif fill_side[0].lower() == "l":
334
- leading_fill_str = filler * (terminal_width - text_width - 1) + " "
335
- trailing_fill_str = ""
336
- elif fill_side[0].lower() == "r":
337
- leading_fill_str = ""
338
- trailing_fill_str = " " + filler * (terminal_width - text_width - 1)
339
- else:
340
- raise ValueError("Invalid fill_side")
341
-
342
- filled_str = f"{leading_fill_str}{text}{trailing_fill_str}"
343
- return filled_str
344
-
345
-
346
- class OSLogger(logging.Logger):
347
- LOG_METHODS = {
348
- "err": ("error", "red"),
349
- "warn": ("warning", "light_red"),
350
- "note": ("info", "light_magenta"),
351
- "mesg": ("info", "light_cyan"),
352
- "file": ("info", "light_blue"),
353
- "line": ("info", "white"),
354
- "success": ("info", "light_green"),
355
- "fail": ("info", "light_red"),
356
- "back": ("debug", "light_cyan"),
357
- }
358
- INDENT_METHODS = [
359
- "indent",
360
- "set_indent",
361
- "reset_indent",
362
- "store_indent",
363
- "restore_indent",
364
- "log_indent",
365
- ]
366
- LEVEL_METHODS = [
367
- "set_level",
368
- "store_level",
369
- "restore_level",
370
- "quiet",
371
- "enter_quiet",
372
- "exit_quiet",
373
- ]
374
- LEVEL_NAMES = {
375
- "critical": logging.CRITICAL,
376
- "error": logging.ERROR,
377
- "warning": logging.WARNING,
378
- "info": logging.INFO,
379
- "debug": logging.DEBUG,
380
- }
381
-
382
- def __init__(self, name=None, prefix=False):
383
- if not name:
384
- frame = inspect.stack()[1]
385
- module = inspect.getmodule(frame[0])
386
- name = module.__name__
387
-
388
- super().__init__(name)
389
- self.setLevel(logging.INFO)
390
-
391
- if prefix:
392
- formatter_prefix = "[%(asctime)s] - [%(name)s] - [%(levelname)s]\n"
393
- else:
394
- formatter_prefix = ""
395
-
396
- self.formatter = logging.Formatter(formatter_prefix + "%(message)s")
397
-
398
- stream_handler = logging.StreamHandler()
399
- stream_handler.setLevel(logging.INFO)
400
- stream_handler.setFormatter(self.formatter)
401
- self.addHandler(stream_handler)
402
-
403
- self.log_indent = 0
404
- self.log_indents = []
405
-
406
- self.log_level = "info"
407
- self.log_levels = []
408
-
409
- def indent(self, indent=2):
410
- self.log_indent += indent
7
+ import time
8
+ import random
411
9
 
412
- def set_indent(self, indent=2):
413
- self.log_indent = indent
10
+ class GoogleS:
11
+ """Google search class to get search results from google.com."""
414
12
 
415
- def reset_indent(self):
416
- self.log_indent = 0
13
+ _executor: ThreadPoolExecutor = ThreadPoolExecutor(max_workers=10)
417
14
 
418
- def store_indent(self):
419
- self.log_indents.append(self.log_indent)
420
-
421
- def restore_indent(self):
422
- self.log_indent = self.log_indents.pop(-1)
423
-
424
- def set_level(self, level):
425
- self.log_level = level
426
- self.setLevel(self.LEVEL_NAMES[level])
427
-
428
- def store_level(self):
429
- self.log_levels.append(self.log_level)
430
-
431
- def restore_level(self):
432
- self.log_level = self.log_levels.pop(-1)
433
- self.set_level(self.log_level)
434
-
435
- def quiet(self):
436
- self.set_level("critical")
437
-
438
- def enter_quiet(self, quiet=False):
439
- if quiet:
440
- self.store_level()
441
- self.quiet()
442
-
443
- def exit_quiet(self, quiet=False):
444
- if quiet:
445
- self.restore_level()
446
-
447
- def log(
15
+ def __init__(
448
16
  self,
449
- level,
450
- color,
451
- msg,
452
- indent=0,
453
- fill=False,
454
- fill_side="both",
455
- end="\n",
456
- *args,
457
- **kwargs,
458
- ):
459
- if type(msg) == str:
460
- msg_str = msg
461
- else:
462
- msg_str = repr(msg)
463
- quotes = ["'", '"']
464
- if msg_str[0] in quotes and msg_str[-1] in quotes:
465
- msg_str = msg_str[1:-1]
466
-
467
- indent_str = " " * (self.log_indent + indent)
468
- indented_msg = "\n".join([indent_str + line for line in msg_str.split("\n")])
469
-
470
- if fill:
471
- indented_msg = add_fillers(indented_msg, fill_side=fill_side)
472
-
473
- handler = self.handlers[0]
474
- handler.terminator = end
475
-
476
- getattr(self, level)(colored(indented_msg, color), *args, **kwargs)
477
-
478
- def route_log(self, method, msg, *args, **kwargs):
479
- level, method = method
480
- functools.partial(self.log, level, method, msg)(*args, **kwargs)
481
-
482
- def err(self, msg: str = "", *args, **kwargs):
483
- self.route_log(("error", "red"), msg, *args, **kwargs)
484
-
485
- def warn(self, msg: str = "", *args, **kwargs):
486
- self.route_log(("warning", "light_red"), msg, *args, **kwargs)
487
-
488
- def note(self, msg: str = "", *args, **kwargs):
489
- self.route_log(("info", "light_magenta"), msg, *args, **kwargs)
490
-
491
- def mesg(self, msg: str = "", *args, **kwargs):
492
- self.route_log(("info", "light_cyan"), msg, *args, **kwargs)
493
-
494
- def file(self, msg: str = "", *args, **kwargs):
495
- self.route_log(("info", "light_blue"), msg, *args, **kwargs)
496
-
497
- def line(self, msg: str = "", *args, **kwargs):
498
- self.route_log(("info", "white"), msg, *args, **kwargs)
499
-
500
- def success(self, msg: str = "", *args, **kwargs):
501
- self.route_log(("info", "light_green"), msg, *args, **kwargs)
502
-
503
- def fail(self, msg: str = "", *args, **kwargs):
504
- self.route_log(("info", "light_red"), msg, *args, **kwargs)
505
-
506
- def back(self, msg: str = "", *args, **kwargs):
507
- self.route_log(("debug", "light_cyan"), msg, *args, **kwargs)
508
-
509
-
510
- logger = OSLogger()
511
-
512
-
513
- def shell_cmd(cmd, getoutput=False, showcmd=True, env=None):
514
- if showcmd:
515
- logger.info(colored(f"\n$ [{os.getcwd()}]", "light_blue"))
516
- logger.info(colored(f" $ {cmd}\n", "light_cyan"))
517
- if getoutput:
518
- output = subprocess.getoutput(cmd, env=env)
519
- return output
520
- else:
521
- subprocess.run(cmd, shell=True, env=env)
522
-
523
-
524
- class Runtimer:
525
- def __enter__(self):
526
- self.t1, _ = self.start_time()
527
- return self
528
-
529
- def __exit__(self, exc_type, exc_value, traceback):
530
- self.t2, _ = self.end_time()
531
- self.elapsed_time(self.t2 - self.t1)
532
-
533
- def start_time(self):
534
- t1 = datetime.datetime.now()
535
- self.logger_time("start", t1)
536
- return t1, self.time2str(t1)
537
-
538
- def end_time(self):
539
- t2 = datetime.datetime.now()
540
- self.logger_time("end", t2)
541
- return t2, self.time2str(t2)
542
-
543
- def elapsed_time(self, dt=None):
544
- if dt is None:
545
- dt = self.t2 - self.t1
546
- self.logger_time("elapsed", dt)
547
- return dt, self.time2str(dt)
548
-
549
- def logger_time(self, time_type, t):
550
- time_types = {
551
- "start": "Start",
552
- "end": "End",
553
- "elapsed": "Elapsed",
554
- }
555
- time_str = add_fillers(
556
- colored(
557
- f"{time_types[time_type]} time: [ {self.time2str(t)} ]",
558
- "light_magenta",
559
- ),
560
- fill_side="both",
561
- )
562
- logger.line(time_str)
563
-
564
- # Convert time to string
565
- def time2str(self, t):
566
- datetime_str_format = "%Y-%m-%d %H:%M:%S"
567
- if isinstance(t, datetime.datetime):
568
- return t.strftime(datetime_str_format)
569
- elif isinstance(t, datetime.timedelta):
570
- hours = t.seconds // 3600
571
- hour_str = f"{hours} hr" if hours > 0 else ""
572
- minutes = (t.seconds // 60) % 60
573
- minute_str = f"{minutes:>2} min" if minutes > 0 else ""
574
- seconds = t.seconds % 60
575
- second_str = f"{seconds:>2} s"
576
- time_str = " ".join([hour_str, minute_str, second_str]).strip()
577
- return time_str
578
- else:
579
- return str(t)
580
-
581
-
582
- class OSEnver:
583
- def __init__(self):
584
- self.envs_stack = []
585
- self.envs = os.environ.copy()
586
-
587
- def store_envs(self):
588
- self.envs_stack.append(self.envs)
589
-
590
- def restore_envs(self):
591
- self.envs = self.envs_stack.pop()
592
-
593
- def set_envs(self, secrets=True, proxies=None, store_envs=True):
594
- # caller_info = inspect.stack()[1]
595
- # logger.back(f"OS Envs is set by: {caller_info.filename}")
596
-
597
- if store_envs:
598
- self.store_envs()
599
-
600
- if secrets:
601
- secrets_path = Path(__file__).parents[1] / "secrets.json"
602
- if secrets_path.exists():
603
- with open(secrets_path, "r") as rf:
604
- secrets = json.load(rf)
605
- else:
606
- secrets = {}
607
-
608
- if proxies:
609
- for proxy_env in ["http_proxy", "https_proxy"]:
610
- if isinstance(proxies, str):
611
- self.envs[proxy_env] = proxies
612
- elif "http_proxy" in secrets.keys():
613
- self.envs[proxy_env] = secrets["http_proxy"]
614
- elif os.getenv("http_proxy"):
615
- self.envs[proxy_env] = os.getenv("http_proxy")
616
- else:
617
- continue
618
-
619
- self.proxy = (
620
- self.envs.get("all_proxy")
621
- or self.envs.get("http_proxy")
622
- or self.envs.get("https_proxy")
623
- or None
624
- )
625
- self.requests_proxies = {
626
- "http": self.proxy,
627
- "https": self.proxy,
17
+ headers: Optional[Dict[str, str]] = None,
18
+ proxy: Optional[str] = None,
19
+ timeout: Optional[int] = 10,
20
+ ) -> None:
21
+ """Initialize the GoogleS object."""
22
+ self.proxy: Optional[str] = proxy
23
+ self.headers = headers if headers else {
24
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"
628
25
  }
26
+ self.headers["Referer"] = "https://www.google.com/"
27
+ self.client = requests.Session()
28
+ self.client.headers.update(self.headers)
29
+ self.client.proxies.update({"http": self.proxy, "https": self.proxy})
30
+ self.timeout = timeout
629
31
 
630
- if self.proxy:
631
- logger.note(f"Using proxy: [{self.proxy}]")
632
-
633
-
634
- enver = OSEnver()
635
-
636
- class GoogleSearcher:
637
- def __init__(self):
638
- self.url = "https://www.google.com/search"
639
- self.enver = enver
640
- self.enver.set_envs(proxies=True)
641
- self.filepath_converter = QueryToFilepathConverter()
642
-
643
- def send_request(self, result_num=10, safe=False):
644
- self.request_response = requests.get(
645
- url=self.url,
646
- headers=REQUESTS_HEADERS,
647
- params={
648
- "q": self.query,
649
- "num": result_num,
650
- },
651
- proxies=self.enver.requests_proxies,
652
- )
653
-
654
- def save_response(self):
655
- if not self.html_path.exists():
656
- self.html_path.parent.mkdir(parents=True, exist_ok=True)
657
- logger.note(f"Saving to: [{self.html_path}]")
658
- with open(self.html_path, "wb") as wf:
659
- wf.write(self.request_response.content)
660
-
661
- def search(self, query, result_num=10, safe=False, overwrite=False):
662
- self.query = query
663
- self.html_path = self.filepath_converter.convert(self.query)
664
- logger.note(f"Searching: [{self.query}]")
665
- if self.html_path.exists() and not overwrite:
666
- logger.success(f"HTML existed: {self.html_path}")
667
- else:
668
- self.send_request(result_num=result_num, safe=safe)
669
- self.save_response()
670
- return self.html_path
671
-
672
-
673
- IGNORE_TAGS = ["script", "style", "button"]
674
- IGNORE_CLASSES = [
675
- # common
676
- "sidebar",
677
- "footer",
678
- "related",
679
- "comment",
680
- "topbar",
681
- "offcanvas",
682
- "navbar",
683
- # 163.com
684
- "post_(top)|(side)|(recommends)|(crumb)|(statement)|(next)|(jubao)",
685
- "ntes\-.*nav",
686
- "nav\-bottom",
687
- # wikipedia.org
688
- "language\-list",
689
- "vector\-(header)|(column)|(sticky\-pinned)|(dropdown\-content)",
690
- "navbox",
691
- "catlinks",
692
- ]
693
-
694
- IGNORE_HOSTS = [
695
- "weibo.com",
696
- "hymson.com",
697
- "yahoo.com",
698
- ]
699
-
700
- REQUESTS_HEADERS = {
701
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
702
- }
703
-
704
-
705
-
706
-
707
- class WebpageFetcher:
708
- def __init__(self):
709
- self.enver = enver
710
- self.enver.set_envs(proxies=True)
711
- self.filepath_converter = UrlToFilepathConverter()
32
+ def __enter__(self) -> "GoogleS":
33
+ return self
712
34
 
713
- def is_ignored_host(self, url):
714
- self.host = tldextract.extract(url).registered_domain
715
- if self.host in IGNORE_HOSTS:
716
- return True
717
- else:
718
- return False
35
+ def __exit__(self, exc_type, exc_val, exc_tb):
36
+ self.client.close()
719
37
 
720
- def send_request(self):
38
+ def _get_url(
39
+ self,
40
+ method: str,
41
+ url: str,
42
+ params: Optional[Dict[str, str]] = None,
43
+ data: Optional[Union[Dict[str, str], bytes]] = None,
44
+ ) -> bytes:
721
45
  try:
722
- self.request_response = requests.get(
723
- url=self.url,
724
- headers=REQUESTS_HEADERS,
725
- proxies=self.enver.requests_proxies,
726
- timeout=15,
727
- )
728
- except:
729
- logger.warn(f"Failed to fetch: [{self.url}]")
730
- self.request_response = None
731
-
732
- def save_response(self):
733
- if not self.html_path.exists():
734
- self.html_path.parent.mkdir(parents=True, exist_ok=True)
735
- logger.success(f"Saving to: [{self.html_path}]")
736
-
737
- if self.request_response is None:
738
- return
739
- else:
740
- with open(self.html_path, "wb") as wf:
741
- wf.write(self.request_response.content)
742
-
743
- def fetch(self, url, overwrite=False, output_parent=None):
744
- self.url = url
745
- logger.note(f"Fetching: [{self.url}]")
746
- self.html_path = self.filepath_converter.convert(self.url, parent=output_parent)
747
-
748
- if self.is_ignored_host(self.url):
749
- logger.warn(f"Ignore host: [{self.host}]")
750
- return self.html_path
751
-
752
- if self.html_path.exists() and not overwrite:
753
- logger.success(f"HTML existed: [{self.html_path}]")
754
- else:
755
- self.send_request()
756
- self.save_response()
757
- return self.html_path
758
-
759
-
760
- class BatchWebpageFetcher:
761
- def __init__(self):
762
- self.done_count = 0
763
- self.total_count = 0
764
- self.url_and_html_path_list = []
765
-
766
- def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
767
- webpage_fetcher = WebpageFetcher()
768
- html_path = webpage_fetcher.fetch(
769
- url=url, overwrite=overwrite, output_parent=output_parent
770
- )
771
- self.url_and_html_path_list.append({"url": url, "html_path": html_path})
772
- self.done_count += 1
773
- logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
46
+ resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
+ except Exception as ex:
+ raise Exception(f"{url} {type(ex).__name__}: {ex}") from ex
+ if resp.status_code == 200:
+ return resp.content
+ raise Exception(f"{resp.url} returned status code {resp.status_code}. {params=} {data=}")
+
+ def extract_text_from_webpage(self, html_content, max_characters=None):
+ """Extracts visible text from HTML content using BeautifulSoup."""
+ soup = BeautifulSoup(html_content, "html.parser")
+ # Remove unwanted tags
+ for tag in soup(["script", "style", "header", "footer", "nav"]):
+ tag.extract()
+ # Get the remaining visible text
+ visible_text = soup.get_text(strip=True)
+ if max_characters:
+ visible_text = visible_text[:max_characters]
+ return visible_text
+
+ def search(
+ self,
+ keywords: str,
+ region: str = "us-en",
+ lang: str = "en",
+ safe: str = "off",
+ timelimit: Optional[str] = None,
+ max_results: Optional[int] = None,
+ extract_webpage_text: bool = False,
+ max_extract_characters: Optional[int] = 100,
+ ) -> List[Dict[str, str]]:
+ """Google text search."""
+ assert keywords, "keywords is mandatory"
+
+ results = []
+ futures = []
+ start = 0
+ while len(results) < max_results:
+ params = {
+ "q": keywords,
+ "num": 10, # Number of results per page
+ "hl": lang,
+ "start": start,
+ "safe": safe,
+ "gl": region,
+ }
+ if timelimit:
+ params["tbs"] = f"qdr:{timelimit}"
+
+ futures.append(self._executor.submit(self._get_url, "GET", "https://www.google.com/search", params=params))
+ start += 10
+
+ for future in as_completed(futures):
+ try:
+ resp_content = future.result()
+ soup = BeautifulSoup(resp_content, "html.parser")
+ result_block = soup.find_all("div", class_="g")
+
+ if not result_block:
+ break
+
+ for result in result_block:
+ try:
+ link = result.find("a", href=True)
+ title = result.find("h3")
+ description_box = result.find(
+ "div", {"style": "-webkit-line-clamp:2"}
+ )
+
+ if link and title and description_box:
+ url = link["href"]
+ title = title.text
+ description = description_box.text
+
+ visible_text = ""
+ if extract_webpage_text:
+ try:
+ page_content = self._get_url("GET", url)
+ visible_text = self.extract_text_from_webpage(
+ page_content, max_characters=max_extract_characters
+ )
+ except Exception as e:
+ print(f"Error extracting text from {url}: {e}")
+
+ results.append(
+ {
+ "title": title,
+ "href": url,
+ "abstract": description,
+ "index": len(results),
+ "type": "web",
+ "visible_text": visible_text,
+ }
+ )
+
+ if len(results) >= max_results:
+ return results
+
+ except Exception as e:
+ print(f"Error extracting result: {e}")

- def fetch(self, urls, overwrite=False, output_parent=None):
- self.urls = urls
- self.total_count = len(self.urls)
- with concurrent.futures.ThreadPoolExecutor() as executor:
- futures = [
- executor.submit(
- self.fecth_single_webpage,
- url=url,
- overwrite=overwrite,
- output_parent=output_parent,
- )
- for url in urls
- ]
+ except Exception as e:
+ print(f"Error fetching URL: {e}")

- for idx, future in enumerate(concurrent.futures.as_completed(futures)):
- result = future.result()
- return self.url_and_html_path_list
+ return results


+ if __name__ == "__main__":
+ from rich import print
+ searcher = GoogleS()
+ results = searcher.search("HelpingAI-9B", max_results=20, extract_webpage_text=True, max_extract_characters=200)
+ for result in results:
+ print(result)
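
A usage note grounded in the new code above: GoogleS defines __enter__/__exit__, so it can be used as a context manager, and max_results should be passed explicitly even though it is annotated Optional[int] with a default of None, because search() evaluates len(results) < max_results before any None check. A minimal sketch under the same assumed import path as above:

    from webscout.DWEBS import GoogleS  # assumed import path for 4.9

    with GoogleS(timeout=10) as searcher:
        for hit in searcher.search("HelpingAI-9B", max_results=10):
            # result keys shown in the new search() implementation
            print(hit["index"], hit["title"], hit["href"])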