webscout 1.2.2__py3-none-any.whl → 1.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- DeepWEBS/documents/query_results_extractor.py +67 -46
- DeepWEBS/networks/google_searcher.py +28 -24
- DeepWEBS/networks/webpage_fetcher.py +45 -55
- webscout/DWEBS.py +197 -179
- webscout/__init__.py +1 -0
- webscout/offlineAI.py +206 -0
- webscout/version.py +1 -1
- {webscout-1.2.2.dist-info → webscout-1.2.4.dist-info}/METADATA +57 -10
- {webscout-1.2.2.dist-info → webscout-1.2.4.dist-info}/RECORD +13 -12
- {webscout-1.2.2.dist-info → webscout-1.2.4.dist-info}/LICENSE.md +0 -0
- {webscout-1.2.2.dist-info → webscout-1.2.4.dist-info}/WHEEL +0 -0
- {webscout-1.2.2.dist-info → webscout-1.2.4.dist-info}/entry_points.txt +0 -0
- {webscout-1.2.2.dist-info → webscout-1.2.4.dist-info}/top_level.txt +0 -0
|
@@ -2,61 +2,79 @@ from bs4 import BeautifulSoup
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from DeepWEBS.utilsdw.logger import logger
|
|
4
4
|
|
|
5
|
-
|
|
6
5
|
class QueryResultsExtractor:
|
|
7
6
|
def __init__(self) -> None:
|
|
8
7
|
self.query_results = []
|
|
9
8
|
self.related_questions = []
|
|
10
9
|
|
|
11
10
|
def load_html(self, html_path):
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
11
|
+
try:
|
|
12
|
+
with open(html_path, "r", encoding="utf-8") as f:
|
|
13
|
+
html = f.read()
|
|
14
|
+
self.soup = BeautifulSoup(html, "html.parser")
|
|
15
|
+
except FileNotFoundError:
|
|
16
|
+
logger.error(f"File not found: {html_path}")
|
|
17
|
+
except Exception as e:
|
|
18
|
+
logger.error(f"Error loading HTML: {e}")
|
|
15
19
|
|
|
16
20
|
def extract_query_results(self):
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
21
|
+
try:
|
|
22
|
+
self.query = self.soup.find("textarea").text.strip()
|
|
23
|
+
query_result_elements = self.soup.find_all("div", class_="g")
|
|
24
|
+
for idx, result in enumerate(query_result_elements):
|
|
25
|
+
try:
|
|
26
|
+
site = result.find("cite").find_previous("span").text.strip()
|
|
27
|
+
url = result.find("a")["href"]
|
|
28
|
+
title = result.find("h3").text.strip()
|
|
29
|
+
abstract_element_conditions = [
|
|
30
|
+
{"data-sncf": "1"},
|
|
31
|
+
{"class_": "ITZIwc"},
|
|
32
|
+
]
|
|
33
|
+
for condition in abstract_element_conditions:
|
|
34
|
+
abstract_element = result.find("div", condition)
|
|
35
|
+
if abstract_element is not None:
|
|
36
|
+
abstract = abstract_element.text.strip()
|
|
37
|
+
break
|
|
38
|
+
else:
|
|
39
|
+
abstract = ""
|
|
40
|
+
logger.mesg(
|
|
41
|
+
f"{title}\n"
|
|
42
|
+
f" - {site}\n"
|
|
43
|
+
f" - {url}\n"
|
|
44
|
+
f" - {abstract}\n"
|
|
45
|
+
f"\n"
|
|
46
|
+
)
|
|
47
|
+
self.query_results.append(
|
|
48
|
+
{
|
|
49
|
+
"title": title,
|
|
50
|
+
"site": site,
|
|
51
|
+
"url": url,
|
|
52
|
+
"abstract": abstract,
|
|
53
|
+
"index": idx,
|
|
54
|
+
"type": "web",
|
|
55
|
+
}
|
|
56
|
+
)
|
|
57
|
+
except Exception as e:
|
|
58
|
+
logger.error(f"Error extracting query result: {e}")
|
|
59
|
+
logger.success(f"- {len(query_result_elements)} query results")
|
|
60
|
+
except Exception as e:
|
|
61
|
+
logger.error(f"Error extracting query results: {e}")
|
|
50
62
|
|
|
51
63
|
def extract_related_questions(self):
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
64
|
+
try:
|
|
65
|
+
related_question_elements = self.soup.find_all(
|
|
66
|
+
"div", class_="related-question-pair"
|
|
67
|
+
)
|
|
68
|
+
for question_element in related_question_elements:
|
|
69
|
+
try:
|
|
70
|
+
question = question_element.find("span").text.strip()
|
|
71
|
+
print(question)
|
|
72
|
+
self.related_questions.append(question)
|
|
73
|
+
except Exception as e:
|
|
74
|
+
logger.error(f"Error extracting related question: {e}")
|
|
75
|
+
logger.success(f"- {len(self.related_questions)} related questions")
|
|
76
|
+
except Exception as e:
|
|
77
|
+
logger.error(f"Error extracting related questions: {e}")
|
|
60
78
|
|
|
61
79
|
def extract(self, html_path):
|
|
62
80
|
self.load_html(html_path)
|
|
@@ -75,4 +93,7 @@ if __name__ == "__main__":
|
|
|
75
93
|
html_filename = "python_tutorials"
|
|
76
94
|
html_path = html_path_root / f"{html_filename}.html"
|
|
77
95
|
extractor = QueryResultsExtractor()
|
|
78
|
-
|
|
96
|
+
try:
|
|
97
|
+
extractor.extract(html_path)
|
|
98
|
+
except Exception as e:
|
|
99
|
+
logger.error(f"Error in main function: {e}")
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import requests
|
|
2
2
|
from pathlib import Path
|
|
3
|
+
from typing import Optional
|
|
4
|
+
import random
|
|
3
5
|
from DeepWEBS.utilsdw.enver import enver
|
|
4
6
|
from DeepWEBS.utilsdw.logger import logger
|
|
5
7
|
from DeepWEBS.networks.filepath_converter import QueryToFilepathConverter
|
|
6
8
|
from DeepWEBS.networks.network_configs import REQUESTS_HEADERS
|
|
7
9
|
|
|
8
|
-
|
|
9
10
|
class GoogleSearcher:
|
|
10
11
|
def __init__(self):
|
|
11
12
|
self.url = "https://www.google.com/search"
|
|
@@ -13,36 +14,39 @@ class GoogleSearcher:
|
|
|
13
14
|
self.enver.set_envs(proxies=True)
|
|
14
15
|
self.filepath_converter = QueryToFilepathConverter()
|
|
15
16
|
|
|
16
|
-
def send_request(self, result_num=10, safe=False):
|
|
17
|
-
|
|
18
|
-
|
|
17
|
+
def send_request(self, query: str, result_num: int = 10, safe: bool = False) -> requests.Response:
|
|
18
|
+
params = {
|
|
19
|
+
"q": query,
|
|
20
|
+
"num": result_num,
|
|
21
|
+
}
|
|
22
|
+
response = requests.get(
|
|
23
|
+
self.url,
|
|
19
24
|
headers=REQUESTS_HEADERS,
|
|
20
|
-
params=
|
|
21
|
-
"q": self.query,
|
|
22
|
-
"num": result_num,
|
|
23
|
-
},
|
|
25
|
+
params=params,
|
|
24
26
|
proxies=self.enver.requests_proxies,
|
|
25
27
|
)
|
|
28
|
+
response.raise_for_status() # Raise an exception for non-2xx status codes
|
|
29
|
+
return response
|
|
30
|
+
|
|
31
|
+
def save_response(self, response: requests.Response, html_path: Path) -> None:
|
|
32
|
+
html_path.parent.mkdir(parents=True, exist_ok=True)
|
|
33
|
+
logger.note(f"Saving to: [{html_path}]")
|
|
34
|
+
with html_path.open("wb") as wf:
|
|
35
|
+
wf.write(response.content)
|
|
26
36
|
|
|
27
|
-
def
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
logger.note(f"Saving to: [{self.html_path}]")
|
|
31
|
-
with open(self.html_path, "wb") as wf:
|
|
32
|
-
wf.write(self.request_response.content)
|
|
37
|
+
def search(self, query: str, result_num: int = 10, safe: bool = False, overwrite: bool = False) -> Path:
|
|
38
|
+
html_path = self.filepath_converter.convert(query)
|
|
39
|
+
logger.note(f"Searching: [{query}]")
|
|
33
40
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
self.html_path = self.filepath_converter.convert(self.query)
|
|
37
|
-
logger.note(f"Searching: [{self.query}]")
|
|
38
|
-
if self.html_path.exists() and not overwrite:
|
|
39
|
-
logger.success(f"HTML existed: {self.html_path}")
|
|
41
|
+
if html_path.exists() and not overwrite:
|
|
42
|
+
logger.success(f"HTML existed: {html_path}")
|
|
40
43
|
else:
|
|
41
|
-
self.send_request(result_num
|
|
42
|
-
self.save_response()
|
|
43
|
-
return self.html_path
|
|
44
|
+
response = self.send_request(query, result_num, safe)
|
|
45
|
+
self.save_response(response, html_path)
|
|
44
46
|
|
|
47
|
+
return html_path
|
|
45
48
|
|
|
46
49
|
if __name__ == "__main__":
|
|
47
50
|
searcher = GoogleSearcher()
|
|
48
|
-
searcher.search("python tutorials")
|
|
51
|
+
html_path = searcher.search("python tutorials")
|
|
52
|
+
print(f"HTML file saved at: {html_path}")
|
|
@@ -1,100 +1,92 @@
|
|
|
1
1
|
import concurrent.futures
|
|
2
|
+
import random
|
|
2
3
|
import requests
|
|
3
4
|
import tldextract
|
|
4
5
|
from pathlib import Path
|
|
6
|
+
from typing import List, Tuple, Dict
|
|
7
|
+
|
|
5
8
|
from DeepWEBS.utilsdw.enver import enver
|
|
6
9
|
from DeepWEBS.utilsdw.logger import logger
|
|
7
10
|
from DeepWEBS.networks.filepath_converter import UrlToFilepathConverter
|
|
8
11
|
from DeepWEBS.networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
|
|
9
12
|
|
|
10
|
-
|
|
11
13
|
class WebpageFetcher:
|
|
12
14
|
def __init__(self):
|
|
13
15
|
self.enver = enver
|
|
14
16
|
self.enver.set_envs(proxies=True)
|
|
15
17
|
self.filepath_converter = UrlToFilepathConverter()
|
|
16
18
|
|
|
17
|
-
def is_ignored_host(self, url):
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
return True
|
|
21
|
-
else:
|
|
22
|
-
return False
|
|
19
|
+
def is_ignored_host(self, url: str) -> bool:
|
|
20
|
+
host = tldextract.extract(url).registered_domain
|
|
21
|
+
return host in IGNORE_HOSTS
|
|
23
22
|
|
|
24
|
-
def send_request(self):
|
|
23
|
+
def send_request(self, url: str) -> requests.Response:
|
|
25
24
|
try:
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
25
|
+
user_agent = random.choice(REQUESTS_HEADERS["User-Agent"])
|
|
26
|
+
response = requests.get(
|
|
27
|
+
url=url,
|
|
28
|
+
headers={"User-Agent": user_agent},
|
|
29
29
|
proxies=self.enver.requests_proxies,
|
|
30
30
|
timeout=15,
|
|
31
31
|
)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
if self.request_response is None:
|
|
32
|
+
response.raise_for_status()
|
|
33
|
+
return response
|
|
34
|
+
except requests.exceptions.RequestException as e:
|
|
35
|
+
logger.warn(f"Failed to fetch: [{url}] | {e}")
|
|
36
|
+
return None
|
|
37
|
+
|
|
38
|
+
def save_response(self, response: requests.Response, html_path: Path) -> None:
|
|
39
|
+
if response is None:
|
|
42
40
|
return
|
|
43
|
-
else:
|
|
44
|
-
with open(self.html_path, "wb") as wf:
|
|
45
|
-
wf.write(self.request_response.content)
|
|
46
41
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
42
|
+
html_path.parent.mkdir(parents=True, exist_ok=True)
|
|
43
|
+
logger.success(f"Saving to: [{html_path}]")
|
|
44
|
+
with html_path.open("wb") as wf:
|
|
45
|
+
wf.write(response.content)
|
|
46
|
+
|
|
47
|
+
def fetch(self, url: str, overwrite: bool = False, output_parent: str = None) -> Path:
|
|
48
|
+
logger.note(f"Fetching: [{url}]")
|
|
49
|
+
html_path = self.filepath_converter.convert(url, parent=output_parent)
|
|
51
50
|
|
|
52
|
-
if self.is_ignored_host(
|
|
53
|
-
logger.warn(f"
|
|
54
|
-
return
|
|
51
|
+
if self.is_ignored_host(url):
|
|
52
|
+
logger.warn(f"Ignored host: [{tldextract.extract(url).registered_domain}]")
|
|
53
|
+
return html_path
|
|
55
54
|
|
|
56
|
-
if
|
|
57
|
-
logger.success(f"HTML existed: [{
|
|
55
|
+
if html_path.exists() and not overwrite:
|
|
56
|
+
logger.success(f"HTML existed: [{html_path}]")
|
|
58
57
|
else:
|
|
59
|
-
self.send_request()
|
|
60
|
-
self.save_response()
|
|
61
|
-
return self.html_path
|
|
58
|
+
response = self.send_request(url)
|
|
59
|
+
self.save_response(response, html_path)
|
|
62
60
|
|
|
61
|
+
return html_path
|
|
63
62
|
|
|
64
63
|
class BatchWebpageFetcher:
|
|
65
64
|
def __init__(self):
|
|
66
65
|
self.done_count = 0
|
|
67
66
|
self.total_count = 0
|
|
68
|
-
self.url_and_html_path_list = []
|
|
67
|
+
self.url_and_html_path_list: List[Dict[str, str]] = []
|
|
69
68
|
|
|
70
|
-
def
|
|
69
|
+
def fetch_single_webpage(self, url: str, overwrite: bool = False, output_parent: str = None) -> Tuple[str, Path]:
|
|
71
70
|
webpage_fetcher = WebpageFetcher()
|
|
72
|
-
html_path = webpage_fetcher.fetch(
|
|
73
|
-
|
|
74
|
-
)
|
|
75
|
-
self.url_and_html_path_list.append({"url": url, "html_path": html_path})
|
|
71
|
+
html_path = webpage_fetcher.fetch(url, overwrite, output_parent)
|
|
72
|
+
self.url_and_html_path_list.append({"url": url, "html_path": str(html_path)})
|
|
76
73
|
self.done_count += 1
|
|
77
74
|
logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
|
|
75
|
+
return url, html_path
|
|
78
76
|
|
|
79
|
-
def fetch(self, urls, overwrite=False, output_parent=None):
|
|
77
|
+
def fetch(self, urls: List[str], overwrite: bool = False, output_parent: str = None) -> List[Dict[str, str]]:
|
|
80
78
|
self.urls = urls
|
|
81
79
|
self.total_count = len(self.urls)
|
|
80
|
+
|
|
82
81
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
83
82
|
futures = [
|
|
84
|
-
executor.submit(
|
|
85
|
-
self.fecth_single_webpage,
|
|
86
|
-
url=url,
|
|
87
|
-
overwrite=overwrite,
|
|
88
|
-
output_parent=output_parent,
|
|
89
|
-
)
|
|
83
|
+
executor.submit(self.fetch_single_webpage, url, overwrite, output_parent)
|
|
90
84
|
for url in urls
|
|
91
85
|
]
|
|
86
|
+
concurrent.futures.wait(futures)
|
|
92
87
|
|
|
93
|
-
for idx, future in enumerate(concurrent.futures.as_completed(futures)):
|
|
94
|
-
result = future.result()
|
|
95
88
|
return self.url_and_html_path_list
|
|
96
89
|
|
|
97
|
-
|
|
98
90
|
if __name__ == "__main__":
|
|
99
91
|
urls = [
|
|
100
92
|
"https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
|
|
@@ -102,6 +94,4 @@ if __name__ == "__main__":
|
|
|
102
94
|
"https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
|
|
103
95
|
]
|
|
104
96
|
batch_webpage_fetcher = BatchWebpageFetcher()
|
|
105
|
-
batch_webpage_fetcher.fetch(
|
|
106
|
-
urls=urls, overwrite=True, output_parent="python tutorials"
|
|
107
|
-
)
|
|
97
|
+
batch_webpage_fetcher.fetch(urls=urls, overwrite=True, output_parent="python tutorials")
|
webscout/DWEBS.py
CHANGED
|
@@ -1,179 +1,197 @@
|
|
|
1
|
-
|
|
2
|
-
from pydantic import BaseModel, Field
|
|
3
|
-
from typing import Union
|
|
4
|
-
|
|
5
|
-
from DeepWEBS.utilsdw.logger import logger
|
|
6
|
-
from DeepWEBS.networks.google_searcher import GoogleSearcher
|
|
7
|
-
from DeepWEBS.networks.webpage_fetcher import BatchWebpageFetcher
|
|
8
|
-
from DeepWEBS.documents.query_results_extractor import QueryResultsExtractor
|
|
9
|
-
from DeepWEBS.documents.webpage_content_extractor import BatchWebpageContentExtractor
|
|
10
|
-
from DeepWEBS.utilsdw.logger import logger
|
|
11
|
-
import argparse
|
|
12
|
-
|
|
13
|
-
class DeepWEBS:
|
|
14
|
-
def __init__(self):
|
|
15
|
-
pass
|
|
16
|
-
|
|
17
|
-
class DeepSearch(BaseModel):
|
|
18
|
-
queries: list = Field(
|
|
19
|
-
default=[""],
|
|
20
|
-
description="(list[str]) Queries to search",
|
|
21
|
-
)
|
|
22
|
-
result_num: int = Field(
|
|
23
|
-
default=10,
|
|
24
|
-
description="(int) Number of search results",
|
|
25
|
-
)
|
|
26
|
-
safe: bool = Field(
|
|
27
|
-
default=False,
|
|
28
|
-
description="(bool) Enable SafeSearch",
|
|
29
|
-
)
|
|
30
|
-
types: list = Field(
|
|
31
|
-
default=["web"],
|
|
32
|
-
description="(list[str]) Types of search results: `web`, `image`, `videos`, `news`",
|
|
33
|
-
)
|
|
34
|
-
extract_webpage: bool = Field(
|
|
35
|
-
default=False,
|
|
36
|
-
description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
|
|
37
|
-
)
|
|
38
|
-
overwrite_query_html: bool = Field(
|
|
39
|
-
default=False,
|
|
40
|
-
description="(bool) Overwrite HTML file of query results",
|
|
41
|
-
)
|
|
42
|
-
overwrite_webpage_html: bool = Field(
|
|
43
|
-
default=False,
|
|
44
|
-
description="(bool) Overwrite HTML files of webpages from query results",
|
|
45
|
-
)
|
|
46
|
-
|
|
47
|
-
def queries_to_search_results(self, item: DeepSearch):
|
|
48
|
-
google_searcher = GoogleSearcher()
|
|
49
|
-
queries_search_results = []
|
|
50
|
-
for query in item.queries:
|
|
51
|
-
query_results_extractor = QueryResultsExtractor()
|
|
52
|
-
if not query.strip():
|
|
53
|
-
continue
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
]
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
"
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
"
|
|
171
|
-
"
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
1
|
+
|
|
2
|
+
from pydantic import BaseModel, Field
|
|
3
|
+
from typing import Union
|
|
4
|
+
|
|
5
|
+
from DeepWEBS.utilsdw.logger import logger
|
|
6
|
+
from DeepWEBS.networks.google_searcher import GoogleSearcher
|
|
7
|
+
from DeepWEBS.networks.webpage_fetcher import BatchWebpageFetcher
|
|
8
|
+
from DeepWEBS.documents.query_results_extractor import QueryResultsExtractor
|
|
9
|
+
from DeepWEBS.documents.webpage_content_extractor import BatchWebpageContentExtractor
|
|
10
|
+
from DeepWEBS.utilsdw.logger import logger
|
|
11
|
+
import argparse
|
|
12
|
+
|
|
13
|
+
class DeepWEBS:
|
|
14
|
+
def __init__(self):
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
class DeepSearch(BaseModel):
|
|
18
|
+
queries: list = Field(
|
|
19
|
+
default=[""],
|
|
20
|
+
description="(list[str]) Queries to search",
|
|
21
|
+
)
|
|
22
|
+
result_num: int = Field(
|
|
23
|
+
default=10,
|
|
24
|
+
description="(int) Number of search results",
|
|
25
|
+
)
|
|
26
|
+
safe: bool = Field(
|
|
27
|
+
default=False,
|
|
28
|
+
description="(bool) Enable SafeSearch",
|
|
29
|
+
)
|
|
30
|
+
types: list = Field(
|
|
31
|
+
default=["web"],
|
|
32
|
+
description="(list[str]) Types of search results: `web`, `image`, `videos`, `news`",
|
|
33
|
+
)
|
|
34
|
+
extract_webpage: bool = Field(
|
|
35
|
+
default=False,
|
|
36
|
+
description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
|
|
37
|
+
)
|
|
38
|
+
overwrite_query_html: bool = Field(
|
|
39
|
+
default=False,
|
|
40
|
+
description="(bool) Overwrite HTML file of query results",
|
|
41
|
+
)
|
|
42
|
+
overwrite_webpage_html: bool = Field(
|
|
43
|
+
default=False,
|
|
44
|
+
description="(bool) Overwrite HTML files of webpages from query results",
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
def queries_to_search_results(self, item: DeepSearch):
|
|
48
|
+
google_searcher = GoogleSearcher()
|
|
49
|
+
queries_search_results = []
|
|
50
|
+
for query in item.queries:
|
|
51
|
+
query_results_extractor = QueryResultsExtractor()
|
|
52
|
+
if not query.strip():
|
|
53
|
+
continue
|
|
54
|
+
try:
|
|
55
|
+
query_html_path = google_searcher.search(
|
|
56
|
+
query=query,
|
|
57
|
+
result_num=item.result_num,
|
|
58
|
+
safe=item.safe,
|
|
59
|
+
overwrite=item.overwrite_query_html,
|
|
60
|
+
)
|
|
61
|
+
except Exception as e:
|
|
62
|
+
logger.error(f"Failed to search for query '{query}': {e}")
|
|
63
|
+
continue
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
query_search_results = query_results_extractor.extract(query_html_path)
|
|
67
|
+
except Exception as e:
|
|
68
|
+
logger.error(f"Failed to extract search results for query '{query}': {e}")
|
|
69
|
+
continue
|
|
70
|
+
|
|
71
|
+
queries_search_results.append(query_search_results)
|
|
72
|
+
logger.note(queries_search_results)
|
|
73
|
+
|
|
74
|
+
if item.extract_webpage:
|
|
75
|
+
queries_search_results = self.extract_webpages(
|
|
76
|
+
queries_search_results,
|
|
77
|
+
overwrite_webpage_html=item.overwrite_webpage_html,
|
|
78
|
+
)
|
|
79
|
+
return queries_search_results
|
|
80
|
+
|
|
81
|
+
def extract_webpages(self, queries_search_results, overwrite_webpage_html=False):
|
|
82
|
+
for query_idx, query_search_results in enumerate(queries_search_results):
|
|
83
|
+
try:
|
|
84
|
+
# Fetch webpages with urls
|
|
85
|
+
batch_webpage_fetcher = BatchWebpageFetcher()
|
|
86
|
+
urls = [
|
|
87
|
+
query_result["url"]
|
|
88
|
+
for query_result in query_search_results["query_results"]
|
|
89
|
+
]
|
|
90
|
+
url_and_html_path_list = batch_webpage_fetcher.fetch(
|
|
91
|
+
urls,
|
|
92
|
+
overwrite=overwrite_webpage_html,
|
|
93
|
+
output_parent=query_search_results["query"],
|
|
94
|
+
)
|
|
95
|
+
except Exception as e:
|
|
96
|
+
logger.error(f"Failed to fetch webpages for query '{query_search_results['query']}': {e}")
|
|
97
|
+
continue
|
|
98
|
+
|
|
99
|
+
# Extract webpage contents from htmls
|
|
100
|
+
html_paths = [
|
|
101
|
+
str(url_and_html_path["html_path"])
|
|
102
|
+
for url_and_html_path in url_and_html_path_list
|
|
103
|
+
]
|
|
104
|
+
batch_webpage_content_extractor = BatchWebpageContentExtractor()
|
|
105
|
+
try:
|
|
106
|
+
html_path_and_extracted_content_list = (
|
|
107
|
+
batch_webpage_content_extractor.extract(html_paths)
|
|
108
|
+
)
|
|
109
|
+
except Exception as e:
|
|
110
|
+
logger.error(f"Failed to extract webpage contents for query '{query_search_results['query']}': {e}")
|
|
111
|
+
continue
|
|
112
|
+
|
|
113
|
+
# Build the map of url to extracted_content
|
|
114
|
+
html_path_to_url_dict = {
|
|
115
|
+
str(url_and_html_path["html_path"]): url_and_html_path["url"]
|
|
116
|
+
for url_and_html_path in url_and_html_path_list
|
|
117
|
+
}
|
|
118
|
+
url_to_extracted_content_dict = {
|
|
119
|
+
html_path_to_url_dict[
|
|
120
|
+
html_path_and_extracted_content["html_path"]
|
|
121
|
+
]: html_path_and_extracted_content["extracted_content"]
|
|
122
|
+
for html_path_and_extracted_content in html_path_and_extracted_content_list
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
# Write extracted contents (as 'text' field) to query_search_results
|
|
126
|
+
for query_result_idx, query_result in enumerate(
|
|
127
|
+
query_search_results["query_results"]
|
|
128
|
+
):
|
|
129
|
+
url = query_result["url"]
|
|
130
|
+
extracted_content = url_to_extracted_content_dict.get(url, "")
|
|
131
|
+
queries_search_results[query_idx]["query_results"][query_result_idx][
|
|
132
|
+
"text"
|
|
133
|
+
] = extracted_content
|
|
134
|
+
|
|
135
|
+
return queries_search_results
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class ArgParser(argparse.ArgumentParser):
|
|
139
|
+
def __init__(self, *args, **kwargs):
|
|
140
|
+
super(ArgParser, self).__init__(*args, **kwargs)
|
|
141
|
+
|
|
142
|
+
self.add_argument(
|
|
143
|
+
"-q",
|
|
144
|
+
"--queries",
|
|
145
|
+
type=str,
|
|
146
|
+
nargs="+",
|
|
147
|
+
required=True,
|
|
148
|
+
help="Queries to search",
|
|
149
|
+
)
|
|
150
|
+
self.add_argument(
|
|
151
|
+
"-n",
|
|
152
|
+
"--result_num",
|
|
153
|
+
type=int,
|
|
154
|
+
default=10,
|
|
155
|
+
help="Number of search results",
|
|
156
|
+
)
|
|
157
|
+
self.add_argument(
|
|
158
|
+
"-s",
|
|
159
|
+
"--safe",
|
|
160
|
+
default=False,
|
|
161
|
+
action="store_true",
|
|
162
|
+
help="Enable SafeSearch",
|
|
163
|
+
)
|
|
164
|
+
self.add_argument(
|
|
165
|
+
"-t",
|
|
166
|
+
"--types",
|
|
167
|
+
type=str,
|
|
168
|
+
nargs="+",
|
|
169
|
+
default=["web"],
|
|
170
|
+
choices=["web", "image", "videos", "news"],
|
|
171
|
+
help="Types of search results",
|
|
172
|
+
)
|
|
173
|
+
self.add_argument(
|
|
174
|
+
"-e",
|
|
175
|
+
"--extract_webpage",
|
|
176
|
+
default=False,
|
|
177
|
+
action="store_true",
|
|
178
|
+
help="Enable extracting main text contents from webpage",
|
|
179
|
+
)
|
|
180
|
+
self.add_argument(
|
|
181
|
+
"-o",
|
|
182
|
+
"--overwrite_query_html",
|
|
183
|
+
default=False,
|
|
184
|
+
action="store_true",
|
|
185
|
+
help="Overwrite HTML file of query results",
|
|
186
|
+
)
|
|
187
|
+
self.add_argument(
|
|
188
|
+
"-w",
|
|
189
|
+
"--overwrite_webpage_html",
|
|
190
|
+
default=False,
|
|
191
|
+
action="store_true",
|
|
192
|
+
help="Overwrite HTML files of webpages from query results",
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
self.args = self.parse_args()
|
|
196
|
+
|
|
197
|
+
|
webscout/__init__.py
CHANGED
|
@@ -9,6 +9,7 @@ from .webscout_search import WEBS
|
|
|
9
9
|
from .webscout_search_async import AsyncWEBS
|
|
10
10
|
from .version import __version__
|
|
11
11
|
from .DWEBS import DeepWEBS
|
|
12
|
+
from .offlineAI import GPT4ALL
|
|
12
13
|
__all__ = ["WEBS", "AsyncWEBS", "__version__", "cli"]
|
|
13
14
|
|
|
14
15
|
logging.getLogger("webscout").addHandler(logging.NullHandler())
|
webscout/offlineAI.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
from webscout.AIutel import Optimizers
|
|
2
|
+
from webscout.AIutel import Conversation
|
|
3
|
+
from webscout.AIutel import AwesomePrompts
|
|
4
|
+
from webscout.AIbase import Provider
|
|
5
|
+
from gpt4all import GPT4All
|
|
6
|
+
from gpt4all.gpt4all import empty_chat_session
|
|
7
|
+
from gpt4all.gpt4all import append_extension_if_missing
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
|
|
12
|
+
my_logger = logging.getLogger("gpt4all")
|
|
13
|
+
my_logger.setLevel(logging.CRITICAL)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class GPT4ALL(Provider):
|
|
17
|
+
def __init__(
|
|
18
|
+
self,
|
|
19
|
+
model: str,
|
|
20
|
+
is_conversation: bool = True,
|
|
21
|
+
max_tokens: int = 800,
|
|
22
|
+
temperature: float = 0.7,
|
|
23
|
+
presence_penalty: int = 0,
|
|
24
|
+
frequency_penalty: int = 1.18,
|
|
25
|
+
top_p: float = 0.4,
|
|
26
|
+
intro: str = None,
|
|
27
|
+
filepath: str = None,
|
|
28
|
+
update_file: bool = True,
|
|
29
|
+
history_offset: int = 10250,
|
|
30
|
+
act: str = None,
|
|
31
|
+
):
|
|
32
|
+
"""Instantiates GPT4ALL
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
model (str, optional): Path to LLM model (.gguf or .bin).
|
|
36
|
+
is_conversation (bool, optional): Flag for chatting conversationally. Defaults to True.
|
|
37
|
+
max_tokens (int, optional): Maximum number of tokens to be generated upon completion. Defaults to 800.
|
|
38
|
+
temperature (float, optional): Charge of the generated text's randomness. Defaults to 0.7.
|
|
39
|
+
presence_penalty (int, optional): Chances of topic being repeated. Defaults to 0.
|
|
40
|
+
frequency_penalty (int, optional): Chances of word being repeated. Defaults to 1.18.
|
|
41
|
+
top_p (float, optional): Sampling threshold during inference time. Defaults to 0.4.
|
|
42
|
+
intro (str, optional): Conversation introductory prompt. Defaults to None.
|
|
43
|
+
filepath (str, optional): Path to file containing conversation history. Defaults to None.
|
|
44
|
+
update_file (bool, optional): Add new prompts and responses to the file. Defaults to True.
|
|
45
|
+
history_offset (int, optional): Limit conversation history to this number of last texts. Defaults to 10250.
|
|
46
|
+
act (str|int, optional): Awesome prompt key or index. (Used as intro). Defaults to None.
|
|
47
|
+
"""
|
|
48
|
+
self.is_conversation = is_conversation
|
|
49
|
+
self.max_tokens_to_sample = max_tokens
|
|
50
|
+
self.model = model
|
|
51
|
+
self.temperature = temperature
|
|
52
|
+
self.presence_penalty = presence_penalty
|
|
53
|
+
self.frequency_penalty = frequency_penalty
|
|
54
|
+
self.top_p = top_p
|
|
55
|
+
self.last_response = {}
|
|
56
|
+
|
|
57
|
+
self.__available_optimizers = (
|
|
58
|
+
method
|
|
59
|
+
for method in dir(Optimizers)
|
|
60
|
+
if callable(getattr(Optimizers, method)) and not method.startswith("__")
|
|
61
|
+
)
|
|
62
|
+
Conversation.intro = (
|
|
63
|
+
AwesomePrompts().get_act(
|
|
64
|
+
act, raise_not_found=True, default=None, case_insensitive=True
|
|
65
|
+
)
|
|
66
|
+
if act
|
|
67
|
+
else intro or Conversation.intro
|
|
68
|
+
)
|
|
69
|
+
self.conversation = Conversation(
|
|
70
|
+
is_conversation, self.max_tokens_to_sample, filepath, update_file
|
|
71
|
+
)
|
|
72
|
+
self.conversation.history_offset = history_offset
|
|
73
|
+
|
|
74
|
+
def get_model_name_path():
|
|
75
|
+
import os
|
|
76
|
+
from pathlib import Path
|
|
77
|
+
|
|
78
|
+
initial_model_path = Path(append_extension_if_missing(model))
|
|
79
|
+
if initial_model_path.exists:
|
|
80
|
+
if not initial_model_path.is_absolute():
|
|
81
|
+
initial_model_path = Path(os.getcwd()) / initial_model_path
|
|
82
|
+
return os.path.split(initial_model_path.as_posix())
|
|
83
|
+
else:
|
|
84
|
+
raise FileNotFoundError(
|
|
85
|
+
"File does not exist " + initial_model_path.as_posix()
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
model_dir, model_name = get_model_name_path()
|
|
89
|
+
|
|
90
|
+
self.gpt4all = GPT4All(
|
|
91
|
+
model_name=model_name,
|
|
92
|
+
model_path=model_dir,
|
|
93
|
+
allow_download=False,
|
|
94
|
+
verbose=False,
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
def ask(
|
|
98
|
+
self,
|
|
99
|
+
prompt: str,
|
|
100
|
+
stream: bool = False,
|
|
101
|
+
raw: bool = False,
|
|
102
|
+
optimizer: str = None,
|
|
103
|
+
conversationally: bool = False,
|
|
104
|
+
) -> dict:
|
|
105
|
+
"""Chat with AI
|
|
106
|
+
|
|
107
|
+
Args:
|
|
108
|
+
prompt (str): Prompt to be send.
|
|
109
|
+
stream (bool, optional): Flag for streaming response. Defaults to False.
|
|
110
|
+
raw (bool, optional): Stream back raw response as received. Defaults to False.
|
|
111
|
+
optimizer (str, optional): Prompt optimizer name - `[code, shell_command]`. Defaults to None.
|
|
112
|
+
conversationally (bool, optional): Chat conversationally when using optimizer. Defaults to False.
|
|
113
|
+
Returns:
|
|
114
|
+
dict : {}
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"text" : "How may I help you today?"
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
"""
|
|
121
|
+
conversation_prompt = self.conversation.gen_complete_prompt(prompt)
|
|
122
|
+
if optimizer:
|
|
123
|
+
if optimizer in self.__available_optimizers:
|
|
124
|
+
conversation_prompt = getattr(Optimizers, optimizer)(
|
|
125
|
+
conversation_prompt if conversationally else prompt
|
|
126
|
+
)
|
|
127
|
+
else:
|
|
128
|
+
raise Exception(
|
|
129
|
+
f"Optimizer is not one of {self.__available_optimizers}"
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
def for_stream():
|
|
133
|
+
response = self.gpt4all.generate(
|
|
134
|
+
prompt=conversation_prompt,
|
|
135
|
+
max_tokens=self.max_tokens_to_sample,
|
|
136
|
+
temp=self.temperature,
|
|
137
|
+
top_p=self.top_p,
|
|
138
|
+
repeat_penalty=self.frequency_penalty,
|
|
139
|
+
streaming=True,
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
message_load: str = ""
|
|
143
|
+
for token in response:
|
|
144
|
+
message_load += token
|
|
145
|
+
resp: dict = dict(text=message_load)
|
|
146
|
+
yield token if raw else resp
|
|
147
|
+
self.last_response.update(resp)
|
|
148
|
+
|
|
149
|
+
self.conversation.update_chat_history(
|
|
150
|
+
prompt, self.get_message(self.last_response)
|
|
151
|
+
)
|
|
152
|
+
self.gpt4all.current_chat_session = empty_chat_session()
|
|
153
|
+
|
|
154
|
+
def for_non_stream():
|
|
155
|
+
for _ in for_stream():
|
|
156
|
+
pass
|
|
157
|
+
return self.last_response
|
|
158
|
+
|
|
159
|
+
return for_stream() if stream else for_non_stream()
|
|
160
|
+
|
|
161
|
+
def chat(
    self,
    prompt: str,
    stream: bool = False,
    optimizer: str = None,
    conversationally: bool = False,
) -> str:
    """Generate a response and return only its message text.

    Delegates to `self.ask` and passes every response through
    `self.get_message`.

    Args:
        prompt (str): Prompt to be sent.
        stream (bool, optional): Flag for streaming response. When True a
            generator is returned instead of a single string. Defaults to False.
        optimizer (str, optional): Prompt optimizer name - `[code, shell_command]`. Defaults to None.
        conversationally (bool, optional): Chat conversationally when using optimizer. Defaults to False.
    Returns:
        str: Response generated. With `stream=True`, a generator yielding the
        message extracted from each streamed response.
    """
    if stream:

        def stream_messages():
            # `self.ask` is only invoked once the caller starts iterating.
            chunks = self.ask(
                prompt, True, optimizer=optimizer, conversationally=conversationally
            )
            for chunk in chunks:
                yield self.get_message(chunk)

        return stream_messages()

    reply = self.ask(
        prompt,
        False,
        optimizer=optimizer,
        conversationally=conversationally,
    )
    return self.get_message(reply)
|
|
195
|
+
|
|
196
|
+
def get_message(self, response: dict) -> str:
    """Retrieves message only from response

    Args:
        response (dict): Response generated by `self.ask`

    Returns:
        str: Message extracted (the value stored under the `"text"` key)

    Raises:
        AssertionError: If `response` is not a dict.
        KeyError: If `response` has no `"text"` key.
    """
    # Raised explicitly rather than through an `assert` statement so the check
    # is not stripped when Python runs with -O; the exception type and message
    # are kept for callers that already catch AssertionError.
    if not isinstance(response, dict):
        raise AssertionError("Response should be of dict data-type only")
    return response["text"]
|
webscout/version.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
__version__ = "1.2.
|
|
1
|
+
__version__ = "1.2.4"
|
|
2
2
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: webscout
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.2.4
|
|
4
4
|
Summary: Search for words, documents, images, videos, news, maps and text translation using the Google, DuckDuckGo.com, yep.com, phind.com, you.com, etc Also containes AI models
|
|
5
5
|
Author: OEvortex
|
|
6
6
|
Author-email: helpingai5@gmail.com
|
|
@@ -45,6 +45,8 @@ Requires-Dist: sse-starlette
|
|
|
45
45
|
Requires-Dist: termcolor
|
|
46
46
|
Requires-Dist: tiktoken
|
|
47
47
|
Requires-Dist: tldextract
|
|
48
|
+
Requires-Dist: gpt4all
|
|
49
|
+
Requires-Dist: orjson
|
|
48
50
|
Provides-Extra: dev
|
|
49
51
|
Requires-Dist: ruff >=0.1.6 ; extra == 'dev'
|
|
50
52
|
Requires-Dist: pytest >=7.4.2 ; extra == 'dev'
|
|
@@ -69,6 +71,7 @@ Also containes AI models that you can use
|
|
|
69
71
|
- [Regions](#regions)
|
|
70
72
|
- [DeepWEBS: Advanced Web Searches](#deepwebs-advanced-web-searches)
|
|
71
73
|
- [Activating DeepWEBS](#activating-deepwebs)
|
|
74
|
+
- [Point to remember before using `DeepWEBS`](#point-to-remember-before-using-deepwebs)
|
|
72
75
|
- [Usage Example](#usage-example)
|
|
73
76
|
- [WEBS and AsyncWEBS classes](#webs-and-asyncwebs-classes)
|
|
74
77
|
- [Exceptions](#exceptions)
|
|
@@ -91,6 +94,7 @@ Also containes AI models that you can use
|
|
|
91
94
|
- [6. `BlackBox` - Search/chat With BlackBox](#6-blackbox---searchchat-with-blackbox)
|
|
92
95
|
- [7. `PERPLEXITY` - Search With PERPLEXITY](#7-perplexity---search-with-perplexity)
|
|
93
96
|
- [8. `OpenGPT` - chat With OPENGPT](#8-opengpt---chat-with-opengpt)
|
|
97
|
+
- [9. `GPT4ALL` - chat offline with Language models using gpt4all from webscout](#9-gpt4all---chat-offline-with-language-models-using-gpt4all-from-webscout)
|
|
94
98
|
- [usage of special .LLM file from webscout (webscout.LLM)](#usage-of-special-llm-file-from-webscout-webscoutllm)
|
|
95
99
|
- [`LLM`](#llm)
|
|
96
100
|
|
|
@@ -224,26 +228,33 @@ ___
|
|
|
224
228
|
|
|
225
229
|
To use the `DeepWEBS` feature, first create an instance of the `DeepWEBS` class. It is designed to be used independently of the `WEBS` class, offering a focused approach to web searches.
|
|
226
230
|
|
|
231
|
+
### Point to remember before using `DeepWEBS`
|
|
232
|
+
`DeepWEBS` is designed to extract relevant information directly from webpages and search engines. It extracts the HTML from webpages and saves it to a folder named `files` inside the `DeepWEBS` package directory, which can be found at, for example, `C:\Users\Username\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\DeepWEBS`
|
|
233
|
+
|
|
227
234
|
### Usage Example
|
|
228
235
|
|
|
229
236
|
Here's a basic example of how to use `DeepWEBS`:
|
|
230
237
|
```python
|
|
231
238
|
from webscout import DeepWEBS
|
|
232
239
|
|
|
233
|
-
|
|
234
240
|
def perform_web_search(query):
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
+
# Initialize the DeepWEBS class
|
|
242
|
+
D = DeepWEBS()
|
|
243
|
+
|
|
244
|
+
# Set up the search parameters
|
|
245
|
+
search_params = D.DeepSearch(
|
|
246
|
+
queries=[query], # Query to search
|
|
247
|
+
result_num=5, # Number of search results
|
|
248
|
+
safe=True, # Enable SafeSearch
|
|
249
|
+
types=["web"], # Search type: web
|
|
241
250
|
extract_webpage=True, # True for extracting webpages
|
|
242
251
|
overwrite_query_html=False,
|
|
243
252
|
overwrite_webpage_html=False,
|
|
244
253
|
)
|
|
245
|
-
|
|
246
|
-
|
|
254
|
+
|
|
255
|
+
# Execute the search and retrieve results
|
|
256
|
+
results = D.queries_to_search_results(search_params)
|
|
257
|
+
|
|
247
258
|
return results
|
|
248
259
|
|
|
249
260
|
def print_search_results(results):
|
|
@@ -260,8 +271,13 @@ def print_search_results(results):
|
|
|
260
271
|
print("No search results found.")
|
|
261
272
|
|
|
262
273
|
def main():
|
|
274
|
+
# Prompt the user for a search query
|
|
263
275
|
query = input("Enter your search query: ")
|
|
276
|
+
|
|
277
|
+
# Perform the web search
|
|
264
278
|
results = perform_web_search(query)
|
|
279
|
+
|
|
280
|
+
# Print the search results
|
|
265
281
|
print_search_results(results)
|
|
266
282
|
|
|
267
283
|
if __name__ == "__main__":
|
|
@@ -592,6 +608,37 @@ prompt = "tell me about india"
|
|
|
592
608
|
response_str = opengpt.chat(prompt)
|
|
593
609
|
print(response_str)
|
|
594
610
|
```
|
|
611
|
+
### 9. `GPT4ALL` - chat offline with Language models using gpt4all from webscout
|
|
612
|
+
```python
|
|
613
|
+
from webscout import GPT4ALL
|
|
614
|
+
|
|
615
|
+
# Initialize the GPT4ALL class with your model path and other optional parameters
|
|
616
|
+
gpt4all_instance = GPT4ALL(
|
|
617
|
+
model="path/to/your/model/file", # Replace with the actual path to your model file
|
|
618
|
+
is_conversation=True,
|
|
619
|
+
max_tokens=800,
|
|
620
|
+
temperature=0.7,
|
|
621
|
+
presence_penalty=0,
|
|
622
|
+
frequency_penalty=1.18,
|
|
623
|
+
top_p=0.4,
|
|
624
|
+
intro="Hello, how can I assist you today?",
|
|
625
|
+
filepath="path/to/conversation/history/file", # Optional, for conversation history
|
|
626
|
+
update_file=True,
|
|
627
|
+
history_offset=10250,
|
|
628
|
+
act=None # Optional, for using an awesome prompt as intro
|
|
629
|
+
)
|
|
630
|
+
|
|
631
|
+
# Generate a response from the AI model
|
|
632
|
+
response = gpt4all_instance.chat(
|
|
633
|
+
prompt="What is the weather like today?",
|
|
634
|
+
stream=False, # Set to True if you want to stream the response
|
|
635
|
+
optimizer=None, # Optional, specify an optimizer if needed
|
|
636
|
+
conversationally=False # Set to True for conversationally generated responses
|
|
637
|
+
)
|
|
638
|
+
|
|
639
|
+
# Print the generated response
|
|
640
|
+
print(response)
|
|
641
|
+
```
|
|
595
642
|
|
|
596
643
|
## usage of special .LLM file from webscout (webscout.LLM)
|
|
597
644
|
|
|
@@ -1,33 +1,34 @@
|
|
|
1
1
|
DeepWEBS/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
DeepWEBS/documents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
DeepWEBS/documents/query_results_extractor.py,sha256=
|
|
3
|
+
DeepWEBS/documents/query_results_extractor.py,sha256=whd0NKLpcxW_6q3SkBOhMukr1K_c1PPYN92rf5EHRPM,4049
|
|
4
4
|
DeepWEBS/documents/webpage_content_extractor.py,sha256=P4yHCkPTiBvMbORd8SKVt64rQFPJuj3iixcQoRU34Lw,5272
|
|
5
5
|
DeepWEBS/networks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
DeepWEBS/networks/filepath_converter.py,sha256=JKMBew1TYe4TVoGTqgTWerq2Pam49_9u9TVUFCTDQyk,3183
|
|
7
|
-
DeepWEBS/networks/google_searcher.py,sha256
|
|
7
|
+
DeepWEBS/networks/google_searcher.py,sha256=-AdIpVkRgemsARnOt8WPkF2Id1baVlqDHyqX2qz8Aew,1966
|
|
8
8
|
DeepWEBS/networks/network_configs.py,sha256=-Hb78_7SBx32h219FnU14qcHTvBdDUf_QAU6-RTL_e0,726
|
|
9
|
-
DeepWEBS/networks/webpage_fetcher.py,sha256=
|
|
9
|
+
DeepWEBS/networks/webpage_fetcher.py,sha256=d5paDTB3wa_w6YWmLV7RkpAj8Lh8ztuUuyfe8RuTjQg,3846
|
|
10
10
|
DeepWEBS/utilsdw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
11
|
DeepWEBS/utilsdw/enver.py,sha256=vstxg_5P3Rwo1en6oPcuc2SBiATJqxi4C7meGmw5w0M,1754
|
|
12
12
|
DeepWEBS/utilsdw/logger.py,sha256=Z0nFUcEGyU8r28yKiIyvEtO26xxpmJgbvNToTfwZecc,8174
|
|
13
13
|
webscout/AI.py,sha256=CwUCeGnNRL9STd5bAZSyIiLysorBMu065HrkY8UCzAQ,49618
|
|
14
14
|
webscout/AIbase.py,sha256=vQi2ougu5bG-QdmoYmxCQsOg7KTEgG7EF6nZh5qqUGw,2343
|
|
15
15
|
webscout/AIutel.py,sha256=cvsuw57hq3GirAiT-PjqwhAiLPf1urOzDb2szJ4bwmo,24124
|
|
16
|
-
webscout/DWEBS.py,sha256=
|
|
16
|
+
webscout/DWEBS.py,sha256=QT-7-dUgWhQ_H7EVZD53AVyXxyskoPMKCkFIpzkN56Q,7332
|
|
17
17
|
webscout/HelpingAI.py,sha256=YeZw0zYVHMcBFFPNdd3_Ghpm9ebt_EScQjHO_IIs4lg,8103
|
|
18
18
|
webscout/LLM.py,sha256=XByJPiATLA_57FBWKw18Xx_PGRCPOj-GJE96aQH1k2Y,3309
|
|
19
|
-
webscout/__init__.py,sha256=
|
|
19
|
+
webscout/__init__.py,sha256=auv4OtSXPzH_Bcocya1179UvX4CTLmUqVg3cVXszjaA,457
|
|
20
20
|
webscout/__main__.py,sha256=ZtTRgsRjUi2JOvYFLF1ZCh55Sdoz94I-BS-TlJC7WDU,126
|
|
21
21
|
webscout/cli.py,sha256=F888fdrFUQgczMBN4yMOSf6Nh-IbvkqpPhDsbnA2FtQ,17059
|
|
22
22
|
webscout/exceptions.py,sha256=4AOO5wexeL96nvUS-badcckcwrPS7UpZyAgB9vknHZE,276
|
|
23
23
|
webscout/models.py,sha256=5iQIdtedT18YuTZ3npoG7kLMwcrKwhQ7928dl_7qZW0,692
|
|
24
|
+
webscout/offlineAI.py,sha256=ieF9fQU-bWFZz5aBAQ8ZNxaCj1O1mI_w5AaAM9E3e8Y,7607
|
|
24
25
|
webscout/utils.py,sha256=c_98M4oqpb54pUun3fpGGlCerFD6ZHUbghyp5b7Mwgo,2605
|
|
25
|
-
webscout/version.py,sha256=
|
|
26
|
+
webscout/version.py,sha256=w3Y48JpCJLB-DvbXBfEkRgyEnrQoRiXGnyHDTl9pG5M,25
|
|
26
27
|
webscout/webscout_search.py,sha256=3_lli-hDb8_kCGwscK29xuUcOS833ROgpNhDzrxh0dk,3085
|
|
27
28
|
webscout/webscout_search_async.py,sha256=Y5frH0k3hLqBCR-8dn7a_b7EvxdYxn6wHiKl3jWosE0,40670
|
|
28
|
-
webscout-1.2.
|
|
29
|
-
webscout-1.2.
|
|
30
|
-
webscout-1.2.
|
|
31
|
-
webscout-1.2.
|
|
32
|
-
webscout-1.2.
|
|
33
|
-
webscout-1.2.
|
|
29
|
+
webscout-1.2.4.dist-info/LICENSE.md,sha256=mRVwJuT4SXC5O93BFdsfWBjlXjGn2Np90Zm5SocUzM0,3150
|
|
30
|
+
webscout-1.2.4.dist-info/METADATA,sha256=Zh6yfh9n8U_C2QZUYkpluwAk04H7Hj2bcsyd0EHfP9w,23100
|
|
31
|
+
webscout-1.2.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
|
32
|
+
webscout-1.2.4.dist-info/entry_points.txt,sha256=8-93eRslYrzTHs5E-6yFRJrve00C9q-SkXJD113jzRY,197
|
|
33
|
+
webscout-1.2.4.dist-info/top_level.txt,sha256=OD5YKy6Y3hldL7SmuxsiEDxAG4LgdSSWwzYk22MF9fk,18
|
|
34
|
+
webscout-1.2.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|