webscout 6.2b0-py3-none-any.whl → 6.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic.
- webscout/AIbase.py +309 -239
- webscout/Agents/functioncall.py +248 -198
- webscout/DWEBS.py +322 -178
- webscout/Extra/gguf.py +250 -60
- webscout/Extra/weather.py +172 -67
- webscout/LLM.py +279 -100
- webscout/Local/formats.py +4 -2
- webscout/Provider/Amigo.py +19 -10
- webscout/Provider/Andi.py +0 -33
- webscout/Provider/Blackboxai.py +4 -204
- webscout/Provider/Llama3.py +1 -1
- webscout/Provider/Marcus.py +137 -0
- webscout/Provider/TTI/__init__.py +2 -1
- webscout/Provider/TTI/talkai.py +116 -0
- webscout/Provider/__init__.py +10 -3
- webscout/Provider/askmyai.py +158 -0
- webscout/Provider/cerebras.py +71 -58
- webscout/Provider/geminiapi.py +208 -198
- webscout/Provider/llama3mitril.py +181 -0
- webscout/Provider/llmchat.py +203 -0
- webscout/Provider/talkai.py +196 -0
- webscout/Provider/twitterclone.py +7 -6
- webscout/cli.py +354 -346
- webscout/version.py +1 -1
- webscout-6.3.dist-info/LICENSE.md +211 -0
- {webscout-6.2b0.dist-info → webscout-6.3.dist-info}/METADATA +11 -13
- {webscout-6.2b0.dist-info → webscout-6.3.dist-info}/RECORD +31 -25
- webscout-6.2b0.dist-info/LICENSE.md +0 -50
- /webscout/Provider/TTI/{AIuncensored.py → AIuncensoredimage.py} +0 -0
- {webscout-6.2b0.dist-info → webscout-6.3.dist-info}/WHEEL +0 -0
- {webscout-6.2b0.dist-info → webscout-6.3.dist-info}/entry_points.txt +0 -0
- {webscout-6.2b0.dist-info → webscout-6.3.dist-info}/top_level.txt +0 -0
webscout/DWEBS.py
CHANGED
@@ -1,179 +1,323 @@
-from bs4 import BeautifulSoup
-import requests
-from typing import Dict, List, Optional, Union
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from urllib.parse import quote
-from termcolor import colored
-import time
-import random
+from bs4 import BeautifulSoup
+import requests
+from typing import Dict, List, Optional, Union, Any
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from urllib.parse import quote, urljoin
+from termcolor import colored
+import time
+import random
+import json
+import os
+from datetime import datetime, timedelta
+from functools import lru_cache
+import logging
+from tenacity import retry, stop_after_attempt, wait_exponential
+
+class GoogleS:
+    """
+    Enhanced Google Search class with support for web search, image search, and advanced filters.
+    """
+
+    SEARCH_TYPES = {
+        "web": "https://www.google.com/search",
+        "image": "https://www.google.com/images",
+        "news": "https://www.google.com/news",
+    }
+
+    def __init__(
+        self,
+        headers: Optional[Dict[str, str]] = None,
+        proxy: Optional[str] = None,
+        timeout: Optional[int] = 10,
+        max_workers: int = 20,
+        cache_dir: Optional[str] = None,
+        rate_limit: float = 0.01
+    ):
+        """
+        Initialize the GoogleS object with enhanced features.
+
+        Args:
+            cache_dir: Directory to store search result cache
+            rate_limit: Minimum time between requests in seconds
+        """
+        self.proxy = proxy
+        self.headers = headers if headers else {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
+        }
+        self.headers["Referer"] = "https://www.google.com/"
+        self.client = requests.Session()
+        self.client.headers.update(self.headers)
+        if proxy:
+            self.client.proxies.update({"http": proxy, "https": proxy})
+        self.timeout = timeout
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+        self.cache_dir = cache_dir
+        if cache_dir and not os.path.exists(cache_dir):
+            os.makedirs(cache_dir)
+        self.last_request_time = 0
+        self.rate_limit = rate_limit
+
+        # Setup logging
+        logging.basicConfig(level=logging.INFO)
+        self.logger = logging.getLogger(__name__)
+
+    def _respect_rate_limit(self):
+        """Ensure minimum time between requests"""
+        current_time = time.time()
+        time_since_last = current_time - self.last_request_time
+        if time_since_last < self.rate_limit:
+            time.sleep(self.rate_limit - time_since_last)
+        self.last_request_time = time.time()
+
+    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+    def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
+                 data: Optional[Union[Dict[str, str], bytes]] = None) -> bytes:
+        """
+        Makes an HTTP request with retry logic and rate limiting.
+        """
+        self._respect_rate_limit()
+        try:
+            resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
+            resp.raise_for_status()
+            return resp.content
+        except requests.exceptions.RequestException as ex:
+            self.logger.error(f"Request failed: {url} - {str(ex)}")
+            raise
+
+    @lru_cache(maxsize=100)
+    def _cache_key(self, query: str, **kwargs) -> str:
+        """Generate a cache key from search parameters"""
+        cache_data = {'query': query, **kwargs}
+        return json.dumps(cache_data, sort_keys=True)
+
+    def _get_cached_results(self, cache_key: str) -> Optional[List[Dict[str, Any]]]:
+        """Retrieve cached results if they exist and are not expired"""
+        if not self.cache_dir:
+            return None
+        cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
+        if os.path.exists(cache_file):
+            with open(cache_file, 'r') as f:
+                cached_data = json.load(f)
+                if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
+                    return cached_data['results']
+        return None
+
+    def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
+        """Cache search results"""
+        if not self.cache_dir:
+            return
+        cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
+        with open(cache_file, 'w') as f:
+            json.dump({
+                'timestamp': datetime.now().isoformat(),
+                'results': results
+            }, f)
+
+    def search_images(
+        self,
+        query: str,
+        max_results: int = 10,
+        size: Optional[str] = None, # large, medium, icon
+        color: Optional[str] = None, # color, gray, transparent
+        type_filter: Optional[str] = None, # face, photo, clipart, lineart
+        **kwargs
+    ) -> List[Dict[str, str]]:
+        """
+        Perform an image search and return results.
+
+        Args:
+            size: Filter by image size
+            color: Filter by color
+            type_filter: Filter by image type
+        """
+        params = {
+            "q": query,
+            "tbm": "isch",
+            "num": max_results
+        }
+
+        if size:
+            params["tbs"] = f"isz:{size}"
+        if color:
+            params["tbs"] = f"ic:{color}"
+        if type_filter:
+            params["tbs"] = f"itp:{type_filter}"
+
+        content = self._get_url("GET", self.SEARCH_TYPES["image"], params=params)
+        soup = BeautifulSoup(content, 'lxml')
+
+        results = []
+        for img in soup.find_all("img", class_="rg_i"):
+            if len(results) >= max_results:
+                break
+
+            img_data = {
+                "thumbnail": img.get("src", ""),
+                "title": img.get("alt", ""),
+                "type": "image"
+            }
+
+            # Extract full resolution image URL if available
+            parent = img.parent
+            if parent and parent.get("href"):
+                img_data["full_url"] = urljoin("https://www.google.com", parent["href"])
+
+            results.append(img_data)
+
+        return results
+
+    def search(
+        self,
+        query: str,
+        region: str = "us-en",
+        language: str = "en",
+        safe: str = "off",
+        time_period: Optional[str] = None,
+        max_results: int = 10,
+        extract_text: bool = False,
+        max_text_length: Optional[int] = 100,
+        site: Optional[str] = None, # Search within specific site
+        file_type: Optional[str] = None, # Filter by file type
+        sort_by: str = "relevance", # relevance, date
+        exclude_terms: Optional[List[str]] = None, # Terms to exclude
+        exact_phrase: Optional[str] = None, # Exact phrase match
+    ) -> List[Dict[str, Union[str, int]]]:
+        """
+        Enhanced search with additional filters and options.
+
+        Args:
+            site: Limit search to specific website
+            file_type: Filter by file type (pdf, doc, etc.)
+            sort_by: Sort results by relevance or date
+            exclude_terms: List of terms to exclude from search
+            exact_phrase: Exact phrase to match
+        """
+        # Build advanced query
+        advanced_query = query
+        if site:
+            advanced_query += f" site:{site}"
+        if file_type:
+            advanced_query += f" filetype:{file_type}"
+        if exclude_terms:
+            advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
+        if exact_phrase:
+            advanced_query = f'"{exact_phrase}"' + advanced_query
+
+        # Check cache first
+        cache_key = self._cache_key(advanced_query, region=region, language=language,
+                                    safe=safe, time_period=time_period, sort_by=sort_by)
+        cached_results = self._get_cached_results(cache_key)
+        if cached_results:
+            return cached_results[:max_results]
+
+        # Continue with regular search implementation...
+        results = []
+        futures = []
+        start = 0
+
+        while len(results) < max_results:
+            params = {
+                "q": advanced_query,
+                "num": 10,
+                "hl": language,
+                "start": start,
+                "safe": safe,
+                "gl": region,
+            }
+            if time_period:
+                params["tbs"] = f"qdr:{time_period}"
+
+            futures.append(self._executor.submit(self._get_url, "GET", self.SEARCH_TYPES["web"], params=params))
+            start += 10
+
+            for future in as_completed(futures):
+                try:
+                    resp_content = future.result()
+                    soup = BeautifulSoup(resp_content, 'lxml') # Use lxml parser
+                    result_blocks = soup.find_all("div", class_="g")
+
+                    if not result_blocks:
+                        break
+
+                    # Extract links and titles first
+                    for result_block in result_blocks:
+                        link = result_block.find("a", href=True)
+                        title = result_block.find("h3")
+                        description_box = result_block.find(
+                            "div", {"style": "-webkit-line-clamp:2"}
+                        )
+
+                        if link and title and description_box:
+                            url = link["href"]
+                            results.append({
+                                "title": title.text,
+                                "href": url,
+                                "abstract": description_box.text,
+                                "index": len(results),
+                                "type": "web",
+                                "visible_text": "" # Initialize visible_text as empty string
+                            })
+
+                            if len(results) >= max_results:
+                                break # Stop if we have enough results
+
+                    # Parallelize text extraction if needed
+                    if extract_text:
+                        with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
+                            extraction_futures = [
+                                text_extractor.submit(self._extract_text_from_webpage,
+                                                      self._get_url("GET", result['href']),
+                                                      max_characters=max_text_length)
+                                for result in results
+                                if 'href' in result
+                            ]
+                            for i, future in enumerate(as_completed(extraction_futures)):
+                                try:
+                                    results[i]['visible_text'] = future.result()
+                                except Exception as e:
+                                    print(f"Error extracting text: {e}")
+
+                except Exception as e:
+                    print(f"Error: {e}")
+
+        # Cache results before returning
+        self._cache_results(cache_key, results)
+        return results
+
+    def get_search_suggestions(self, query: str) -> List[str]:
+        """Get search suggestions for a query"""
+        params = {
+            "client": "chrome",
+            "q": query
+        }
+        content = self._get_url("GET", "https://suggestqueries.google.com/complete/search",
+                                params=params)
+        suggestions = json.loads(content.decode('utf-8'))[1]
+        return suggestions
+
+    def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
+        """
+        Extracts visible text from HTML content using lxml parser.
+        """
+        soup = BeautifulSoup(html_content, 'lxml') # Use lxml parser
+        for tag in soup(["script", "style", "header", "footer", "nav"]):
+            tag.extract()
+        visible_text = soup.get_text(strip=True)
+        if max_characters:
+            visible_text = visible_text[:max_characters]
+        return visible_text
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.client.close()
+        self._executor.shutdown()
+
+
+if __name__ == "__main__":
+    from rich import print
+    searcher = GoogleS()
+    results = searcher.search("HelpingAI-9B", max_results=200, extract_text=False, max_text_length=200)
+    for result in results:
         print(result)
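For reference, the rewritten GoogleS class added in this file can be exercised roughly as follows. This is a minimal sketch based only on the signatures visible in the diff above (the constructor, search, search_images, and get_search_suggestions); it assumes the class is importable as webscout.DWEBS.GoogleS and that the dependencies the new code imports (tenacity, plus lxml for the BeautifulSoup parser) are installed. The cache directory, site, and query values are illustrative.

from webscout.DWEBS import GoogleS

# Usage sketch; parameter names are taken from the diffed signatures above.
with GoogleS(cache_dir=".search_cache", rate_limit=0.5, timeout=15) as searcher:
    # Web search limited to one site, excluding a term; with cache_dir set,
    # results are written to JSON files and reused for up to 24 hours.
    results = searcher.search(
        "HelpingAI-9B",
        site="huggingface.co",
        exclude_terms=["gguf"],
        max_results=5,
        extract_text=False,
    )
    for r in results:
        print(r["title"], r["href"])

    # Image search with a size filter (note: size, color, and type_filter each
    # overwrite the same "tbs" parameter, so only the last supplied filter applies).
    images = searcher.search_images("HelpingAI-9B", max_results=5, size="large")

    # Autocomplete suggestions from Google's suggest endpoint.
    print(searcher.get_search_suggestions("HelpingAI"))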