webscout 6.2b0__py3-none-any.whl → 6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of webscout has been flagged as possibly problematic.
Files changed (97)
  1. webscout/AIauto.py +191 -176
  2. webscout/AIbase.py +112 -239
  3. webscout/AIutel.py +488 -1130
  4. webscout/Agents/functioncall.py +248 -198
  5. webscout/Bing_search.py +250 -153
  6. webscout/DWEBS.py +454 -178
  7. webscout/Extra/__init__.py +2 -1
  8. webscout/Extra/autocoder/__init__.py +9 -0
  9. webscout/Extra/autocoder/autocoder_utiles.py +121 -0
  10. webscout/Extra/autocoder/rawdog.py +681 -0
  11. webscout/Extra/autollama.py +246 -195
  12. webscout/Extra/gguf.py +441 -226
  13. webscout/Extra/weather.py +172 -67
  14. webscout/LLM.py +442 -100
  15. webscout/Litlogger/__init__.py +681 -0
  16. webscout/Local/formats.py +4 -2
  17. webscout/Provider/Amigo.py +19 -10
  18. webscout/Provider/Andi.py +0 -33
  19. webscout/Provider/Blackboxai.py +4 -204
  20. webscout/Provider/DARKAI.py +1 -1
  21. webscout/Provider/EDITEE.py +1 -1
  22. webscout/Provider/Llama3.py +1 -1
  23. webscout/Provider/Marcus.py +137 -0
  24. webscout/Provider/NinjaChat.py +1 -1
  25. webscout/Provider/PI.py +221 -207
  26. webscout/Provider/Perplexity.py +598 -598
  27. webscout/Provider/RoboCoders.py +206 -0
  28. webscout/Provider/TTI/AiForce/__init__.py +22 -0
  29. webscout/Provider/TTI/AiForce/async_aiforce.py +257 -0
  30. webscout/Provider/TTI/AiForce/sync_aiforce.py +242 -0
  31. webscout/Provider/TTI/Nexra/__init__.py +22 -0
  32. webscout/Provider/TTI/Nexra/async_nexra.py +286 -0
  33. webscout/Provider/TTI/Nexra/sync_nexra.py +258 -0
  34. webscout/Provider/TTI/PollinationsAI/__init__.py +23 -0
  35. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +330 -0
  36. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +285 -0
  37. webscout/Provider/TTI/__init__.py +3 -4
  38. webscout/Provider/TTI/artbit/__init__.py +22 -0
  39. webscout/Provider/TTI/artbit/async_artbit.py +184 -0
  40. webscout/Provider/TTI/artbit/sync_artbit.py +176 -0
  41. webscout/Provider/TTI/blackbox/__init__.py +4 -0
  42. webscout/Provider/TTI/blackbox/async_blackbox.py +212 -0
  43. webscout/Provider/TTI/{blackboximage.py → blackbox/sync_blackbox.py} +199 -153
  44. webscout/Provider/TTI/deepinfra/__init__.py +4 -0
  45. webscout/Provider/TTI/deepinfra/async_deepinfra.py +227 -0
  46. webscout/Provider/TTI/deepinfra/sync_deepinfra.py +199 -0
  47. webscout/Provider/TTI/huggingface/__init__.py +22 -0
  48. webscout/Provider/TTI/huggingface/async_huggingface.py +199 -0
  49. webscout/Provider/TTI/huggingface/sync_huggingface.py +195 -0
  50. webscout/Provider/TTI/imgninza/__init__.py +4 -0
  51. webscout/Provider/TTI/imgninza/async_ninza.py +214 -0
  52. webscout/Provider/TTI/{imgninza.py → imgninza/sync_ninza.py} +209 -136
  53. webscout/Provider/TTI/talkai/__init__.py +4 -0
  54. webscout/Provider/TTI/talkai/async_talkai.py +229 -0
  55. webscout/Provider/TTI/talkai/sync_talkai.py +207 -0
  56. webscout/Provider/__init__.py +146 -132
  57. webscout/Provider/askmyai.py +158 -0
  58. webscout/Provider/cerebras.py +227 -206
  59. webscout/Provider/geminiapi.py +208 -198
  60. webscout/Provider/llama3mitril.py +180 -0
  61. webscout/Provider/llmchat.py +203 -0
  62. webscout/Provider/mhystical.py +176 -0
  63. webscout/Provider/perplexitylabs.py +265 -0
  64. webscout/Provider/talkai.py +196 -0
  65. webscout/Provider/twitterclone.py +251 -244
  66. webscout/Provider/typegpt.py +359 -0
  67. webscout/__init__.py +28 -23
  68. webscout/__main__.py +5 -5
  69. webscout/cli.py +327 -347
  70. webscout/conversation.py +227 -0
  71. webscout/exceptions.py +161 -29
  72. webscout/litagent/__init__.py +172 -0
  73. webscout/litprinter/__init__.py +831 -0
  74. webscout/optimizers.py +270 -0
  75. webscout/prompt_manager.py +279 -0
  76. webscout/swiftcli/__init__.py +810 -0
  77. webscout/transcriber.py +479 -551
  78. webscout/update_checker.py +125 -0
  79. webscout/version.py +1 -1
  80. webscout-6.4.dist-info/LICENSE.md +211 -0
  81. {webscout-6.2b0.dist-info → webscout-6.4.dist-info}/METADATA +34 -55
  82. webscout-6.4.dist-info/RECORD +154 -0
  83. webscout/Provider/TTI/AIuncensored.py +0 -103
  84. webscout/Provider/TTI/Nexra.py +0 -120
  85. webscout/Provider/TTI/PollinationsAI.py +0 -138
  86. webscout/Provider/TTI/WebSimAI.py +0 -142
  87. webscout/Provider/TTI/aiforce.py +0 -160
  88. webscout/Provider/TTI/artbit.py +0 -141
  89. webscout/Provider/TTI/deepinfra.py +0 -148
  90. webscout/Provider/TTI/huggingface.py +0 -155
  91. webscout/models.py +0 -23
  92. webscout-6.2b0.dist-info/LICENSE.md +0 -50
  93. webscout-6.2b0.dist-info/RECORD +0 -118
  94. /webscout/{g4f.py → gpt4free.py} +0 -0
  95. {webscout-6.2b0.dist-info → webscout-6.4.dist-info}/WHEEL +0 -0
  96. {webscout-6.2b0.dist-info → webscout-6.4.dist-info}/entry_points.txt +0 -0
  97. {webscout-6.2b0.dist-info → webscout-6.4.dist-info}/top_level.txt +0 -0
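
Note on entry 94 above: webscout/g4f.py is renamed to webscout/gpt4free.py with no content change (+0 -0), so only the import path moves. A minimal migration sketch, assuming the module's public names are otherwise unchanged:

    # Hypothetical migration for the g4f.py -> gpt4free.py rename (entry 94 above).
    # webscout 6.2b0:
    #     from webscout.g4f import *
    # webscout 6.4:
    from webscout.gpt4free import *  # same module contents, new path (assumption: no re-export changes)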
webscout/DWEBS.py CHANGED
@@ -1,179 +1,455 @@
- from bs4 import BeautifulSoup
- import requests
- from typing import Dict, List, Optional, Union
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from urllib.parse import quote
- from termcolor import colored
- import time
- import random
-
- class GoogleS:
-     """
-     Class to perform Google searches and retrieve results.
-     """
-
-     def __init__(
-         self,
-         headers: Optional[Dict[str, str]] = None,
-         proxy: Optional[str] = None,
-         timeout: Optional[int] = 10,
-         max_workers: int = 20  # Increased max workers for thread pool
-     ):
-         """Initializes the GoogleS object."""
-         self.proxy = proxy
-         self.headers = headers if headers else {
-             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"
-         }
-         self.headers["Referer"] = "https://www.google.com/"
-         self.client = requests.Session()
-         self.client.headers.update(self.headers)
-         self.client.proxies.update({"http": self.proxy, "https": self.proxy})
-         self.timeout = timeout
-         self._executor = ThreadPoolExecutor(max_workers=max_workers)
-
-     def __enter__(self):
-         return self
-
-     def __exit__(self, exc_type, exc_val, exc_tb):
-         self.client.close()
-
-     def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
-                  data: Optional[Union[Dict[str, str], bytes]] = None) -> bytes:
-         """
-         Makes an HTTP request and returns the response content.
-         """
-         try:
-             resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
-         except Exception as ex:
-             raise Exception(f"{url} {type(ex).__name__}: {ex}") from ex
-         if resp.status_code == 200:
-             return resp.content
-         raise Exception(f"{resp.url} returned status code {resp.status_code}. {params=} {data=}")
-
-     def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
-         """
-         Extracts visible text from HTML content using lxml parser.
-         """
-         soup = BeautifulSoup(html_content, 'lxml')  # Use lxml parser
-         for tag in soup(["script", "style", "header", "footer", "nav"]):
-             tag.extract()
-         visible_text = soup.get_text(strip=True)
-         if max_characters:
-             visible_text = visible_text[:max_characters]
-         return visible_text
-
-     def search(
-         self,
-         query: str,
-         region: str = "us-en",
-         language: str = "en",
-         safe: str = "off",
-         time_period: Optional[str] = None,
-         max_results: int = 10,
-         extract_text: bool = False,
-         max_text_length: Optional[int] = 100,
-     ) -> List[Dict[str, Union[str, int]]]:
-         """
-         Performs a Google search and returns the results.
-
-         Args:
-             query (str): The search query.
-             region (str, optional): The region to search in (e.g., "us-en"). Defaults to "us-en".
-             language (str, optional): The language of the search results (e.g., "en"). Defaults to "en".
-             safe (str, optional): Safe search setting ("off", "active"). Defaults to "off".
-             time_period (Optional[str], optional): Time period filter (e.g., "h" for past hour, "d" for past day).
-                 Defaults to None.
-             max_results (int, optional): The maximum number of results to retrieve. Defaults to 10.
-             extract_text (bool, optional): Whether to extract text from the linked web pages. Defaults to False.
-             max_text_length (Optional[int], optional): The maximum length of the extracted text (in characters).
-                 Defaults to 100.
-
-         Returns:
-             List[Dict[str, Union[str, int]]]: A list of dictionaries, each representing a search result, containing:
-                 - 'title': The title of the result.
-                 - 'href': The URL of the result.
-                 - 'abstract': The description snippet of the result.
-                 - 'index': The index of the result in the list.
-                 - 'type': The type of result (currently always "web").
-                 - 'visible_text': The extracted text from the web page (if `extract_text` is True).
-         """
-         assert query, "Query cannot be empty."
-
-         results = []
-         futures = []
-         start = 0
-
-         while len(results) < max_results:
-             params = {
-                 "q": query,
-                 "num": 10,
-                 "hl": language,
-                 "start": start,
-                 "safe": safe,
-                 "gl": region,
-             }
-             if time_period:
-                 params["tbs"] = f"qdr:{time_period}"
-
-             futures.append(self._executor.submit(self._get_url, "GET", "https://www.google.com/search", params=params))
-             start += 10
-
-             for future in as_completed(futures):
-                 try:
-                     resp_content = future.result()
-                     soup = BeautifulSoup(resp_content, 'lxml')  # Use lxml parser
-                     result_blocks = soup.find_all("div", class_="g")
-
-                     if not result_blocks:
-                         break
-
-                     # Extract links and titles first
-                     for result_block in result_blocks:
-                         link = result_block.find("a", href=True)
-                         title = result_block.find("h3")
-                         description_box = result_block.find(
-                             "div", {"style": "-webkit-line-clamp:2"}
-                         )
-
-                         if link and title and description_box:
-                             url = link["href"]
-                             results.append({
-                                 "title": title.text,
-                                 "href": url,
-                                 "abstract": description_box.text,
-                                 "index": len(results),
-                                 "type": "web",
-                                 "visible_text": ""  # Initialize visible_text as empty string
-                             })
-
-                             if len(results) >= max_results:
-                                 break  # Stop if we have enough results
-
-                     # Parallelize text extraction if needed
-                     if extract_text:
-                         with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
-                             extraction_futures = [
-                                 text_extractor.submit(self._extract_text_from_webpage,
-                                                       self._get_url("GET", result['href']),
-                                                       max_characters=max_text_length)
-                                 for result in results
-                                 if 'href' in result
-                             ]
-                             for i, future in enumerate(as_completed(extraction_futures)):
-                                 try:
-                                     results[i]['visible_text'] = future.result()
-                                 except Exception as e:
-                                     print(f"Error extracting text: {e}")
-
-                 except Exception as e:
-                     print(f"Error: {e}")
-
-         return results
-
-
- if __name__ == "__main__":
-     from rich import print
-     searcher = GoogleS()
-     results = searcher.search("HelpingAI-9B", max_results=20, extract_text=False, max_text_length=200)
-     for result in results:
+ from bs4 import BeautifulSoup
+ import requests
+ from typing import Dict, List, Optional, Union, Any
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from urllib.parse import quote, urljoin
+
+ import time
+ import random
+ import json
+ import os
+ from datetime import datetime, timedelta
+ from functools import lru_cache
+ from .Litlogger import LitLogger, LogFormat, ColorScheme
+ from tenacity import retry, stop_after_attempt, wait_exponential
+
+ class GoogleS:
+     """A Python interface for Google search with advanced features 🔥
+
+     The GoogleS class provides a powerful interface to perform web searches, image searches,
+     and advanced filtering on Google. Built with love by HAI to keep it 💯
+
+     Basic Usage:
+         >>> from webscout.DWEBS import GoogleS
+         >>> searcher = GoogleS()
+         >>> # Simple web search
+         >>> results = searcher.search("Python programming")
+         >>> for result in results:
+         ...     print(f"Title: {result['title']}")
+         ...     print(f"URL: {result['href']}")
+         ...     print(f"Description: {result['abstract']}")
+
+     Advanced Web Search:
+         >>> # Search with filters
+         >>> results = searcher.search(
+         ...     query="Python tutorials",
+         ...     site="github.com",
+         ...     file_type="pdf",
+         ...     time_period="month",
+         ...     max_results=5
+         ... )
+         >>> # Example response format:
+         >>> {
+         ...     'title': 'Python Tutorial',
+         ...     'href': 'https://example.com/python-tutorial',
+         ...     'abstract': 'Comprehensive Python tutorial covering basics to advanced topics',
+         ...     'index': 0,
+         ...     'type': 'web',
+         ...     'visible_text': ''  # Optional: Contains webpage text if extract_text=True
+         ... }
+
+     Image Search:
+         >>> # Search for images
+         >>> images = searcher.search_images(
+         ...     query="cute puppies",
+         ...     size="large",
+         ...     color="color",
+         ...     type_filter="photo",
+         ...     max_results=5
+         ... )
+         >>> # Example response format:
+         >>> {
+         ...     'title': 'Cute Puppy Image',
+         ...     'thumbnail': 'https://example.com/puppy-thumb.jpg',
+         ...     'full_url': 'https://example.com/puppy-full.jpg',
+         ...     'type': 'image'
+         ... }
+
+     Features:
+         - Web Search: Get detailed web results with title, URL, and description
+         - Image Search: Find images with thumbnails and full-resolution URLs
+         - Advanced Filters: Site-specific search, file types, time periods
+         - Rate Limiting: Smart request handling to avoid blocks
+         - Caching: Save results for faster repeat searches
+         - Retry Logic: Automatic retry on temporary failures
+         - Logging: Optional LitLogger integration for beautiful console output
+         - Proxy Support: Use custom proxies for requests
+         - Concurrent Processing: Multi-threaded requests for better performance
+
+     Response Format:
+         Web Search Results:
+         {
+             'title': str,        # Title of the webpage
+             'href': str,         # URL of the webpage
+             'abstract': str,     # Brief description or snippet
+             'index': int,        # Result position
+             'type': 'web',       # Result type identifier
+             'visible_text': str  # Full page text (if extract_text=True)
+         }
+
+         Image Search Results:
+         {
+             'title': str,      # Image title or description
+             'thumbnail': str,  # Thumbnail image URL
+             'full_url': str,   # Full resolution image URL
+             'type': 'image'    # Result type identifier
+         }
+     """
+
+     SEARCH_TYPES = {
+         "web": "https://www.google.com/search",
+         "image": "https://www.google.com/images",
+         "news": "https://www.google.com/news",
+     }
+
+     def __init__(
+         self,
+         headers: Optional[Dict[str, str]] = None,
+         proxy: Optional[str] = None,
+         timeout: Optional[int] = 10,
+         max_workers: int = 20,
+         cache_dir: Optional[str] = None,
+         rate_limit: float = 0.01,
+         use_litlogger: bool = False
+     ):
+         """
+         Initialize the GoogleS object with enhanced features.
+
+         Args:
+             cache_dir: Directory to store search result cache
+             rate_limit: Minimum time between requests in seconds
+             use_litlogger: Whether to use LitLogger for logging (default: False)
+         """
+         self.proxy = proxy
+         self.headers = headers if headers else {
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
+         }
+         self.headers["Referer"] = "https://www.google.com/"
+         self.client = requests.Session()
+         self.client.headers.update(self.headers)
+         if proxy:
+             self.client.proxies.update({"http": proxy, "https": proxy})
+         self.timeout = timeout
+         self._executor = ThreadPoolExecutor(max_workers=max_workers)
+         self.cache_dir = cache_dir
+         if cache_dir and not os.path.exists(cache_dir):
+             os.makedirs(cache_dir)
+         self.last_request_time = 0
+         self.rate_limit = rate_limit
+         self.use_litlogger = use_litlogger
+
+         # Setup enhanced logging with LitLogger if enabled
+         if self.use_litlogger:
+             self.logger = LitLogger(
+                 name="GoogleS",
+                 format=LogFormat.MODERN_EMOJI,
+                 color_scheme=ColorScheme.CYBERPUNK,
+                 console_output=True
+             )
+
+     def _respect_rate_limit(self):
+         """Ensure minimum time between requests"""
+         current_time = time.time()
+         time_since_last = current_time - self.last_request_time
+         if time_since_last < self.rate_limit:
+             time.sleep(self.rate_limit - time_since_last)
+         self.last_request_time = time.time()
+
+     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
+     def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
+                  data: Optional[Union[Dict[str, str], bytes]] = None) -> bytes:
+         """
+         Makes an HTTP request with retry logic and rate limiting.
+         """
+         self._respect_rate_limit()
+         try:
+             if self.use_litlogger:
+                 self.logger.debug(f"Making {method} request to {url}")
+             resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
+             resp.raise_for_status()
+             if self.use_litlogger:
+                 self.logger.success(f"Request successful: {resp.status_code}")
+             return resp.content
+         except requests.exceptions.RequestException as ex:
+             if self.use_litlogger:
+                 self.logger.error(f"Request failed: {url} - {str(ex)}")
+             raise
+
+     @lru_cache(maxsize=100)
+     def _cache_key(self, query: str, **kwargs) -> str:
+         """Generate a cache key from search parameters"""
+         cache_data = {'query': query, **kwargs}
+         return json.dumps(cache_data, sort_keys=True)
+
+     def _get_cached_results(self, cache_key: str) -> Optional[List[Dict[str, Any]]]:
+         """Retrieve cached results if they exist and are not expired"""
+         if not self.cache_dir:
+             return None
+         cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
+         if os.path.exists(cache_file):
+             with open(cache_file, 'r') as f:
+                 cached_data = json.load(f)
+                 if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
+                     if self.use_litlogger:
+                         self.logger.info(f"Using cached results for: {cache_key}")
+                     return cached_data['results']
+         if self.use_litlogger:
+             self.logger.debug(f"No valid cache found for: {cache_key}")
+         return None
+
+     def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
+         """Cache search results"""
+         if not self.cache_dir:
+             return
+         cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
+         with open(cache_file, 'w') as f:
+             json.dump({
+                 'timestamp': datetime.now().isoformat(),
+                 'results': results
+             }, f)
+
+     def search_images(
+         self,
+         query: str,
+         max_results: int = 10,
+         size: Optional[str] = None,
+         color: Optional[str] = None,
+         type_filter: Optional[str] = None,
+         **kwargs
+     ) -> List[Dict[str, str]]:
+         """Search for images on Google with style! 🖼️
+
+         Args:
+             query (str): What you're looking for fam
+             max_results (int): How many results you want (default: 10)
+             size (Optional[str]): Image size filter
+                 - 'large': Big pics
+                 - 'medium': Medium sized
+                 - 'icon': Small icons
+             color (Optional[str]): Color filter
+                 - 'color': Full color
+                 - 'gray': Black and white
+                 - 'transparent': Transparent background
+             type_filter (Optional[str]): Type of image
+                 - 'face': Just faces
+                 - 'photo': Real photos
+                 - 'clipart': Vector art
+                 - 'lineart': Line drawings
+
+         Returns:
+             List[Dict[str, str]]: List of image results with these keys:
+                 - 'thumbnail': Small preview URL
+                 - 'full_url': Full resolution image URL
+                 - 'title': Image title/description
+                 - 'type': Always 'image'
+
+         Example:
+             >>> searcher = GoogleS()
+             >>> # Find some cool nature pics
+             >>> images = searcher.search_images(
+             ...     query="beautiful landscapes",
+             ...     size="large",
+             ...     color="color",
+             ...     max_results=5
+             ... )
+             >>> for img in images:
+             ...     print(f"Found: {img['title']}")
+             ...     print(f"URL: {img['full_url']}")
+         """
+         params = {
+             "q": query,
+             "tbm": "isch",
+             "num": max_results
+         }
+
+         if size:
+             params["tbs"] = f"isz:{size}"
+         if color:
+             params["tbs"] = f"ic:{color}"
+         if type_filter:
+             params["tbs"] = f"itp:{type_filter}"
+
+         content = self._get_url("GET", self.SEARCH_TYPES["image"], params=params)
+         soup = BeautifulSoup(content, 'lxml')
+
+         results = []
+         for img in soup.find_all("img", class_="rg_i"):
+             if len(results) >= max_results:
+                 break
+
+             img_data = {
+                 "thumbnail": img.get("src", ""),
+                 "title": img.get("alt", ""),
+                 "type": "image"
+             }
+
+             # Extract full resolution image URL if available
+             parent = img.parent
+             if parent and parent.get("href"):
+                 img_data["full_url"] = urljoin("https://www.google.com", parent["href"])
+
+             results.append(img_data)
+
+         return results
+
+     def search(
+         self,
+         query: str,
+         region: str = "us-en",
+         language: str = "en",
+         safe: str = "off",
+         time_period: Optional[str] = None,
+         max_results: int = 10,
+         extract_text: bool = False,
+         max_text_length: Optional[int] = 100,
+         site: Optional[str] = None,  # Search within specific site
+         file_type: Optional[str] = None,  # Filter by file type
+         sort_by: str = "relevance",  # relevance, date
+         exclude_terms: Optional[List[str]] = None,  # Terms to exclude
+         exact_phrase: Optional[str] = None,  # Exact phrase match
+     ) -> List[Dict[str, Union[str, int]]]:
+         """
+         Enhanced search with additional filters and options.
+
+         Args:
+             site: Limit search to specific website
+             file_type: Filter by file type (pdf, doc, etc.)
+             sort_by: Sort results by relevance or date
+             exclude_terms: List of terms to exclude from search
+             exact_phrase: Exact phrase to match
+         """
+         if self.use_litlogger:
+             self.logger.info(f"Starting search for: {query}")
+
+         # Build advanced query
+         advanced_query = query
+         if site:
+             advanced_query += f" site:{site}"
+         if file_type:
+             advanced_query += f" filetype:{file_type}"
+         if exclude_terms:
+             advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
+         if exact_phrase:
+             advanced_query = f'"{exact_phrase}"' + advanced_query
+
+         if self.use_litlogger:
+             self.logger.debug(f"Advanced query: {advanced_query}")
+
+         # Check cache first
+         cache_key = self._cache_key(advanced_query, region=region, language=language,
+                                     safe=safe, time_period=time_period, sort_by=sort_by)
+         cached_results = self._get_cached_results(cache_key)
+         if cached_results:
+             return cached_results[:max_results]
+
+         # Continue with regular search implementation...
+         results = []
+         futures = []
+         start = 0
+
+         while len(results) < max_results:
+             params = {
+                 "q": advanced_query,
+                 "num": 10,
+                 "hl": language,
+                 "start": start,
+                 "safe": safe,
+                 "gl": region,
+             }
+             if time_period:
+                 params["tbs"] = f"qdr:{time_period}"
+
+             futures.append(self._executor.submit(self._get_url, "GET", self.SEARCH_TYPES["web"], params=params))
+             start += 10
+
+             for future in as_completed(futures):
+                 try:
+                     resp_content = future.result()
+                     soup = BeautifulSoup(resp_content, 'lxml')  # Use lxml parser
+                     result_blocks = soup.find_all("div", class_="g")
+
+                     if not result_blocks:
+                         break
+
+                     # Extract links and titles first
+                     for result_block in result_blocks:
+                         link = result_block.find("a", href=True)
+                         title = result_block.find("h3")
+                         description_box = result_block.find(
+                             "div", {"style": "-webkit-line-clamp:2"}
+                         )
+
+                         if link and title and description_box:
+                             url = link["href"]
+                             results.append({
+                                 "title": title.text,
+                                 "href": url,
+                                 "abstract": description_box.text,
+                                 "index": len(results),
+                                 "type": "web",
+                                 "visible_text": ""  # Initialize visible_text as empty string
+                             })
+
+                             if len(results) >= max_results:
+                                 break  # Stop if we have enough results
+
+                     # Parallelize text extraction if needed
+                     if extract_text:
+                         with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
+                             extraction_futures = [
+                                 text_extractor.submit(self._extract_text_from_webpage,
+                                                       self._get_url("GET", result['href']),
+                                                       max_characters=max_text_length)
+                                 for result in results
+                                 if 'href' in result
+                             ]
+                             for i, future in enumerate(as_completed(extraction_futures)):
+                                 try:
+                                     results[i]['visible_text'] = future.result()
+                                 except Exception as e:
+                                     print(f"Error extracting text: {e}")
+
+                 except Exception as e:
+                     print(f"Error: {e}")
+
+         # Cache results before returning
+         self._cache_results(cache_key, results)
+         return results
+
+     def get_search_suggestions(self, query: str) -> List[str]:
+         """Get search suggestions for a query"""
+         params = {
+             "client": "chrome",
+             "q": query
+         }
+         content = self._get_url("GET", "https://suggestqueries.google.com/complete/search",
+                                 params=params)
+         suggestions = json.loads(content.decode('utf-8'))[1]
+         return suggestions
+
+     def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
+         """
+         Extracts visible text from HTML content using lxml parser.
+         """
+         soup = BeautifulSoup(html_content, 'lxml')  # Use lxml parser
+         for tag in soup(["script", "style", "header", "footer", "nav"]):
+             tag.extract()
+         visible_text = soup.get_text(strip=True)
+         if max_characters:
+             visible_text = visible_text[:max_characters]
+         return visible_text
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.client.close()
+         self._executor.shutdown()
+
+
+ if __name__ == "__main__":
+     from rich import print
+     searcher = GoogleS()
+     results = searcher.search("HelpingAI-9B", max_results=200, extract_text=False, max_text_length=200)
+     for result in results:
          print(result)
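
The rewritten DWEBS.py above adds caching, rate limiting, retry, advanced query filters, image search, and query suggestions to GoogleS. A minimal usage sketch based only on the signatures shown in this diff (the cache directory and queries are arbitrary examples; behaviour against live Google endpoints is not verified here):

    from webscout.DWEBS import GoogleS

    # cache_dir, rate_limit and use_litlogger are new constructor options in 6.4.
    searcher = GoogleS(cache_dir="./search_cache", rate_limit=0.5)

    # New filter parameters on search(): site, file_type, exclude_terms, exact_phrase, sort_by.
    results = searcher.search(
        "Python tutorials",
        site="github.com",
        file_type="pdf",
        exclude_terms=["beginner"],
        max_results=5,
    )
    for result in results:
        print(result["title"], result["href"])

    # Image search and query suggestions are also new in this release.
    images = searcher.search_images("landscapes", size="large", max_results=3)
    suggestions = searcher.get_search_suggestions("python web scraping")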
webscout/Extra/__init__.py CHANGED
@@ -1,4 +1,5 @@
  from .gguf import *
  from .autollama import *
  from .weather import *
- from .weather_ascii import *
+ from .weather_ascii import *
+ from .autocoder import *
webscout/Extra/autocoder/__init__.py ADDED
@@ -0,0 +1,9 @@
+ """
+ AutoCoder Module - Part of Webscout
+ Provides automated code generation and manipulation capabilities.
+ """
+
+ from .rawdog import *
+ from .autocoder_utiles import *
+
+ # __all__ = [] # Add your public module names here
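
The new autocoder package only re-exports whatever .rawdog and .autocoder_utiles define; the exported names themselves are not visible in this diff. A hedged inspection sketch rather than a usage example:

    # The diff shows only star re-exports, so list what the package actually exposes at runtime.
    import webscout.Extra.autocoder as autocoder

    print([name for name in dir(autocoder) if not name.startswith("_")])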