webscout-7.0-py3-none-any.whl → webscout-7.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of webscout might be problematic.

Files changed (147)
  1. webscout/AIauto.py +191 -191
  2. webscout/AIbase.py +122 -122
  3. webscout/AIutel.py +440 -440
  4. webscout/Bard.py +343 -161
  5. webscout/DWEBS.py +489 -492
  6. webscout/Extra/YTToolkit/YTdownloader.py +995 -995
  7. webscout/Extra/YTToolkit/__init__.py +2 -2
  8. webscout/Extra/YTToolkit/transcriber.py +476 -479
  9. webscout/Extra/YTToolkit/ytapi/channel.py +307 -307
  10. webscout/Extra/YTToolkit/ytapi/playlist.py +58 -58
  11. webscout/Extra/YTToolkit/ytapi/pool.py +7 -7
  12. webscout/Extra/YTToolkit/ytapi/utils.py +62 -62
  13. webscout/Extra/YTToolkit/ytapi/video.py +103 -103
  14. webscout/Extra/autocoder/__init__.py +9 -9
  15. webscout/Extra/autocoder/autocoder_utiles.py +199 -199
  16. webscout/Extra/autocoder/rawdog.py +5 -7
  17. webscout/Extra/autollama.py +230 -230
  18. webscout/Extra/gguf.py +3 -3
  19. webscout/Extra/weather.py +171 -171
  20. webscout/LLM.py +442 -442
  21. webscout/Litlogger/__init__.py +67 -681
  22. webscout/Litlogger/core/__init__.py +6 -0
  23. webscout/Litlogger/core/level.py +20 -0
  24. webscout/Litlogger/core/logger.py +123 -0
  25. webscout/Litlogger/handlers/__init__.py +12 -0
  26. webscout/Litlogger/handlers/console.py +50 -0
  27. webscout/Litlogger/handlers/file.py +143 -0
  28. webscout/Litlogger/handlers/network.py +174 -0
  29. webscout/Litlogger/styles/__init__.py +7 -0
  30. webscout/Litlogger/styles/colors.py +231 -0
  31. webscout/Litlogger/styles/formats.py +377 -0
  32. webscout/Litlogger/styles/text.py +87 -0
  33. webscout/Litlogger/utils/__init__.py +6 -0
  34. webscout/Litlogger/utils/detectors.py +154 -0
  35. webscout/Litlogger/utils/formatters.py +200 -0
  36. webscout/Provider/AISEARCH/DeepFind.py +250 -250
  37. webscout/Provider/Blackboxai.py +136 -137
  38. webscout/Provider/ChatGPTGratis.py +226 -0
  39. webscout/Provider/Cloudflare.py +91 -78
  40. webscout/Provider/DeepSeek.py +218 -0
  41. webscout/Provider/Deepinfra.py +59 -35
  42. webscout/Provider/Free2GPT.py +131 -124
  43. webscout/Provider/Gemini.py +100 -115
  44. webscout/Provider/Glider.py +74 -59
  45. webscout/Provider/Groq.py +30 -18
  46. webscout/Provider/Jadve.py +108 -77
  47. webscout/Provider/Llama3.py +117 -94
  48. webscout/Provider/Marcus.py +191 -137
  49. webscout/Provider/Netwrck.py +62 -50
  50. webscout/Provider/PI.py +79 -124
  51. webscout/Provider/PizzaGPT.py +129 -83
  52. webscout/Provider/QwenLM.py +311 -0
  53. webscout/Provider/TTI/AiForce/__init__.py +22 -22
  54. webscout/Provider/TTI/AiForce/async_aiforce.py +257 -257
  55. webscout/Provider/TTI/AiForce/sync_aiforce.py +242 -242
  56. webscout/Provider/TTI/Nexra/__init__.py +22 -22
  57. webscout/Provider/TTI/Nexra/async_nexra.py +286 -286
  58. webscout/Provider/TTI/Nexra/sync_nexra.py +258 -258
  59. webscout/Provider/TTI/PollinationsAI/__init__.py +23 -23
  60. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +330 -330
  61. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +285 -285
  62. webscout/Provider/TTI/artbit/__init__.py +22 -22
  63. webscout/Provider/TTI/artbit/async_artbit.py +184 -184
  64. webscout/Provider/TTI/artbit/sync_artbit.py +176 -176
  65. webscout/Provider/TTI/blackbox/__init__.py +4 -4
  66. webscout/Provider/TTI/blackbox/async_blackbox.py +212 -212
  67. webscout/Provider/TTI/blackbox/sync_blackbox.py +199 -199
  68. webscout/Provider/TTI/deepinfra/__init__.py +4 -4
  69. webscout/Provider/TTI/deepinfra/async_deepinfra.py +227 -227
  70. webscout/Provider/TTI/deepinfra/sync_deepinfra.py +199 -199
  71. webscout/Provider/TTI/huggingface/__init__.py +22 -22
  72. webscout/Provider/TTI/huggingface/async_huggingface.py +199 -199
  73. webscout/Provider/TTI/huggingface/sync_huggingface.py +195 -195
  74. webscout/Provider/TTI/imgninza/__init__.py +4 -4
  75. webscout/Provider/TTI/imgninza/async_ninza.py +214 -214
  76. webscout/Provider/TTI/imgninza/sync_ninza.py +209 -209
  77. webscout/Provider/TTI/talkai/__init__.py +4 -4
  78. webscout/Provider/TTI/talkai/async_talkai.py +229 -229
  79. webscout/Provider/TTI/talkai/sync_talkai.py +207 -207
  80. webscout/Provider/TTS/deepgram.py +182 -182
  81. webscout/Provider/TTS/elevenlabs.py +136 -136
  82. webscout/Provider/TTS/gesserit.py +150 -150
  83. webscout/Provider/TTS/murfai.py +138 -138
  84. webscout/Provider/TTS/parler.py +133 -134
  85. webscout/Provider/TTS/streamElements.py +360 -360
  86. webscout/Provider/TTS/utils.py +280 -280
  87. webscout/Provider/TTS/voicepod.py +116 -116
  88. webscout/Provider/TextPollinationsAI.py +74 -47
  89. webscout/Provider/WiseCat.py +193 -0
  90. webscout/Provider/__init__.py +144 -136
  91. webscout/Provider/cerebras.py +242 -227
  92. webscout/Provider/chatglm.py +204 -204
  93. webscout/Provider/dgaf.py +67 -39
  94. webscout/Provider/gaurish.py +105 -66
  95. webscout/Provider/geminiapi.py +208 -208
  96. webscout/Provider/granite.py +223 -0
  97. webscout/Provider/hermes.py +218 -218
  98. webscout/Provider/llama3mitril.py +179 -179
  99. webscout/Provider/llamatutor.py +72 -62
  100. webscout/Provider/llmchat.py +60 -35
  101. webscout/Provider/meta.py +794 -794
  102. webscout/Provider/multichat.py +331 -230
  103. webscout/Provider/typegpt.py +359 -356
  104. webscout/Provider/yep.py +5 -5
  105. webscout/__main__.py +5 -5
  106. webscout/cli.py +319 -319
  107. webscout/conversation.py +241 -242
  108. webscout/exceptions.py +328 -328
  109. webscout/litagent/__init__.py +28 -28
  110. webscout/litagent/agent.py +2 -3
  111. webscout/litprinter/__init__.py +0 -58
  112. webscout/scout/__init__.py +8 -8
  113. webscout/scout/core.py +884 -884
  114. webscout/scout/element.py +459 -459
  115. webscout/scout/parsers/__init__.py +69 -69
  116. webscout/scout/parsers/html5lib_parser.py +172 -172
  117. webscout/scout/parsers/html_parser.py +236 -236
  118. webscout/scout/parsers/lxml_parser.py +178 -178
  119. webscout/scout/utils.py +38 -38
  120. webscout/swiftcli/__init__.py +811 -811
  121. webscout/update_checker.py +2 -12
  122. webscout/version.py +1 -1
  123. webscout/webscout_search.py +1142 -1140
  124. webscout/webscout_search_async.py +635 -635
  125. webscout/zeroart/__init__.py +54 -54
  126. webscout/zeroart/base.py +60 -60
  127. webscout/zeroart/effects.py +99 -99
  128. webscout/zeroart/fonts.py +816 -816
  129. {webscout-7.0.dist-info → webscout-7.2.dist-info}/METADATA +21 -28
  130. webscout-7.2.dist-info/RECORD +217 -0
  131. webstoken/__init__.py +30 -30
  132. webstoken/classifier.py +189 -189
  133. webstoken/keywords.py +216 -216
  134. webstoken/language.py +128 -128
  135. webstoken/ner.py +164 -164
  136. webstoken/normalizer.py +35 -35
  137. webstoken/processor.py +77 -77
  138. webstoken/sentiment.py +206 -206
  139. webstoken/stemmer.py +73 -73
  140. webstoken/tagger.py +60 -60
  141. webstoken/tokenizer.py +158 -158
  142. webscout/Provider/RUBIKSAI.py +0 -272
  143. webscout-7.0.dist-info/RECORD +0 -199
  144. {webscout-7.0.dist-info → webscout-7.2.dist-info}/LICENSE.md +0 -0
  145. {webscout-7.0.dist-info → webscout-7.2.dist-info}/WHEEL +0 -0
  146. {webscout-7.0.dist-info → webscout-7.2.dist-info}/entry_points.txt +0 -0
  147. {webscout-7.0.dist-info → webscout-7.2.dist-info}/top_level.txt +0 -0
webscout/DWEBS.py CHANGED
@@ -1,493 +1,490 @@
1
- import requests
2
- from typing import Dict, List, Optional, Union, Any
3
- from concurrent.futures import ThreadPoolExecutor, as_completed
4
- from webscout.scout import Scout
5
- from urllib.parse import quote, urljoin
6
- from webscout.litagent import LitAgent
7
-
8
- import time
9
- import random
10
- import json
11
- import os
12
- from datetime import datetime, timedelta
13
- from functools import lru_cache
14
- from webscout.Litlogger import LitLogger, LogFormat, ColorScheme
15
-
16
- class GoogleS:
17
- """A Python interface for Google search with advanced features
18
-
19
- The GoogleS class provides a powerful interface to perform web searches, image searches,
20
- and advanced filtering on Google. Built with love by HAI to keep it
21
-
22
- Basic Usage:
23
- >>> from webscout.DWEBS import GoogleS
24
- >>> searcher = GoogleS()
25
- >>> # Simple web search
26
- >>> results = searcher.search("Python programming")
27
- >>> for result in results:
28
- ... print(f"Title: {result['title']}")
29
- ... print(f"URL: {result['href']}")
30
- ... print(f"Description: {result['abstract']}")
31
-
32
- Advanced Web Search:
33
- >>> # Search with filters
34
- >>> results = searcher.search(
35
- ... query="Python tutorials",
36
- ... site="github.com",
37
- ... file_type="pdf",
38
- ... time_period="month",
39
- ... max_results=5
40
- ... )
41
- >>> # Example response format:
42
- >>> {
43
- ... 'title': 'Python Tutorial',
44
- ... 'href': 'https://example.com/python-tutorial',
45
- ... 'abstract': 'Comprehensive Python tutorial covering basics to advanced topics',
46
- ... 'index': 0,
47
- ... 'type': 'web',
48
- ... 'visible_text': '' # Optional: Contains webpage text if extract_text=True
49
- ... }
50
-
51
- Image Search:
52
- >>> # Search for images
53
- >>> images = searcher.search_images(
54
- ... query="cute puppies",
55
- ... size="large",
56
- ... color="color",
57
- ... type_filter="photo",
58
- ... max_results=5
59
- ... )
60
- >>> # Example response format:
61
- >>> {
62
- ... 'title': 'Cute Puppy Image',
63
- ... 'thumbnail': 'https://example.com/puppy-thumb.jpg',
64
- ... 'full_url': 'https://example.com/puppy-full.jpg',
65
- ... 'type': 'image'
66
- ... }
67
-
68
- Features:
69
- - Web Search: Get detailed web results with title, URL, and description
70
- - Image Search: Find images with thumbnails and full-resolution URLs
71
- - Advanced Filters: Site-specific search, file types, time periods
72
- - Rate Limiting: Smart request handling to avoid blocks
73
- - Caching: Save results for faster repeat searches
74
- - Retry Logic: Automatic retry on temporary failures
75
- - Logging: Optional LitLogger integration for beautiful console output
76
- - Proxy Support: Use custom proxies for requests
77
- - Concurrent Processing: Multi-threaded requests for better performance
78
-
79
- Response Format:
80
- Web Search Results:
81
- {
82
- 'title': str, # Title of the webpage
83
- 'href': str, # URL of the webpage
84
- 'abstract': str, # Brief description or snippet
85
- 'index': int, # Result position
86
- 'type': 'web', # Result type identifier
87
- 'visible_text': str # Full page text (if extract_text=True)
88
- }
89
-
90
- Image Search Results:
91
- {
92
- 'title': str, # Image title or description
93
- 'thumbnail': str, # Thumbnail image URL
94
- 'full_url': str, # Full resolution image URL
95
- 'type': 'image' # Result type identifier
96
- }
97
- """
98
-
99
- SEARCH_TYPES = {
100
- "web": "https://www.google.com/search",
101
- "image": "https://www.google.com/images",
102
- "news": "https://www.google.com/news",
103
- }
104
-
105
- def __init__(
106
- self,
107
- headers: Optional[Dict[str, str]] = None,
108
- proxy: Optional[str] = None,
109
- timeout: Optional[int] = 10,
110
- max_workers: int = 20,
111
- cache_dir: Optional[str] = None,
112
- rate_limit: float = 2.0,
113
- use_litlogger: bool = False
114
- ):
115
- """
116
- Initialize the GoogleS object with enhanced features.
117
-
118
- Args:
119
- cache_dir: Directory to store search result cache
120
- rate_limit: Minimum time between requests in seconds
121
- use_litlogger: Whether to use LitLogger for logging (default: False)
122
- """
123
- self.proxy = proxy
124
- self.headers = headers if headers else {
125
- "User-Agent": LitAgent().random() # Use LitAgent to generate user agent
126
- }
127
- self.headers["Referer"] = "https://www.google.com/"
128
- self.client = requests.Session()
129
- self.client.headers.update(self.headers)
130
- if proxy:
131
- self.client.proxies.update({"http": proxy, "https": proxy})
132
- self.timeout = timeout
133
- self._executor = ThreadPoolExecutor(max_workers=max_workers)
134
- self.cache_dir = cache_dir
135
- if cache_dir and not os.path.exists(cache_dir):
136
- os.makedirs(cache_dir)
137
- self.last_request_time = 0
138
- self.rate_limit = rate_limit
139
- self.use_litlogger = use_litlogger
140
-
141
- # Setup enhanced logging with LitLogger if enabled
142
- if self.use_litlogger:
143
- self.logger = LitLogger(
144
- name="GoogleS",
145
- format=LogFormat.MODERN_EMOJI,
146
- color_scheme=ColorScheme.CYBERPUNK,
147
- console_output=True
148
- )
149
-
150
- def _respect_rate_limit(self):
151
- """Ensure minimum time between requests"""
152
- current_time = time.time()
153
- time_since_last = current_time - self.last_request_time
154
- if time_since_last < self.rate_limit:
155
- sleep_time = self.rate_limit - time_since_last
156
- if self.use_litlogger:
157
- self.logger.debug(f"Rate limiting: Waiting {sleep_time:.2f} seconds")
158
- time.sleep(sleep_time)
159
- self.last_request_time = time.time()
160
-
161
- def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
162
- data: Optional[Union[Dict[str, str], bytes]] = None, max_retries: int = 3) -> bytes:
163
- """
164
- Makes an HTTP request with manual retry logic and rate limiting.
165
-
166
- Args:
167
- method (str): HTTP method (GET, POST, etc.)
168
- url (str): Target URL
169
- params (Optional[Dict[str, str]]): Query parameters
170
- data (Optional[Union[Dict[str, str], bytes]]): Request payload
171
- max_retries (int): Maximum number of retry attempts
172
-
173
- Returns:
174
- bytes: Response content
175
- """
176
- retry_count = 0
177
- base_delay = 5 # Base delay in seconds
178
-
179
- while retry_count < max_retries:
180
- try:
181
- self._respect_rate_limit()
182
- response = self.client.request(
183
- method=method,
184
- url=url,
185
- params=params,
186
- data=data,
187
- timeout=self.timeout
188
- )
189
-
190
- if response.status_code == 429:
191
- retry_delay = base_delay * (2 ** retry_count) # Exponential backoff
192
- if self.use_litlogger:
193
- self.logger.warning(f"Rate limited by Google. Waiting {retry_delay} seconds before retry...")
194
- time.sleep(retry_delay)
195
- retry_count += 1
196
- continue
197
-
198
- response.raise_for_status()
199
- return response.content
200
-
201
- except requests.exceptions.RequestException as e:
202
- if retry_count == max_retries - 1:
203
- if self.use_litlogger:
204
- self.logger.error(f"Max retries reached. Last error: {str(e)}")
205
- raise
206
-
207
- retry_delay = base_delay * (2 ** retry_count)
208
- if self.use_litlogger:
209
- self.logger.warning(f"Request failed. Retrying in {retry_delay} seconds... Error: {str(e)}")
210
- time.sleep(retry_delay)
211
- retry_count += 1
212
-
213
- raise Exception("Max retries reached")
214
-
215
- @lru_cache(maxsize=100)
216
- def _cache_key(self, query: str, **kwargs) -> str:
217
- """Generate a cache key from search parameters"""
218
- cache_data = {'query': query, **kwargs}
219
- return json.dumps(cache_data, sort_keys=True)
220
-
221
- def _get_cached_results(self, cache_key: str) -> Optional[List[Dict[str, Any]]]:
222
- """Retrieve cached results if they exist and are not expired"""
223
- if not self.cache_dir:
224
- return None
225
- cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
226
- if os.path.exists(cache_file):
227
- with open(cache_file, 'r') as f:
228
- cached_data = json.load(f)
229
- if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
230
- if self.use_litlogger:
231
- self.logger.info(f"Using cached results for: {cache_key}")
232
- return cached_data['results']
233
- if self.use_litlogger:
234
- self.logger.debug(f"No valid cache found for: {cache_key}")
235
- return None
236
-
237
- def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
238
- """Cache search results"""
239
- if not self.cache_dir:
240
- return
241
- cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
242
- with open(cache_file, 'w') as f:
243
- json.dump({
244
- 'timestamp': datetime.now().isoformat(),
245
- 'results': results
246
- }, f)
247
-
248
- def search_images(
249
- self,
250
- query: str,
251
- max_results: int = 10,
252
- size: Optional[str] = None,
253
- color: Optional[str] = None,
254
- type_filter: Optional[str] = None,
255
- **kwargs
256
- ) -> List[Dict[str, str]]:
257
- """Search for images on Google with style!
258
-
259
- Args:
260
- query (str): What you're looking for fam
261
- max_results (int): How many results you want (default: 10)
262
- size (Optional[str]): Image size filter
263
- - 'large': Big pics
264
- - 'medium': Medium sized
265
- - 'icon': Small icons
266
- color (Optional[str]): Color filter
267
- - 'color': Full color
268
- - 'gray': Black and white
269
- - 'transparent': Transparent background
270
- type_filter (Optional[str]): Type of image
271
- - 'face': Just faces
272
- - 'photo': Real photos
273
- - 'clipart': Vector art
274
- - 'lineart': Line drawings
275
-
276
- Returns:
277
- List[Dict[str, str]]: List of image results with these keys:
278
- - 'thumbnail': Small preview URL
279
- - 'full_url': Full resolution image URL
280
- - 'title': Image title/description
281
- - 'type': Always 'image'
282
-
283
- Example:
284
- >>> searcher = GoogleS()
285
- >>> # Find some cool nature pics
286
- >>> images = searcher.search_images(
287
- ... query="beautiful landscapes",
288
- ... size="large",
289
- ... color="color",
290
- ... max_results=5
291
- ... )
292
- >>> for img in images:
293
- ... print(f"Found: {img['title']}")
294
- ... print(f"URL: {img['full_url']}")
295
- """
296
- params = {
297
- "q": query,
298
- "tbm": "isch",
299
- "num": max_results
300
- }
301
-
302
- if size:
303
- params["tbs"] = f"isz:{size}"
304
- if color:
305
- params["tbs"] = f"ic:{color}"
306
- if type_filter:
307
- params["tbs"] = f"itp:{type_filter}"
308
-
309
- content = self._get_url("GET", self.SEARCH_TYPES["image"], params=params)
310
- soup = Scout(content) # Use Scout parser
311
-
312
- results = []
313
- for img in soup.find_all("img", class_="rg_i"):
314
- if len(results) >= max_results:
315
- break
316
-
317
- img_data = {
318
- "thumbnail": img.get("src", ""),
319
- "title": img.get("alt", ""),
320
- "type": "image"
321
- }
322
-
323
- # Extract full resolution image URL if available
324
- parent = img.parent
325
- if parent and parent.get("href"):
326
- img_data["full_url"] = urljoin("https://www.google.com", parent["href"])
327
-
328
- results.append(img_data)
329
-
330
- return results
331
-
332
- def search(
333
- self,
334
- query: str,
335
- region: str = "us-en",
336
- language: str = "en",
337
- safe: str = "off",
338
- time_period: Optional[str] = None,
339
- max_results: int = 10,
340
- extract_text: bool = False,
341
- max_text_length: Optional[int] = 100,
342
- site: Optional[str] = None, # Search within specific site
343
- file_type: Optional[str] = None, # Filter by file type
344
- sort_by: str = "relevance", # relevance, date
345
- exclude_terms: Optional[List[str]] = None, # Terms to exclude
346
- exact_phrase: Optional[str] = None, # Exact phrase match
347
- ) -> List[Dict[str, Union[str, int]]]:
348
- """
349
- Enhanced search with additional filters and options.
350
-
351
- Args:
352
- site: Limit search to specific website
353
- file_type: Filter by file type (pdf, doc, etc.)
354
- sort_by: Sort results by relevance or date
355
- exclude_terms: List of terms to exclude from search
356
- exact_phrase: Exact phrase to match
357
- """
358
- if self.use_litlogger:
359
- self.logger.info(f"Starting search for: {query}")
360
-
361
- # Build advanced query
362
- advanced_query = query
363
- if site:
364
- advanced_query += f" site:{site}"
365
- if file_type:
366
- advanced_query += f" filetype:{file_type}"
367
- if exclude_terms:
368
- advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
369
- if exact_phrase:
370
- advanced_query = f'"{exact_phrase}"' + advanced_query
371
-
372
- if self.use_litlogger:
373
- self.logger.debug(f"Advanced query: {advanced_query}")
374
-
375
- # Check cache first
376
- cache_key = self._cache_key(advanced_query, region=region, language=language,
377
- safe=safe, time_period=time_period, sort_by=sort_by)
378
- cached_results = self._get_cached_results(cache_key)
379
- if cached_results:
380
- return cached_results[:max_results]
381
-
382
- # Continue with regular search implementation...
383
- results = []
384
- futures = []
385
- start = 0
386
-
387
- while len(results) < max_results:
388
- params = {
389
- "q": advanced_query,
390
- "num": 10,
391
- "hl": language,
392
- "start": start,
393
- "safe": safe,
394
- "gl": region,
395
- }
396
- if time_period:
397
- params["tbs"] = f"qdr:{time_period}"
398
-
399
- futures.append(self._executor.submit(self._get_url, "GET", self.SEARCH_TYPES["web"], params=params))
400
- start += 10
401
-
402
- for future in as_completed(futures):
403
- try:
404
- resp_content = future.result()
405
- soup = Scout(resp_content) # Use Scout parser
406
-
407
- result_blocks = soup.find_all("div", class_="g")
408
-
409
- if not result_blocks:
410
- break
411
-
412
- # Extract links and titles first
413
- for result_block in result_blocks:
414
- link = result_block.find("a", href=True)
415
- title = result_block.find("h3")
416
- description_box = result_block.find(
417
- "div", {"style": "-webkit-line-clamp:2"}
418
- )
419
-
420
- if link and title and description_box:
421
- url = link["href"]
422
- results.append({
423
- "title": title.text,
424
- "href": url,
425
- "abstract": description_box.text,
426
- "index": len(results),
427
- "type": "web",
428
- "visible_text": "" # Initialize visible_text as empty string
429
- })
430
-
431
- if len(results) >= max_results:
432
- break # Stop if we have enough results
433
-
434
- # Parallelize text extraction if needed
435
- if extract_text:
436
- with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
437
- extraction_futures = [
438
- text_extractor.submit(self._extract_text_from_webpage,
439
- self._get_url("GET", result['href']),
440
- max_characters=max_text_length)
441
- for result in results
442
- if 'href' in result
443
- ]
444
- for i, future in enumerate(as_completed(extraction_futures)):
445
- try:
446
- results[i]['visible_text'] = future.result()
447
- except Exception as e:
448
- print(f"Error extracting text: {e}")
449
-
450
- except Exception as e:
451
- print(f"Error: {e}")
452
-
453
- # Cache results before returning
454
- self._cache_results(cache_key, results)
455
- return results
456
-
457
- def get_search_suggestions(self, query: str) -> List[str]:
458
- """Get search suggestions for a query"""
459
- params = {
460
- "client": "chrome",
461
- "q": query
462
- }
463
- content = self._get_url("GET", "https://suggestqueries.google.com/complete/search",
464
- params=params)
465
- suggestions = json.loads(content.decode('utf-8'))[1]
466
- return suggestions
467
-
468
- def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
469
- """
470
- Extracts visible text from HTML content using Scout parser.
471
- """
472
- soup = Scout(html_content) # Use Scout parser
473
- for tag in soup(["script", "style", "header", "footer", "nav"]):
474
- tag.extract()
475
- visible_text = soup.get_text(strip=True)
476
- if max_characters:
477
- visible_text = visible_text[:max_characters]
478
- return visible_text
479
-
480
- def __enter__(self):
481
- return self
482
-
483
- def __exit__(self, exc_type, exc_val, exc_tb):
484
- self.client.close()
485
- self._executor.shutdown()
486
-
487
-
488
- if __name__ == "__main__":
489
- from rich import print
490
- searcher = GoogleS(rate_limit=3.0, use_litlogger=True)
491
- results = searcher.search("HelpingAI-9B", max_results=5, extract_text=False, max_text_length=200)
492
- for result in results:
1
+ import requests
2
+ from typing import Dict, List, Optional, Union, Any
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ from webscout.scout import Scout
5
+ from urllib.parse import quote, urljoin
6
+ from webscout.litagent import LitAgent
7
+
8
+ import time
9
+ import random
10
+ import json
11
+ import os
12
+ from datetime import datetime, timedelta
13
+ from functools import lru_cache
14
+ from webscout.Litlogger import Logger, LogFormat
15
+ class GoogleS:
16
+ """A Python interface for Google search with advanced features
17
+
18
+ The GoogleS class provides a powerful interface to perform web searches, image searches,
19
+ and advanced filtering on Google. Built with love by HAI to keep it
20
+
21
+ Basic Usage:
22
+ >>> from webscout.DWEBS import GoogleS
23
+ >>> searcher = GoogleS()
24
+ >>> # Simple web search
25
+ >>> results = searcher.search("Python programming")
26
+ >>> for result in results:
27
+ ... print(f"Title: {result['title']}")
28
+ ... print(f"URL: {result['href']}")
29
+ ... print(f"Description: {result['abstract']}")
30
+
31
+ Advanced Web Search:
32
+ >>> # Search with filters
33
+ >>> results = searcher.search(
34
+ ... query="Python tutorials",
35
+ ... site="github.com",
36
+ ... file_type="pdf",
37
+ ... time_period="month",
38
+ ... max_results=5
39
+ ... )
40
+ >>> # Example response format:
41
+ >>> {
42
+ ... 'title': 'Python Tutorial',
43
+ ... 'href': 'https://example.com/python-tutorial',
44
+ ... 'abstract': 'Comprehensive Python tutorial covering basics to advanced topics',
45
+ ... 'index': 0,
46
+ ... 'type': 'web',
47
+ ... 'visible_text': '' # Optional: Contains webpage text if extract_text=True
48
+ ... }
49
+
50
+ Image Search:
51
+ >>> # Search for images
52
+ >>> images = searcher.search_images(
53
+ ... query="cute puppies",
54
+ ... size="large",
55
+ ... color="color",
56
+ ... type_filter="photo",
57
+ ... max_results=5
58
+ ... )
59
+ >>> # Example response format:
60
+ >>> {
61
+ ... 'title': 'Cute Puppy Image',
62
+ ... 'thumbnail': 'https://example.com/puppy-thumb.jpg',
63
+ ... 'full_url': 'https://example.com/puppy-full.jpg',
64
+ ... 'type': 'image'
65
+ ... }
66
+
67
+ Features:
68
+ - Web Search: Get detailed web results with title, URL, and description
69
+ - Image Search: Find images with thumbnails and full-resolution URLs
70
+ - Advanced Filters: Site-specific search, file types, time periods
71
+ - Rate Limiting: Smart request handling to avoid blocks
72
+ - Caching: Save results for faster repeat searches
73
+ - Retry Logic: Automatic retry on temporary failures
74
+ - Logging: Optional LitLogger integration for beautiful console output
75
+ - Proxy Support: Use custom proxies for requests
76
+ - Concurrent Processing: Multi-threaded requests for better performance
77
+
78
+ Response Format:
79
+ Web Search Results:
80
+ {
81
+ 'title': str, # Title of the webpage
82
+ 'href': str, # URL of the webpage
83
+ 'abstract': str, # Brief description or snippet
84
+ 'index': int, # Result position
85
+ 'type': 'web', # Result type identifier
86
+ 'visible_text': str # Full page text (if extract_text=True)
87
+ }
88
+
89
+ Image Search Results:
90
+ {
91
+ 'title': str, # Image title or description
92
+ 'thumbnail': str, # Thumbnail image URL
93
+ 'full_url': str, # Full resolution image URL
94
+ 'type': 'image' # Result type identifier
95
+ }
96
+ """
97
+
98
+ SEARCH_TYPES = {
99
+ "web": "https://www.google.com/search",
100
+ "image": "https://www.google.com/images",
101
+ "news": "https://www.google.com/news",
102
+ }
103
+
104
+ def __init__(
105
+ self,
106
+ headers: Optional[Dict[str, str]] = None,
107
+ proxy: Optional[str] = None,
108
+ timeout: Optional[int] = 10,
109
+ max_workers: int = 20,
110
+ cache_dir: Optional[str] = None,
111
+ rate_limit: float = 2.0,
112
+ use_litlogger: bool = False
113
+ ):
114
+ """
115
+ Initialize the GoogleS object with enhanced features.
116
+
117
+ Args:
118
+ cache_dir: Directory to store search result cache
119
+ rate_limit: Minimum time between requests in seconds
120
+ use_litlogger: Whether to use LitLogger for logging (default: False)
121
+ """
122
+ self.proxy = proxy
123
+ self.headers = headers if headers else {
124
+ "User-Agent": LitAgent().random() # Use LitAgent to generate user agent
125
+ }
126
+ self.headers["Referer"] = "https://www.google.com/"
127
+ self.client = requests.Session()
128
+ self.client.headers.update(self.headers)
129
+ if proxy:
130
+ self.client.proxies.update({"http": proxy, "https": proxy})
131
+ self.timeout = timeout
132
+ self._executor = ThreadPoolExecutor(max_workers=max_workers)
133
+ self.cache_dir = cache_dir
134
+ if cache_dir and not os.path.exists(cache_dir):
135
+ os.makedirs(cache_dir)
136
+ self.last_request_time = 0
137
+ self.rate_limit = rate_limit
138
+ self.use_litlogger = use_litlogger
139
+
140
+ # Setup enhanced logging with LitLogger if enabled
141
+ if self.use_litlogger:
142
+ self.logger = Logger(
143
+ name="GoogleS",
144
+ format=LogFormat.MODERN_EMOJI,
145
+ )
146
+
147
+ def _respect_rate_limit(self):
148
+ """Ensure minimum time between requests"""
149
+ current_time = time.time()
150
+ time_since_last = current_time - self.last_request_time
151
+ if time_since_last < self.rate_limit:
152
+ sleep_time = self.rate_limit - time_since_last
153
+ if self.use_litlogger:
154
+ self.logger.debug(f"Rate limiting: Waiting {sleep_time:.2f} seconds")
155
+ time.sleep(sleep_time)
156
+ self.last_request_time = time.time()
157
+
158
+ def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
159
+ data: Optional[Union[Dict[str, str], bytes]] = None, max_retries: int = 3) -> bytes:
160
+ """
161
+ Makes an HTTP request with manual retry logic and rate limiting.
162
+
163
+ Args:
164
+ method (str): HTTP method (GET, POST, etc.)
165
+ url (str): Target URL
166
+ params (Optional[Dict[str, str]]): Query parameters
167
+ data (Optional[Union[Dict[str, str], bytes]]): Request payload
168
+ max_retries (int): Maximum number of retry attempts
169
+
170
+ Returns:
171
+ bytes: Response content
172
+ """
173
+ retry_count = 0
174
+ base_delay = 5 # Base delay in seconds
175
+
176
+ while retry_count < max_retries:
177
+ try:
178
+ self._respect_rate_limit()
179
+ response = self.client.request(
180
+ method=method,
181
+ url=url,
182
+ params=params,
183
+ data=data,
184
+ timeout=self.timeout
185
+ )
186
+
187
+ if response.status_code == 429:
188
+ retry_delay = base_delay * (2 ** retry_count) # Exponential backoff
189
+ if self.use_litlogger:
190
+ self.logger.warning(f"Rate limited by Google. Waiting {retry_delay} seconds before retry...")
191
+ time.sleep(retry_delay)
192
+ retry_count += 1
193
+ continue
194
+
195
+ response.raise_for_status()
196
+ return response.content
197
+
198
+ except requests.exceptions.RequestException as e:
199
+ if retry_count == max_retries - 1:
200
+ if self.use_litlogger:
201
+ self.logger.error(f"Max retries reached. Last error: {str(e)}")
202
+ raise
203
+
204
+ retry_delay = base_delay * (2 ** retry_count)
205
+ if self.use_litlogger:
206
+ self.logger.warning(f"Request failed. Retrying in {retry_delay} seconds... Error: {str(e)}")
207
+ time.sleep(retry_delay)
208
+ retry_count += 1
209
+
210
+ raise Exception("Max retries reached")
211
+
212
+ @lru_cache(maxsize=100)
213
+ def _cache_key(self, query: str, **kwargs) -> str:
214
+ """Generate a cache key from search parameters"""
215
+ cache_data = {'query': query, **kwargs}
216
+ return json.dumps(cache_data, sort_keys=True)
217
+
218
+ def _get_cached_results(self, cache_key: str) -> Optional[List[Dict[str, Any]]]:
219
+ """Retrieve cached results if they exist and are not expired"""
220
+ if not self.cache_dir:
221
+ return None
222
+ cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
223
+ if os.path.exists(cache_file):
224
+ with open(cache_file, 'r') as f:
225
+ cached_data = json.load(f)
226
+ if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
227
+ if self.use_litlogger:
228
+ self.logger.info(f"Using cached results for: {cache_key}")
229
+ return cached_data['results']
230
+ if self.use_litlogger:
231
+ self.logger.debug(f"No valid cache found for: {cache_key}")
232
+ return None
233
+
234
+ def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
235
+ """Cache search results"""
236
+ if not self.cache_dir:
237
+ return
238
+ cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
239
+ with open(cache_file, 'w') as f:
240
+ json.dump({
241
+ 'timestamp': datetime.now().isoformat(),
242
+ 'results': results
243
+ }, f)
244
+
245
+ def search_images(
246
+ self,
247
+ query: str,
248
+ max_results: int = 10,
249
+ size: Optional[str] = None,
250
+ color: Optional[str] = None,
251
+ type_filter: Optional[str] = None,
252
+ **kwargs
253
+ ) -> List[Dict[str, str]]:
254
+ """Search for images on Google with style!
255
+
256
+ Args:
257
+ query (str): What you're looking for fam
258
+ max_results (int): How many results you want (default: 10)
259
+ size (Optional[str]): Image size filter
260
+ - 'large': Big pics
261
+ - 'medium': Medium sized
262
+ - 'icon': Small icons
263
+ color (Optional[str]): Color filter
264
+ - 'color': Full color
265
+ - 'gray': Black and white
266
+ - 'transparent': Transparent background
267
+ type_filter (Optional[str]): Type of image
268
+ - 'face': Just faces
269
+ - 'photo': Real photos
270
+ - 'clipart': Vector art
271
+ - 'lineart': Line drawings
272
+
273
+ Returns:
274
+ List[Dict[str, str]]: List of image results with these keys:
275
+ - 'thumbnail': Small preview URL
276
+ - 'full_url': Full resolution image URL
277
+ - 'title': Image title/description
278
+ - 'type': Always 'image'
279
+
280
+ Example:
281
+ >>> searcher = GoogleS()
282
+ >>> # Find some cool nature pics
283
+ >>> images = searcher.search_images(
284
+ ... query="beautiful landscapes",
285
+ ... size="large",
286
+ ... color="color",
287
+ ... max_results=5
288
+ ... )
289
+ >>> for img in images:
290
+ ... print(f"Found: {img['title']}")
291
+ ... print(f"URL: {img['full_url']}")
292
+ """
293
+ params = {
294
+ "q": query,
295
+ "tbm": "isch",
296
+ "num": max_results
297
+ }
298
+
299
+ if size:
300
+ params["tbs"] = f"isz:{size}"
301
+ if color:
302
+ params["tbs"] = f"ic:{color}"
303
+ if type_filter:
304
+ params["tbs"] = f"itp:{type_filter}"
305
+
306
+ content = self._get_url("GET", self.SEARCH_TYPES["image"], params=params)
307
+ soup = Scout(content) # Use Scout parser
308
+
309
+ results = []
310
+ for img in soup.find_all("img", class_="rg_i"):
311
+ if len(results) >= max_results:
312
+ break
313
+
314
+ img_data = {
315
+ "thumbnail": img.get("src", ""),
316
+ "title": img.get("alt", ""),
317
+ "type": "image"
318
+ }
319
+
320
+ # Extract full resolution image URL if available
321
+ parent = img.parent
322
+ if parent and parent.get("href"):
323
+ img_data["full_url"] = urljoin("https://www.google.com", parent["href"])
324
+
325
+ results.append(img_data)
326
+
327
+ return results
328
+
329
+ def search(
330
+ self,
331
+ query: str,
332
+ region: str = "us-en",
333
+ language: str = "en",
334
+ safe: str = "off",
335
+ time_period: Optional[str] = None,
336
+ max_results: int = 10,
337
+ extract_text: bool = False,
338
+ max_text_length: Optional[int] = 100,
339
+ site: Optional[str] = None, # Search within specific site
340
+ file_type: Optional[str] = None, # Filter by file type
341
+ sort_by: str = "relevance", # relevance, date
342
+ exclude_terms: Optional[List[str]] = None, # Terms to exclude
343
+ exact_phrase: Optional[str] = None, # Exact phrase match
344
+ ) -> List[Dict[str, Union[str, int]]]:
345
+ """
346
+ Enhanced search with additional filters and options.
347
+
348
+ Args:
349
+ site: Limit search to specific website
350
+ file_type: Filter by file type (pdf, doc, etc.)
351
+ sort_by: Sort results by relevance or date
352
+ exclude_terms: List of terms to exclude from search
353
+ exact_phrase: Exact phrase to match
354
+ """
355
+ if self.use_litlogger:
356
+ self.logger.info(f"Starting search for: {query}")
357
+
358
+ # Build advanced query
359
+ advanced_query = query
360
+ if site:
361
+ advanced_query += f" site:{site}"
362
+ if file_type:
363
+ advanced_query += f" filetype:{file_type}"
364
+ if exclude_terms:
365
+ advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
366
+ if exact_phrase:
367
+ advanced_query = f'"{exact_phrase}"' + advanced_query
368
+
369
+ if self.use_litlogger:
370
+ self.logger.debug(f"Advanced query: {advanced_query}")
371
+
372
+ # Check cache first
373
+ cache_key = self._cache_key(advanced_query, region=region, language=language,
374
+ safe=safe, time_period=time_period, sort_by=sort_by)
375
+ cached_results = self._get_cached_results(cache_key)
376
+ if cached_results:
377
+ return cached_results[:max_results]
378
+
379
+ # Continue with regular search implementation...
380
+ results = []
381
+ futures = []
382
+ start = 0
383
+
384
+ while len(results) < max_results:
385
+ params = {
386
+ "q": advanced_query,
387
+ "num": 10,
388
+ "hl": language,
389
+ "start": start,
390
+ "safe": safe,
391
+ "gl": region,
392
+ }
393
+ if time_period:
394
+ params["tbs"] = f"qdr:{time_period}"
395
+
396
+ futures.append(self._executor.submit(self._get_url, "GET", self.SEARCH_TYPES["web"], params=params))
397
+ start += 10
398
+
399
+ for future in as_completed(futures):
400
+ try:
401
+ resp_content = future.result()
402
+ soup = Scout(resp_content) # Use Scout parser
403
+
404
+ result_blocks = soup.find_all("div", class_="g")
405
+
406
+ if not result_blocks:
407
+ break
408
+
409
+ # Extract links and titles first
410
+ for result_block in result_blocks:
411
+ link = result_block.find("a", href=True)
412
+ title = result_block.find("h3")
413
+ description_box = result_block.find(
414
+ "div", {"style": "-webkit-line-clamp:2"}
415
+ )
416
+
417
+ if link and title and description_box:
418
+ url = link["href"]
419
+ results.append({
420
+ "title": title.text,
421
+ "href": url,
422
+ "abstract": description_box.text,
423
+ "index": len(results),
424
+ "type": "web",
425
+ "visible_text": "" # Initialize visible_text as empty string
426
+ })
427
+
428
+ if len(results) >= max_results:
429
+ break # Stop if we have enough results
430
+
431
+ # Parallelize text extraction if needed
432
+ if extract_text:
433
+ with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
434
+ extraction_futures = [
435
+ text_extractor.submit(self._extract_text_from_webpage,
436
+ self._get_url("GET", result['href']),
437
+ max_characters=max_text_length)
438
+ for result in results
439
+ if 'href' in result
440
+ ]
441
+ for i, future in enumerate(as_completed(extraction_futures)):
442
+ try:
443
+ results[i]['visible_text'] = future.result()
444
+ except Exception as e:
445
+ print(f"Error extracting text: {e}")
446
+
447
+ except Exception as e:
448
+ print(f"Error: {e}")
449
+
450
+ # Cache results before returning
451
+ self._cache_results(cache_key, results)
452
+ return results
453
+
454
+ def get_search_suggestions(self, query: str) -> List[str]:
455
+ """Get search suggestions for a query"""
456
+ params = {
457
+ "client": "chrome",
458
+ "q": query
459
+ }
460
+ content = self._get_url("GET", "https://suggestqueries.google.com/complete/search",
461
+ params=params)
462
+ suggestions = json.loads(content.decode('utf-8'))[1]
463
+ return suggestions
464
+
465
+ def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
466
+ """
467
+ Extracts visible text from HTML content using Scout parser.
468
+ """
469
+ soup = Scout(html_content) # Use Scout parser
470
+ for tag in soup(["script", "style", "header", "footer", "nav"]):
471
+ tag.extract()
472
+ visible_text = soup.get_text(strip=True)
473
+ if max_characters:
474
+ visible_text = visible_text[:max_characters]
475
+ return visible_text
476
+
477
+ def __enter__(self):
478
+ return self
479
+
480
+ def __exit__(self, exc_type, exc_val, exc_tb):
481
+ self.client.close()
482
+ self._executor.shutdown()
483
+
484
+
485
+ if __name__ == "__main__":
486
+ from rich import print
487
+ searcher = GoogleS(rate_limit=3.0, use_litlogger=True)
488
+ results = searcher.search("HelpingAI-9B", max_results=5, extract_text=False, max_text_length=200)
489
+ for result in results:
493
490
  print(result)
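
Setting aside a removed blank line, the substantive change in webscout/DWEBS.py is the Litlogger refactor: 7.0 imported LitLogger, LogFormat and ColorScheme and passed color_scheme and console_output to the constructor, while 7.2 imports Logger and LogFormat and drops those arguments. A minimal before/after sketch using only the names that appear in the diff above (the final info() call is illustrative, mirroring the self.logger.info(...) calls inside search()):

# webscout 7.0 (removed lines above)
from webscout.Litlogger import LitLogger, LogFormat, ColorScheme

logger = LitLogger(
    name="GoogleS",
    format=LogFormat.MODERN_EMOJI,
    color_scheme=ColorScheme.CYBERPUNK,
    console_output=True,
)

# webscout 7.2 (added lines above): color_scheme and console_output are gone
from webscout.Litlogger import Logger, LogFormat

logger = Logger(
    name="GoogleS",
    format=LogFormat.MODERN_EMOJI,
)
logger.info("Starting search for: HelpingAI-9B")  # mirrors the logging call in GoogleS.search()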