webscout-7.5-py3-none-any.whl → webscout-7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of webscout might be problematic.

Files changed (118)
  1. webscout/AIauto.py +5 -53
  2. webscout/AIutel.py +8 -318
  3. webscout/DWEBS.py +460 -489
  4. webscout/Extra/YTToolkit/YTdownloader.py +14 -53
  5. webscout/Extra/YTToolkit/transcriber.py +12 -13
  6. webscout/Extra/YTToolkit/ytapi/video.py +0 -1
  7. webscout/Extra/__init__.py +0 -1
  8. webscout/Extra/autocoder/autocoder_utiles.py +0 -4
  9. webscout/Extra/autocoder/rawdog.py +13 -41
  10. webscout/Extra/gguf.py +652 -428
  11. webscout/Extra/weather.py +178 -156
  12. webscout/Extra/weather_ascii.py +70 -17
  13. webscout/Litlogger/core/logger.py +1 -2
  14. webscout/Litlogger/handlers/file.py +1 -1
  15. webscout/Litlogger/styles/formats.py +0 -2
  16. webscout/Litlogger/utils/detectors.py +0 -1
  17. webscout/Provider/AISEARCH/DeepFind.py +0 -1
  18. webscout/Provider/AISEARCH/ISou.py +1 -1
  19. webscout/Provider/AISEARCH/felo_search.py +0 -1
  20. webscout/Provider/AllenAI.py +24 -9
  21. webscout/Provider/C4ai.py +29 -11
  22. webscout/Provider/ChatGPTGratis.py +24 -56
  23. webscout/Provider/DeepSeek.py +25 -17
  24. webscout/Provider/Deepinfra.py +115 -48
  25. webscout/Provider/Gemini.py +1 -1
  26. webscout/Provider/Glider.py +25 -8
  27. webscout/Provider/HF_space/qwen_qwen2.py +2 -2
  28. webscout/Provider/HeckAI.py +23 -7
  29. webscout/Provider/Jadve.py +20 -5
  30. webscout/Provider/Netwrck.py +42 -19
  31. webscout/Provider/PI.py +4 -2
  32. webscout/Provider/Perplexitylabs.py +26 -6
  33. webscout/Provider/PizzaGPT.py +10 -51
  34. webscout/Provider/TTI/AiForce/async_aiforce.py +4 -37
  35. webscout/Provider/TTI/AiForce/sync_aiforce.py +41 -38
  36. webscout/Provider/TTI/FreeAIPlayground/__init__.py +9 -9
  37. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +206 -206
  38. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +192 -192
  39. webscout/Provider/TTI/MagicStudio/__init__.py +2 -0
  40. webscout/Provider/TTI/MagicStudio/async_magicstudio.py +111 -0
  41. webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +109 -0
  42. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +5 -24
  43. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +2 -22
  44. webscout/Provider/TTI/__init__.py +2 -3
  45. webscout/Provider/TTI/aiarta/async_aiarta.py +14 -14
  46. webscout/Provider/TTI/aiarta/sync_aiarta.py +52 -21
  47. webscout/Provider/TTI/fastflux/__init__.py +22 -0
  48. webscout/Provider/TTI/fastflux/async_fastflux.py +257 -0
  49. webscout/Provider/TTI/fastflux/sync_fastflux.py +247 -0
  50. webscout/Provider/TTS/__init__.py +2 -2
  51. webscout/Provider/TTS/deepgram.py +12 -39
  52. webscout/Provider/TTS/elevenlabs.py +14 -40
  53. webscout/Provider/TTS/gesserit.py +11 -35
  54. webscout/Provider/TTS/murfai.py +13 -39
  55. webscout/Provider/TTS/parler.py +17 -40
  56. webscout/Provider/TTS/speechma.py +180 -0
  57. webscout/Provider/TTS/streamElements.py +17 -44
  58. webscout/Provider/TextPollinationsAI.py +39 -59
  59. webscout/Provider/Venice.py +25 -8
  60. webscout/Provider/WiseCat.py +27 -5
  61. webscout/Provider/Youchat.py +64 -37
  62. webscout/Provider/__init__.py +0 -6
  63. webscout/Provider/akashgpt.py +20 -5
  64. webscout/Provider/flowith.py +20 -5
  65. webscout/Provider/freeaichat.py +32 -45
  66. webscout/Provider/koala.py +20 -5
  67. webscout/Provider/llamatutor.py +1 -1
  68. webscout/Provider/llmchat.py +30 -8
  69. webscout/Provider/multichat.py +65 -9
  70. webscout/Provider/talkai.py +1 -0
  71. webscout/Provider/turboseek.py +3 -0
  72. webscout/Provider/tutorai.py +2 -0
  73. webscout/Provider/typegpt.py +154 -64
  74. webscout/Provider/x0gpt.py +3 -1
  75. webscout/Provider/yep.py +102 -20
  76. webscout/__init__.py +3 -0
  77. webscout/cli.py +4 -40
  78. webscout/conversation.py +1 -10
  79. webscout/litagent/__init__.py +2 -2
  80. webscout/litagent/agent.py +351 -20
  81. webscout/litagent/constants.py +34 -5
  82. webscout/litprinter/__init__.py +0 -3
  83. webscout/models.py +181 -0
  84. webscout/optimizers.py +1 -1
  85. webscout/prompt_manager.py +2 -8
  86. webscout/scout/core/scout.py +1 -4
  87. webscout/scout/core/search_result.py +1 -1
  88. webscout/scout/core/text_utils.py +1 -1
  89. webscout/scout/core.py +2 -5
  90. webscout/scout/element.py +1 -1
  91. webscout/scout/parsers/html_parser.py +1 -1
  92. webscout/scout/utils.py +0 -1
  93. webscout/swiftcli/__init__.py +1 -3
  94. webscout/tempid.py +1 -1
  95. webscout/update_checker.py +1 -3
  96. webscout/version.py +1 -1
  97. webscout/webscout_search_async.py +1 -2
  98. webscout/yep_search.py +297 -297
  99. {webscout-7.5.dist-info → webscout-7.6.dist-info}/LICENSE.md +4 -4
  100. {webscout-7.5.dist-info → webscout-7.6.dist-info}/METADATA +101 -390
  101. {webscout-7.5.dist-info → webscout-7.6.dist-info}/RECORD +104 -110
  102. webscout/Extra/autollama.py +0 -231
  103. webscout/Provider/Amigo.py +0 -274
  104. webscout/Provider/Bing.py +0 -243
  105. webscout/Provider/DiscordRocks.py +0 -253
  106. webscout/Provider/TTI/blackbox/__init__.py +0 -4
  107. webscout/Provider/TTI/blackbox/async_blackbox.py +0 -212
  108. webscout/Provider/TTI/blackbox/sync_blackbox.py +0 -199
  109. webscout/Provider/TTI/deepinfra/__init__.py +0 -4
  110. webscout/Provider/TTI/deepinfra/async_deepinfra.py +0 -227
  111. webscout/Provider/TTI/deepinfra/sync_deepinfra.py +0 -199
  112. webscout/Provider/TTI/imgninza/__init__.py +0 -4
  113. webscout/Provider/TTI/imgninza/async_ninza.py +0 -214
  114. webscout/Provider/TTI/imgninza/sync_ninza.py +0 -209
  115. webscout/Provider/TTS/voicepod.py +0 -117
  116. {webscout-7.5.dist-info → webscout-7.6.dist-info}/WHEEL +0 -0
  117. {webscout-7.5.dist-info → webscout-7.6.dist-info}/entry_points.txt +0 -0
  118. {webscout-7.5.dist-info → webscout-7.6.dist-info}/top_level.txt +0 -0
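Both wheels named above are public artifacts, so any entry in this list can be checked locally. The following is a minimal sketch (not part of the release itself) that downloads the two wheels with pip and prints a unified diff of webscout/DWEBS.py using only the standard library; the wheel filenames and member path are taken from this page and may need adjusting if they change on the registry.

# Hypothetical helper for reproducing a per-file diff such as the DWEBS.py one below.
# Assumes pip can reach the registry and that both wheels are still published there.
import difflib
import subprocess
import sys
import zipfile

def read_member(wheel_path: str, member: str) -> list:
    """Return one file from a wheel as a list of text lines."""
    with zipfile.ZipFile(wheel_path) as wheel:
        return wheel.read(member).decode("utf-8", errors="replace").splitlines(keepends=True)

# Fetch both published wheels into the current directory.
for version in ("7.5", "7.6"):
    subprocess.run(
        [sys.executable, "-m", "pip", "download", f"webscout=={version}",
         "--no-deps", "--only-binary=:all:", "-d", "."],
        check=True,
    )

old = read_member("webscout-7.5-py3-none-any.whl", "webscout/DWEBS.py")
new = read_member("webscout-7.6-py3-none-any.whl", "webscout/DWEBS.py")
print("".join(difflib.unified_diff(old, new, "7.5/webscout/DWEBS.py", "7.6/webscout/DWEBS.py")))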
webscout/DWEBS.py CHANGED
@@ -1,490 +1,461 @@
- import requests
- from typing import Dict, List, Optional, Union, Any
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from webscout.scout import Scout
- from urllib.parse import quote, urljoin
- from webscout.litagent import LitAgent
-
- import time
- import random
- import json
- import os
- from datetime import datetime, timedelta
- from functools import lru_cache
- from webscout.Litlogger import Logger, LogFormat
- class GoogleS:
- """A Python interface for Google search with advanced features
-
- The GoogleS class provides a powerful interface to perform web searches, image searches,
- and advanced filtering on Google. Built with love by HAI to keep it
-
- Basic Usage:
- >>> from webscout.DWEBS import GoogleS
- >>> searcher = GoogleS()
- >>> # Simple web search
- >>> results = searcher.search("Python programming")
- >>> for result in results:
- ... print(f"Title: {result['title']}")
- ... print(f"URL: {result['href']}")
- ... print(f"Description: {result['abstract']}")
-
- Advanced Web Search:
- >>> # Search with filters
- >>> results = searcher.search(
- ... query="Python tutorials",
- ... site="github.com",
- ... file_type="pdf",
- ... time_period="month",
- ... max_results=5
- ... )
- >>> # Example response format:
- >>> {
- ... 'title': 'Python Tutorial',
- ... 'href': 'https://example.com/python-tutorial',
- ... 'abstract': 'Comprehensive Python tutorial covering basics to advanced topics',
- ... 'index': 0,
- ... 'type': 'web',
- ... 'visible_text': '' # Optional: Contains webpage text if extract_text=True
- ... }
-
- Image Search:
- >>> # Search for images
- >>> images = searcher.search_images(
- ... query="cute puppies",
- ... size="large",
- ... color="color",
- ... type_filter="photo",
- ... max_results=5
- ... )
- >>> # Example response format:
- >>> {
- ... 'title': 'Cute Puppy Image',
- ... 'thumbnail': 'https://example.com/puppy-thumb.jpg',
- ... 'full_url': 'https://example.com/puppy-full.jpg',
- ... 'type': 'image'
- ... }
-
- Features:
- - Web Search: Get detailed web results with title, URL, and description
- - Image Search: Find images with thumbnails and full-resolution URLs
- - Advanced Filters: Site-specific search, file types, time periods
- - Rate Limiting: Smart request handling to avoid blocks
- - Caching: Save results for faster repeat searches
- - Retry Logic: Automatic retry on temporary failures
- - Logging: Optional LitLogger integration for beautiful console output
- - Proxy Support: Use custom proxies for requests
- - Concurrent Processing: Multi-threaded requests for better performance
-
- Response Format:
- Web Search Results:
- {
- 'title': str, # Title of the webpage
- 'href': str, # URL of the webpage
- 'abstract': str, # Brief description or snippet
- 'index': int, # Result position
- 'type': 'web', # Result type identifier
- 'visible_text': str # Full page text (if extract_text=True)
- }
-
- Image Search Results:
- {
- 'title': str, # Image title or description
- 'thumbnail': str, # Thumbnail image URL
- 'full_url': str, # Full resolution image URL
- 'type': 'image' # Result type identifier
- }
- """
-
- SEARCH_TYPES = {
- "web": "https://www.google.com/search",
- "image": "https://www.google.com/images",
- "news": "https://www.google.com/news",
- }
-
- def __init__(
- self,
- headers: Optional[Dict[str, str]] = None,
- proxy: Optional[str] = None,
- timeout: Optional[int] = 10,
- max_workers: int = 20,
- cache_dir: Optional[str] = None,
- rate_limit: float = 2.0,
- use_litlogger: bool = False
- ):
- """
- Initialize the GoogleS object with enhanced features.
-
- Args:
- cache_dir: Directory to store search result cache
- rate_limit: Minimum time between requests in seconds
- use_litlogger: Whether to use LitLogger for logging (default: False)
- """
- self.proxy = proxy
- self.headers = headers if headers else {
- "User-Agent": LitAgent().random() # Use LitAgent to generate user agent
- }
- self.headers["Referer"] = "https://www.google.com/"
- self.client = requests.Session()
- self.client.headers.update(self.headers)
- if proxy:
- self.client.proxies.update({"http": proxy, "https": proxy})
- self.timeout = timeout
- self._executor = ThreadPoolExecutor(max_workers=max_workers)
- self.cache_dir = cache_dir
- if cache_dir and not os.path.exists(cache_dir):
- os.makedirs(cache_dir)
- self.last_request_time = 0
- self.rate_limit = rate_limit
- self.use_litlogger = use_litlogger
-
- # Setup enhanced logging with LitLogger if enabled
- if self.use_litlogger:
- self.logger = Logger(
- name="GoogleS",
- format=LogFormat.MODERN_EMOJI,
- )
-
- def _respect_rate_limit(self):
- """Ensure minimum time between requests"""
- current_time = time.time()
- time_since_last = current_time - self.last_request_time
- if time_since_last < self.rate_limit:
- sleep_time = self.rate_limit - time_since_last
- if self.use_litlogger:
- self.logger.debug(f"Rate limiting: Waiting {sleep_time:.2f} seconds")
- time.sleep(sleep_time)
- self.last_request_time = time.time()
-
- def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
- data: Optional[Union[Dict[str, str], bytes]] = None, max_retries: int = 3) -> bytes:
- """
- Makes an HTTP request with manual retry logic and rate limiting.
-
- Args:
- method (str): HTTP method (GET, POST, etc.)
- url (str): Target URL
- params (Optional[Dict[str, str]]): Query parameters
- data (Optional[Union[Dict[str, str], bytes]]): Request payload
- max_retries (int): Maximum number of retry attempts
-
- Returns:
- bytes: Response content
- """
- retry_count = 0
- base_delay = 5 # Base delay in seconds
-
- while retry_count < max_retries:
- try:
- self._respect_rate_limit()
- response = self.client.request(
- method=method,
- url=url,
- params=params,
- data=data,
- timeout=self.timeout
- )
-
- if response.status_code == 429:
- retry_delay = base_delay * (2 ** retry_count) # Exponential backoff
- if self.use_litlogger:
- self.logger.warning(f"Rate limited by Google. Waiting {retry_delay} seconds before retry...")
- time.sleep(retry_delay)
- retry_count += 1
- continue
-
- response.raise_for_status()
- return response.content
-
- except requests.exceptions.RequestException as e:
- if retry_count == max_retries - 1:
- if self.use_litlogger:
- self.logger.error(f"Max retries reached. Last error: {str(e)}")
- raise
-
- retry_delay = base_delay * (2 ** retry_count)
- if self.use_litlogger:
- self.logger.warning(f"Request failed. Retrying in {retry_delay} seconds... Error: {str(e)}")
- time.sleep(retry_delay)
- retry_count += 1
-
- raise Exception("Max retries reached")
-
- @lru_cache(maxsize=100)
- def _cache_key(self, query: str, **kwargs) -> str:
- """Generate a cache key from search parameters"""
- cache_data = {'query': query, **kwargs}
- return json.dumps(cache_data, sort_keys=True)
-
- def _get_cached_results(self, cache_key: str) -> Optional[List[Dict[str, Any]]]:
- """Retrieve cached results if they exist and are not expired"""
- if not self.cache_dir:
- return None
- cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
- if os.path.exists(cache_file):
- with open(cache_file, 'r') as f:
- cached_data = json.load(f)
- if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
- if self.use_litlogger:
- self.logger.info(f"Using cached results for: {cache_key}")
- return cached_data['results']
- if self.use_litlogger:
- self.logger.debug(f"No valid cache found for: {cache_key}")
- return None
-
- def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
- """Cache search results"""
- if not self.cache_dir:
- return
- cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
- with open(cache_file, 'w') as f:
- json.dump({
- 'timestamp': datetime.now().isoformat(),
- 'results': results
- }, f)
-
- def search_images(
- self,
- query: str,
- max_results: int = 10,
- size: Optional[str] = None,
- color: Optional[str] = None,
- type_filter: Optional[str] = None,
- **kwargs
- ) -> List[Dict[str, str]]:
- """Search for images on Google with style!
-
- Args:
- query (str): What you're looking for fam
- max_results (int): How many results you want (default: 10)
- size (Optional[str]): Image size filter
- - 'large': Big pics
- - 'medium': Medium sized
- - 'icon': Small icons
- color (Optional[str]): Color filter
- - 'color': Full color
- - 'gray': Black and white
- - 'transparent': Transparent background
- type_filter (Optional[str]): Type of image
- - 'face': Just faces
- - 'photo': Real photos
- - 'clipart': Vector art
- - 'lineart': Line drawings
-
- Returns:
- List[Dict[str, str]]: List of image results with these keys:
- - 'thumbnail': Small preview URL
- - 'full_url': Full resolution image URL
- - 'title': Image title/description
- - 'type': Always 'image'
-
- Example:
- >>> searcher = GoogleS()
- >>> # Find some cool nature pics
- >>> images = searcher.search_images(
- ... query="beautiful landscapes",
- ... size="large",
- ... color="color",
- ... max_results=5
- ... )
- >>> for img in images:
- ... print(f"Found: {img['title']}")
- ... print(f"URL: {img['full_url']}")
- """
- params = {
- "q": query,
- "tbm": "isch",
- "num": max_results
- }
-
- if size:
- params["tbs"] = f"isz:{size}"
- if color:
- params["tbs"] = f"ic:{color}"
- if type_filter:
- params["tbs"] = f"itp:{type_filter}"
-
- content = self._get_url("GET", self.SEARCH_TYPES["image"], params=params)
- soup = Scout(content) # Use Scout parser
-
- results = []
- for img in soup.find_all("img", class_="rg_i"):
- if len(results) >= max_results:
- break
-
- img_data = {
- "thumbnail": img.get("src", ""),
- "title": img.get("alt", ""),
- "type": "image"
- }
-
- # Extract full resolution image URL if available
- parent = img.parent
- if parent and parent.get("href"):
- img_data["full_url"] = urljoin("https://www.google.com", parent["href"])
-
- results.append(img_data)
-
- return results
-
- def search(
- self,
- query: str,
- region: str = "us-en",
- language: str = "en",
- safe: str = "off",
- time_period: Optional[str] = None,
- max_results: int = 10,
- extract_text: bool = False,
- max_text_length: Optional[int] = 100,
- site: Optional[str] = None, # Search within specific site
- file_type: Optional[str] = None, # Filter by file type
- sort_by: str = "relevance", # relevance, date
- exclude_terms: Optional[List[str]] = None, # Terms to exclude
- exact_phrase: Optional[str] = None, # Exact phrase match
- ) -> List[Dict[str, Union[str, int]]]:
- """
- Enhanced search with additional filters and options.
-
- Args:
- site: Limit search to specific website
- file_type: Filter by file type (pdf, doc, etc.)
- sort_by: Sort results by relevance or date
- exclude_terms: List of terms to exclude from search
- exact_phrase: Exact phrase to match
- """
- if self.use_litlogger:
- self.logger.info(f"Starting search for: {query}")
-
- # Build advanced query
- advanced_query = query
- if site:
- advanced_query += f" site:{site}"
- if file_type:
- advanced_query += f" filetype:{file_type}"
- if exclude_terms:
- advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
- if exact_phrase:
- advanced_query = f'"{exact_phrase}"' + advanced_query
-
- if self.use_litlogger:
- self.logger.debug(f"Advanced query: {advanced_query}")
-
- # Check cache first
- cache_key = self._cache_key(advanced_query, region=region, language=language,
- safe=safe, time_period=time_period, sort_by=sort_by)
- cached_results = self._get_cached_results(cache_key)
- if cached_results:
- return cached_results[:max_results]
-
- # Continue with regular search implementation...
- results = []
- futures = []
- start = 0
-
- while len(results) < max_results:
- params = {
- "q": advanced_query,
- "num": 10,
- "hl": language,
- "start": start,
- "safe": safe,
- "gl": region,
- }
- if time_period:
- params["tbs"] = f"qdr:{time_period}"
-
- futures.append(self._executor.submit(self._get_url, "GET", self.SEARCH_TYPES["web"], params=params))
- start += 10
-
- for future in as_completed(futures):
- try:
- resp_content = future.result()
- soup = Scout(resp_content) # Use Scout parser
-
- result_blocks = soup.find_all("div", class_="g")
-
- if not result_blocks:
- break
-
- # Extract links and titles first
- for result_block in result_blocks:
- link = result_block.find("a", href=True)
- title = result_block.find("h3")
- description_box = result_block.find(
- "div", {"style": "-webkit-line-clamp:2"}
- )
-
- if link and title and description_box:
- url = link["href"]
- results.append({
- "title": title.text,
- "href": url,
- "abstract": description_box.text,
- "index": len(results),
- "type": "web",
- "visible_text": "" # Initialize visible_text as empty string
- })
-
- if len(results) >= max_results:
- break # Stop if we have enough results
-
- # Parallelize text extraction if needed
- if extract_text:
- with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
- extraction_futures = [
- text_extractor.submit(self._extract_text_from_webpage,
- self._get_url("GET", result['href']),
- max_characters=max_text_length)
- for result in results
- if 'href' in result
- ]
- for i, future in enumerate(as_completed(extraction_futures)):
- try:
- results[i]['visible_text'] = future.result()
- except Exception as e:
- print(f"Error extracting text: {e}")
-
- except Exception as e:
- print(f"Error: {e}")
-
- # Cache results before returning
- self._cache_results(cache_key, results)
- return results
-
- def get_search_suggestions(self, query: str) -> List[str]:
- """Get search suggestions for a query"""
- params = {
- "client": "chrome",
- "q": query
- }
- content = self._get_url("GET", "https://suggestqueries.google.com/complete/search",
- params=params)
- suggestions = json.loads(content.decode('utf-8'))[1]
- return suggestions
-
- def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
- """
- Extracts visible text from HTML content using Scout parser.
- """
- soup = Scout(html_content) # Use Scout parser
- for tag in soup(["script", "style", "header", "footer", "nav"]):
- tag.extract()
- visible_text = soup.get_text(strip=True)
- if max_characters:
- visible_text = visible_text[:max_characters]
- return visible_text
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- self.client.close()
- self._executor.shutdown()
-
-
- if __name__ == "__main__":
- from rich import print
- searcher = GoogleS(rate_limit=3.0, use_litlogger=True)
- results = searcher.search("HelpingAI-9B", max_results=5, extract_text=False, max_text_length=200)
- for result in results:
+ import requests
+ from typing import Dict, List, Optional, Union, Any
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from webscout.scout import Scout
+ from urllib.parse import urljoin
+ from webscout.litagent import LitAgent
+
+ import time
+ import json
+ import os
+ from datetime import datetime, timedelta
+ from functools import lru_cache
+ from webscout.Litlogger import Logger, LogFormat
+ class GoogleS:
+ """A Python interface for Google search with advanced features
+
+ The GoogleS class provides a powerful interface to perform web searches, image searches,
+ and advanced filtering on Google. Built with love by HAI to keep it
+
+ Basic Usage:
+ >>> from webscout.DWEBS import GoogleS
+ >>> searcher = GoogleS()
+ >>> # Simple web search
+ >>> results = searcher.search("Python programming")
+ >>> for result in results:
+ ... print(f"Title: {result['title']}")
+ ... print(f"URL: {result['href']}")
+ ... print(f"Description: {result['abstract']}")
+
+ Advanced Web Search:
+ >>> # Search with filters
+ >>> results = searcher.search(
+ ... query="Python tutorials",
+ ... site="github.com",
+ ... file_type="pdf",
+ ... time_period="month",
+ ... max_results=5
+ ... )
+ >>> # Example response format:
+ >>> {
+ ... 'title': 'Python Tutorial',
+ ... 'href': 'https://example.com/python-tutorial',
+ ... 'abstract': 'Comprehensive Python tutorial covering basics to advanced topics',
+ ... 'index': 0,
+ ... 'type': 'web',
+ ... 'visible_text': '' # Optional: Contains webpage text if extract_text=True
+ ... }
+
+ Image Search:
+ >>> # Search for images
+ >>> images = searcher.search_images(
+ ... query="cute puppies",
+ ... size="large",
+ ... color="color",
+ ... type_filter="photo",
+ ... max_results=5
+ ... )
+ >>> # Example response format:
+ >>> {
+ ... 'title': 'Cute Puppy Image',
+ ... 'thumbnail': 'https://example.com/puppy-thumb.jpg',
+ ... 'full_url': 'https://example.com/puppy-full.jpg',
+ ... 'type': 'image'
+ ... }
+
+ Features:
+ - Web Search: Get detailed web results with title, URL, and description
+ - Image Search: Find images with thumbnails and full-resolution URLs
+ - Advanced Filters: Site-specific search, file types, time periods
+ - Rate Limiting: Smart request handling to avoid blocks
+ - Caching: Save results for faster repeat searches
+ - Retry Logic: Automatic retry on temporary failures
+ - Logging: Optional LitLogger integration for beautiful console output
+ - Proxy Support: Use custom proxies for requests
+ - Concurrent Processing: Multi-threaded requests for better performance
+
+ Response Format:
+ Web Search Results:
+ {
+ 'title': str, # Title of the webpage
+ 'href': str, # URL of the webpage
+ 'abstract': str, # Brief description or snippet
+ 'index': int, # Result position
+ 'type': 'web', # Result type identifier
+ 'visible_text': str # Full page text (if extract_text=True)
+ }
+
+ Image Search Results:
+ {
+ 'title': str, # Image title or description
+ 'thumbnail': str, # Thumbnail image URL
+ 'full_url': str, # Full resolution image URL
+ 'type': 'image' # Result type identifier
+ }
+ """
+
+ SEARCH_TYPES = {
+ "web": "https://www.google.com/search",
+ "image": "https://www.google.com/images",
+ "news": "https://www.google.com/news",
+ }
+
+ def __init__(
+ self,
+ headers: Optional[Dict[str, str]] = None,
+ proxy: Optional[str] = None,
+ timeout: Optional[int] = 10,
+ max_workers: int = 20,
+ cache_dir: Optional[str] = None,
+ rate_limit: float = 2.0,
+ ):
+ """
+ Initialize the GoogleS object with enhanced features.
+
+ Args:
+ cache_dir: Directory to store search result cache
+ rate_limit: Minimum time between requests in seconds
+ """
+ self.proxy = proxy
+ self.headers = headers if headers else {
+ "User-Agent": LitAgent().random() # Use LitAgent to generate user agent
+ }
+ self.headers["Referer"] = "https://www.google.com/"
+ self.client = requests.Session()
+ self.client.headers.update(self.headers)
+ if proxy:
+ self.client.proxies.update({"http": proxy, "https": proxy})
+ self.timeout = timeout
+ self._executor = ThreadPoolExecutor(max_workers=max_workers)
+ self.cache_dir = cache_dir
+ if cache_dir and not os.path.exists(cache_dir):
+ os.makedirs(cache_dir)
+ self.last_request_time = 0
+ self.rate_limit = rate_limit
+
+ def _respect_rate_limit(self):
+ """Ensure minimum time between requests"""
+ current_time = time.time()
+ time_since_last = current_time - self.last_request_time
+ if time_since_last < self.rate_limit:
+ sleep_time = self.rate_limit - time_since_last
+ time.sleep(sleep_time)
+ self.last_request_time = time.time()
+
+ def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
+ data: Optional[Union[Dict[str, str], bytes]] = None, max_retries: int = 3) -> bytes:
+ """
+ Makes an HTTP request with manual retry logic and rate limiting.
+
+ Args:
+ method (str): HTTP method (GET, POST, etc.)
+ url (str): Target URL
+ params (Optional[Dict[str, str]]): Query parameters
+ data (Optional[Union[Dict[str, str], bytes]]): Request payload
+ max_retries (int): Maximum number of retry attempts
+
+ Returns:
+ bytes: Response content
+ """
+ retry_count = 0
+ base_delay = 5 # Base delay in seconds
+
+ while retry_count < max_retries:
+ try:
+ self._respect_rate_limit()
+ response = self.client.request(
+ method=method,
+ url=url,
+ params=params,
+ data=data,
+ timeout=self.timeout
+ )
+
+ if response.status_code == 429:
+ retry_delay = base_delay * (2 ** retry_count) # Exponential backoff
+ time.sleep(retry_delay)
+ retry_count += 1
+ continue
+
+ response.raise_for_status()
+ return response.content
+
+ except requests.exceptions.RequestException as e:
+ if retry_count == max_retries - 1:
+ raise
+
+ retry_delay = base_delay * (2 ** retry_count)
+ time.sleep(retry_delay)
+ retry_count += 1
+
+ raise Exception("Max retries reached")
+
+ @lru_cache(maxsize=100)
+ def _cache_key(self, query: str, **kwargs) -> str:
+ """Generate a cache key from search parameters"""
+ cache_data = {'query': query, **kwargs}
+ return json.dumps(cache_data, sort_keys=True)
+
+ def _get_cached_results(self, cache_key: str) -> Optional[List[Dict[str, Any]]]:
+ """Retrieve cached results if they exist and are not expired"""
+ if not self.cache_dir:
+ return None
+ cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
+ if os.path.exists(cache_file):
+ with open(cache_file, 'r') as f:
+ cached_data = json.load(f)
+ if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
+ return cached_data['results']
+ return None
+
+ def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
+ """Cache search results"""
+ if not self.cache_dir:
+ return
+ cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
+ with open(cache_file, 'w') as f:
+ json.dump({
+ 'timestamp': datetime.now().isoformat(),
+ 'results': results
+ }, f)
+
+ def search_images(
+ self,
+ query: str,
+ max_results: int = 10,
+ size: Optional[str] = None,
+ color: Optional[str] = None,
+ type_filter: Optional[str] = None,
+ **kwargs
+ ) -> List[Dict[str, str]]:
+ """Search for images on Google with style!
+
+ Args:
+ query (str): What you're looking for fam
+ max_results (int): How many results you want (default: 10)
+ size (Optional[str]): Image size filter
+ - 'large': Big pics
+ - 'medium': Medium sized
+ - 'icon': Small icons
+ color (Optional[str]): Color filter
+ - 'color': Full color
+ - 'gray': Black and white
+ - 'transparent': Transparent background
+ type_filter (Optional[str]): Type of image
+ - 'face': Just faces
+ - 'photo': Real photos
+ - 'clipart': Vector art
+ - 'lineart': Line drawings
+
+ Returns:
+ List[Dict[str, str]]: List of image results with these keys:
+ - 'thumbnail': Small preview URL
+ - 'full_url': Full resolution image URL
+ - 'title': Image title/description
+ - 'type': Always 'image'
+
+ Example:
+ >>> searcher = GoogleS()
+ >>> # Find some cool nature pics
+ >>> images = searcher.search_images(
+ ... query="beautiful landscapes",
+ ... size="large",
+ ... color="color",
+ ... max_results=5
+ ... )
+ >>> for img in images:
+ ... print(f"Found: {img['title']}")
+ ... print(f"URL: {img['full_url']}")
+ """
+ params = {
+ "q": query,
+ "tbm": "isch",
+ "num": max_results
+ }
+
+ if size:
+ params["tbs"] = f"isz:{size}"
+ if color:
+ params["tbs"] = f"ic:{color}"
+ if type_filter:
+ params["tbs"] = f"itp:{type_filter}"
+
+ content = self._get_url("GET", self.SEARCH_TYPES["image"], params=params)
+ soup = Scout(content) # Use Scout parser
+
+ results = []
+ for img in soup.find_all("img", class_="rg_i"):
+ if len(results) >= max_results:
+ break
+
+ img_data = {
+ "thumbnail": img.get("src", ""),
+ "title": img.get("alt", ""),
+ "type": "image"
+ }
+
+ # Extract full resolution image URL if available
+ parent = img.parent
+ if parent and parent.get("href"):
+ img_data["full_url"] = urljoin("https://www.google.com", parent["href"])
+
+ results.append(img_data)
+
+ return results
+
+ def search(
+ self,
+ query: str,
+ region: str = "us-en",
+ language: str = "en",
+ safe: str = "off",
+ time_period: Optional[str] = None,
+ max_results: int = 10,
+ extract_text: bool = False,
+ max_text_length: Optional[int] = 100,
+ site: Optional[str] = None, # Search within specific site
+ file_type: Optional[str] = None, # Filter by file type
+ sort_by: str = "relevance", # relevance, date
+ exclude_terms: Optional[List[str]] = None, # Terms to exclude
+ exact_phrase: Optional[str] = None, # Exact phrase match
+ ) -> List[Dict[str, Union[str, int]]]:
+ """
+ Enhanced search with additional filters and options.
+
+ Args:
+ site: Limit search to specific website
+ file_type: Filter by file type (pdf, doc, etc.)
+ sort_by: Sort results by relevance or date
+ exclude_terms: List of terms to exclude from search
+ exact_phrase: Exact phrase to match
+ """
+ # Build advanced query
+ advanced_query = query
+ if site:
+ advanced_query += f" site:{site}"
+ if file_type:
+ advanced_query += f" filetype:{file_type}"
+ if exclude_terms:
+ advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
+ if exact_phrase:
+ advanced_query = f'"{exact_phrase}"' + advanced_query
+
+ # Check cache first
+ cache_key = self._cache_key(advanced_query, region=region, language=language,
+ safe=safe, time_period=time_period, sort_by=sort_by)
+ cached_results = self._get_cached_results(cache_key)
+ if cached_results:
+ return cached_results[:max_results]
+
+ # Continue with regular search implementation...
+ results = []
+ futures = []
+ start = 0
+
+ while len(results) < max_results:
+ params = {
+ "q": advanced_query,
+ "num": 10,
+ "hl": language,
+ "start": start,
+ "safe": safe,
+ "gl": region,
+ }
+ if time_period:
+ params["tbs"] = f"qdr:{time_period}"
+
+ futures.append(self._executor.submit(self._get_url, "GET", self.SEARCH_TYPES["web"], params=params))
+ start += 10
+
+ for future in as_completed(futures):
+ try:
+ resp_content = future.result()
+ soup = Scout(resp_content) # Use Scout parser
+
+ result_blocks = soup.find_all("div", class_="g")
+
+ if not result_blocks:
+ break
+
+ # Extract links and titles first
+ for result_block in result_blocks:
+ link = result_block.find("a", href=True)
+ title = result_block.find("h3")
+ description_box = result_block.find(
+ "div", {"style": "-webkit-line-clamp:2"}
+ )
+
+ if link and title and description_box:
+ url = link["href"]
+ results.append({
+ "title": title.text,
+ "href": url,
+ "abstract": description_box.text,
+ "index": len(results),
+ "type": "web",
+ "visible_text": "" # Initialize visible_text as empty string
+ })
+
+ if len(results) >= max_results:
+ break # Stop if we have enough results
+
+ # Parallelize text extraction if needed
+ if extract_text:
+ with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
+ extraction_futures = [
+ text_extractor.submit(self._extract_text_from_webpage,
+ self._get_url("GET", result['href']),
+ max_characters=max_text_length)
+ for result in results
+ if 'href' in result
+ ]
+ for i, future in enumerate(as_completed(extraction_futures)):
+ try:
+ results[i]['visible_text'] = future.result()
+ except Exception as e:
+ print(f"Error extracting text: {e}")
+
+ except Exception as e:
+ print(f"Error: {e}")
+
+ # Cache results before returning
+ self._cache_results(cache_key, results)
+ return results
+
+ def get_search_suggestions(self, query: str) -> List[str]:
+ """Get search suggestions for a query"""
+ params = {
+ "client": "chrome",
+ "q": query
+ }
+ content = self._get_url("GET", "https://suggestqueries.google.com/complete/search",
+ params=params)
+ suggestions = json.loads(content.decode('utf-8'))[1]
+ return suggestions
+
+ def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
+ """
+ Extracts visible text from HTML content using Scout parser.
+ """
+ soup = Scout(html_content) # Use Scout parser
+ for tag in soup(["script", "style", "header", "footer", "nav"]):
+ tag.extract()
+ visible_text = soup.get_text(strip=True)
+ if max_characters:
+ visible_text = visible_text[:max_characters]
+ return visible_text
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.client.close()
+ self._executor.shutdown()
+
+
+ if __name__ == "__main__":
+ from rich import print
+ searcher = GoogleS(rate_limit=3.0)
+ results = searcher.search("HelpingAI-9B", max_results=5, extract_text=False, max_text_length=200)
+ for result in results:
  print(result)
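
For code that constructs GoogleS directly, the user-visible API change in this file is the removal of the use_litlogger keyword (and the LitLogger setup it guarded) from __init__; the quote import and import random are also dropped. The following is a minimal, hypothetical compatibility sketch based only on the two constructor signatures shown above; the helper name make_searcher is illustrative and not part of webscout.

# Hypothetical shim for callers moving from webscout 7.5 to 7.6:
# GoogleS() in 7.6 no longer accepts use_litlogger, per the constructor diff above.
from webscout.DWEBS import GoogleS

def make_searcher(rate_limit: float = 3.0) -> GoogleS:
    try:
        # 7.5-style call; raises TypeError on 7.6 because the keyword was removed.
        return GoogleS(rate_limit=rate_limit, use_litlogger=True)
    except TypeError:
        # 7.6-style call: logging is no longer configured through the constructor.
        return GoogleS(rate_limit=rate_limit)

searcher = make_searcher()
results = searcher.search("Python programming", max_results=3)
for result in results:
    print(result["title"], result["href"])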