webscout 7.7-py3-none-any.whl → 7.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic.

Files changed (134)
  1. webscout/AIutel.py +2 -1
  2. webscout/Bard.py +12 -29
  3. webscout/DWEBS.py +477 -461
  4. webscout/Extra/__init__.py +2 -0
  5. webscout/Extra/autocoder/__init__.py +9 -9
  6. webscout/Extra/autocoder/{rawdog.py → autocoder.py} +849 -790
  7. webscout/Extra/autocoder/autocoder_utiles.py +332 -194
  8. webscout/Extra/gguf.py +682 -682
  9. webscout/Extra/tempmail/__init__.py +26 -0
  10. webscout/Extra/tempmail/async_utils.py +141 -0
  11. webscout/Extra/tempmail/base.py +156 -0
  12. webscout/Extra/tempmail/cli.py +187 -0
  13. webscout/Extra/tempmail/mail_tm.py +361 -0
  14. webscout/Extra/tempmail/temp_mail_io.py +292 -0
  15. webscout/Provider/AI21.py +1 -1
  16. webscout/Provider/AISEARCH/DeepFind.py +2 -2
  17. webscout/Provider/AISEARCH/ISou.py +2 -2
  18. webscout/Provider/AISEARCH/felo_search.py +6 -6
  19. webscout/Provider/AISEARCH/genspark_search.py +1 -1
  20. webscout/Provider/Aitopia.py +292 -0
  21. webscout/Provider/AllenAI.py +1 -1
  22. webscout/Provider/Andi.py +3 -3
  23. webscout/Provider/C4ai.py +1 -1
  24. webscout/Provider/ChatGPTES.py +3 -5
  25. webscout/Provider/ChatGPTGratis.py +4 -4
  26. webscout/Provider/Chatify.py +2 -2
  27. webscout/Provider/Cloudflare.py +3 -2
  28. webscout/Provider/DeepSeek.py +2 -2
  29. webscout/Provider/Deepinfra.py +288 -286
  30. webscout/Provider/ElectronHub.py +709 -634
  31. webscout/Provider/ExaChat.py +325 -0
  32. webscout/Provider/Free2GPT.py +2 -2
  33. webscout/Provider/Gemini.py +167 -179
  34. webscout/Provider/GithubChat.py +1 -1
  35. webscout/Provider/Glider.py +4 -4
  36. webscout/Provider/Groq.py +41 -27
  37. webscout/Provider/HF_space/qwen_qwen2.py +1 -1
  38. webscout/Provider/HeckAI.py +1 -1
  39. webscout/Provider/HuggingFaceChat.py +1 -1
  40. webscout/Provider/Hunyuan.py +1 -1
  41. webscout/Provider/Jadve.py +3 -3
  42. webscout/Provider/Koboldai.py +3 -3
  43. webscout/Provider/LambdaChat.py +3 -2
  44. webscout/Provider/Llama.py +3 -5
  45. webscout/Provider/Llama3.py +4 -12
  46. webscout/Provider/Marcus.py +3 -3
  47. webscout/Provider/OLLAMA.py +8 -8
  48. webscout/Provider/Openai.py +7 -3
  49. webscout/Provider/PI.py +1 -1
  50. webscout/Provider/Perplexitylabs.py +1 -1
  51. webscout/Provider/Phind.py +1 -1
  52. webscout/Provider/PizzaGPT.py +1 -1
  53. webscout/Provider/QwenLM.py +4 -7
  54. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +3 -1
  55. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +3 -3
  56. webscout/Provider/TTI/ImgSys/__init__.py +23 -0
  57. webscout/Provider/TTI/ImgSys/async_imgsys.py +202 -0
  58. webscout/Provider/TTI/ImgSys/sync_imgsys.py +195 -0
  59. webscout/Provider/TTI/__init__.py +3 -1
  60. webscout/Provider/TTI/artbit/async_artbit.py +1 -1
  61. webscout/Provider/TTI/artbit/sync_artbit.py +1 -1
  62. webscout/Provider/TTI/huggingface/async_huggingface.py +1 -1
  63. webscout/Provider/TTI/huggingface/sync_huggingface.py +1 -1
  64. webscout/Provider/TTI/piclumen/__init__.py +22 -22
  65. webscout/Provider/TTI/piclumen/sync_piclumen.py +232 -232
  66. webscout/Provider/TTI/pixelmuse/__init__.py +4 -0
  67. webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +249 -0
  68. webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +182 -0
  69. webscout/Provider/TTI/talkai/sync_talkai.py +1 -1
  70. webscout/Provider/TTS/utils.py +1 -1
  71. webscout/Provider/TeachAnything.py +1 -1
  72. webscout/Provider/TextPollinationsAI.py +232 -230
  73. webscout/Provider/TwoAI.py +1 -2
  74. webscout/Provider/Venice.py +4 -2
  75. webscout/Provider/VercelAI.py +234 -0
  76. webscout/Provider/WebSim.py +3 -2
  77. webscout/Provider/WiseCat.py +10 -12
  78. webscout/Provider/Youchat.py +1 -1
  79. webscout/Provider/__init__.py +10 -4
  80. webscout/Provider/ai4chat.py +1 -1
  81. webscout/Provider/aimathgpt.py +2 -6
  82. webscout/Provider/akashgpt.py +1 -1
  83. webscout/Provider/askmyai.py +4 -4
  84. webscout/Provider/{DARKAI.py → asksteve.py} +56 -77
  85. webscout/Provider/bagoodex.py +2 -2
  86. webscout/Provider/cerebras.py +1 -1
  87. webscout/Provider/chatglm.py +4 -4
  88. webscout/Provider/cleeai.py +1 -0
  89. webscout/Provider/copilot.py +21 -9
  90. webscout/Provider/elmo.py +1 -1
  91. webscout/Provider/flowith.py +1 -1
  92. webscout/Provider/freeaichat.py +64 -31
  93. webscout/Provider/gaurish.py +3 -5
  94. webscout/Provider/geminiprorealtime.py +1 -1
  95. webscout/Provider/granite.py +4 -4
  96. webscout/Provider/hermes.py +5 -5
  97. webscout/Provider/julius.py +1 -1
  98. webscout/Provider/koala.py +1 -1
  99. webscout/Provider/lepton.py +1 -1
  100. webscout/Provider/llama3mitril.py +4 -4
  101. webscout/Provider/llamatutor.py +1 -1
  102. webscout/Provider/llmchat.py +3 -3
  103. webscout/Provider/meta.py +1 -1
  104. webscout/Provider/multichat.py +10 -10
  105. webscout/Provider/promptrefine.py +1 -1
  106. webscout/Provider/searchchat.py +293 -0
  107. webscout/Provider/sonus.py +2 -2
  108. webscout/Provider/talkai.py +2 -2
  109. webscout/Provider/turboseek.py +1 -1
  110. webscout/Provider/tutorai.py +1 -1
  111. webscout/Provider/typegpt.py +5 -42
  112. webscout/Provider/uncovr.py +312 -297
  113. webscout/Provider/x0gpt.py +1 -1
  114. webscout/Provider/yep.py +64 -12
  115. webscout/__init__.py +3 -1
  116. webscout/cli.py +59 -98
  117. webscout/conversation.py +350 -17
  118. webscout/litprinter/__init__.py +59 -667
  119. webscout/optimizers.py +419 -419
  120. webscout/tempid.py +11 -11
  121. webscout/update_checker.py +14 -12
  122. webscout/utils.py +2 -2
  123. webscout/version.py +1 -1
  124. webscout/webscout_search.py +146 -87
  125. webscout/webscout_search_async.py +148 -27
  126. {webscout-7.7.dist-info → webscout-7.9.dist-info}/METADATA +92 -66
  127. webscout-7.9.dist-info/RECORD +248 -0
  128. webscout/Provider/EDITEE.py +0 -192
  129. webscout/litprinter/colors.py +0 -54
  130. webscout-7.7.dist-info/RECORD +0 -234
  131. {webscout-7.7.dist-info → webscout-7.9.dist-info}/LICENSE.md +0 -0
  132. {webscout-7.7.dist-info → webscout-7.9.dist-info}/WHEEL +0 -0
  133. {webscout-7.7.dist-info → webscout-7.9.dist-info}/entry_points.txt +0 -0
  134. {webscout-7.7.dist-info → webscout-7.9.dist-info}/top_level.txt +0 -0
webscout/DWEBS.py CHANGED
@@ -1,461 +1,477 @@
1
- import requests
2
- from typing import Dict, List, Optional, Union, Any
3
- from concurrent.futures import ThreadPoolExecutor, as_completed
4
- from webscout.scout import Scout
5
- from urllib.parse import urljoin
6
- from webscout.litagent import LitAgent
7
-
8
- import time
9
- import json
10
- import os
11
- from datetime import datetime, timedelta
12
- from functools import lru_cache
13
- from webscout.Litlogger import Logger, LogFormat
14
- class GoogleS:
15
- """A Python interface for Google search with advanced features
16
-
17
- The GoogleS class provides a powerful interface to perform web searches, image searches,
18
- and advanced filtering on Google. Built with love by HAI to keep it
19
-
20
- Basic Usage:
21
- >>> from webscout.DWEBS import GoogleS
22
- >>> searcher = GoogleS()
23
- >>> # Simple web search
24
- >>> results = searcher.search("Python programming")
25
- >>> for result in results:
26
- ... print(f"Title: {result['title']}")
27
- ... print(f"URL: {result['href']}")
28
- ... print(f"Description: {result['abstract']}")
29
-
30
- Advanced Web Search:
31
- >>> # Search with filters
32
- >>> results = searcher.search(
33
- ... query="Python tutorials",
34
- ... site="github.com",
35
- ... file_type="pdf",
36
- ... time_period="month",
37
- ... max_results=5
38
- ... )
39
- >>> # Example response format:
40
- >>> {
41
- ... 'title': 'Python Tutorial',
42
- ... 'href': 'https://example.com/python-tutorial',
43
- ... 'abstract': 'Comprehensive Python tutorial covering basics to advanced topics',
44
- ... 'index': 0,
45
- ... 'type': 'web',
46
- ... 'visible_text': '' # Optional: Contains webpage text if extract_text=True
47
- ... }
48
-
49
- Image Search:
50
- >>> # Search for images
51
- >>> images = searcher.search_images(
52
- ... query="cute puppies",
53
- ... size="large",
54
- ... color="color",
55
- ... type_filter="photo",
56
- ... max_results=5
57
- ... )
58
- >>> # Example response format:
59
- >>> {
60
- ... 'title': 'Cute Puppy Image',
61
- ... 'thumbnail': 'https://example.com/puppy-thumb.jpg',
62
- ... 'full_url': 'https://example.com/puppy-full.jpg',
63
- ... 'type': 'image'
64
- ... }
65
-
66
- Features:
67
- - Web Search: Get detailed web results with title, URL, and description
68
- - Image Search: Find images with thumbnails and full-resolution URLs
69
- - Advanced Filters: Site-specific search, file types, time periods
70
- - Rate Limiting: Smart request handling to avoid blocks
71
- - Caching: Save results for faster repeat searches
72
- - Retry Logic: Automatic retry on temporary failures
73
- - Logging: Optional LitLogger integration for beautiful console output
74
- - Proxy Support: Use custom proxies for requests
75
- - Concurrent Processing: Multi-threaded requests for better performance
76
-
77
- Response Format:
78
- Web Search Results:
79
- {
80
- 'title': str, # Title of the webpage
81
- 'href': str, # URL of the webpage
82
- 'abstract': str, # Brief description or snippet
83
- 'index': int, # Result position
84
- 'type': 'web', # Result type identifier
85
- 'visible_text': str # Full page text (if extract_text=True)
86
- }
87
-
88
- Image Search Results:
89
- {
90
- 'title': str, # Image title or description
91
- 'thumbnail': str, # Thumbnail image URL
92
- 'full_url': str, # Full resolution image URL
93
- 'type': 'image' # Result type identifier
94
- }
95
- """
96
-
97
- SEARCH_TYPES = {
98
- "web": "https://www.google.com/search",
99
- "image": "https://www.google.com/images",
100
- "news": "https://www.google.com/news",
101
- }
102
-
103
- def __init__(
104
- self,
105
- headers: Optional[Dict[str, str]] = None,
106
- proxy: Optional[str] = None,
107
- timeout: Optional[int] = 10,
108
- max_workers: int = 20,
109
- cache_dir: Optional[str] = None,
110
- rate_limit: float = 2.0,
111
- ):
112
- """
113
- Initialize the GoogleS object with enhanced features.
114
-
115
- Args:
116
- cache_dir: Directory to store search result cache
117
- rate_limit: Minimum time between requests in seconds
118
- """
119
- self.proxy = proxy
120
- self.headers = headers if headers else {
121
- "User-Agent": LitAgent().random() # Use LitAgent to generate user agent
122
- }
123
- self.headers["Referer"] = "https://www.google.com/"
124
- self.client = requests.Session()
125
- self.client.headers.update(self.headers)
126
- if proxy:
127
- self.client.proxies.update({"http": proxy, "https": proxy})
128
- self.timeout = timeout
129
- self._executor = ThreadPoolExecutor(max_workers=max_workers)
130
- self.cache_dir = cache_dir
131
- if cache_dir and not os.path.exists(cache_dir):
132
- os.makedirs(cache_dir)
133
- self.last_request_time = 0
134
- self.rate_limit = rate_limit
135
-
136
- def _respect_rate_limit(self):
137
- """Ensure minimum time between requests"""
138
- current_time = time.time()
139
- time_since_last = current_time - self.last_request_time
140
- if time_since_last < self.rate_limit:
141
- sleep_time = self.rate_limit - time_since_last
142
- time.sleep(sleep_time)
143
- self.last_request_time = time.time()
144
-
145
- def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
146
- data: Optional[Union[Dict[str, str], bytes]] = None, max_retries: int = 3) -> bytes:
147
- """
148
- Makes an HTTP request with manual retry logic and rate limiting.
149
-
150
- Args:
151
- method (str): HTTP method (GET, POST, etc.)
152
- url (str): Target URL
153
- params (Optional[Dict[str, str]]): Query parameters
154
- data (Optional[Union[Dict[str, str], bytes]]): Request payload
155
- max_retries (int): Maximum number of retry attempts
156
-
157
- Returns:
158
- bytes: Response content
159
- """
160
- retry_count = 0
161
- base_delay = 5 # Base delay in seconds
162
-
163
- while retry_count < max_retries:
164
- try:
165
- self._respect_rate_limit()
166
- response = self.client.request(
167
- method=method,
168
- url=url,
169
- params=params,
170
- data=data,
171
- timeout=self.timeout
172
- )
173
-
174
- if response.status_code == 429:
175
- retry_delay = base_delay * (2 ** retry_count) # Exponential backoff
176
- time.sleep(retry_delay)
177
- retry_count += 1
178
- continue
179
-
180
- response.raise_for_status()
181
- return response.content
182
-
183
- except requests.exceptions.RequestException as e:
184
- if retry_count == max_retries - 1:
185
- raise
186
-
187
- retry_delay = base_delay * (2 ** retry_count)
188
- time.sleep(retry_delay)
189
- retry_count += 1
190
-
191
- raise Exception("Max retries reached")
192
-
193
- @lru_cache(maxsize=100)
194
- def _cache_key(self, query: str, **kwargs) -> str:
195
- """Generate a cache key from search parameters"""
196
- cache_data = {'query': query, **kwargs}
197
- return json.dumps(cache_data, sort_keys=True)
198
-
199
- def _get_cached_results(self, cache_key: str) -> Optional[List[Dict[str, Any]]]:
200
- """Retrieve cached results if they exist and are not expired"""
201
- if not self.cache_dir:
202
- return None
203
- cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
204
- if os.path.exists(cache_file):
205
- with open(cache_file, 'r') as f:
206
- cached_data = json.load(f)
207
- if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
208
- return cached_data['results']
209
- return None
210
-
211
- def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
212
- """Cache search results"""
213
- if not self.cache_dir:
214
- return
215
- cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
216
- with open(cache_file, 'w') as f:
217
- json.dump({
218
- 'timestamp': datetime.now().isoformat(),
219
- 'results': results
220
- }, f)
221
-
222
- def search_images(
223
- self,
224
- query: str,
225
- max_results: int = 10,
226
- size: Optional[str] = None,
227
- color: Optional[str] = None,
228
- type_filter: Optional[str] = None,
229
- **kwargs
230
- ) -> List[Dict[str, str]]:
231
- """Search for images on Google with style!
232
-
233
- Args:
234
- query (str): What you're looking for fam
235
- max_results (int): How many results you want (default: 10)
236
- size (Optional[str]): Image size filter
237
- - 'large': Big pics
238
- - 'medium': Medium sized
239
- - 'icon': Small icons
240
- color (Optional[str]): Color filter
241
- - 'color': Full color
242
- - 'gray': Black and white
243
- - 'transparent': Transparent background
244
- type_filter (Optional[str]): Type of image
245
- - 'face': Just faces
246
- - 'photo': Real photos
247
- - 'clipart': Vector art
248
- - 'lineart': Line drawings
249
-
250
- Returns:
251
- List[Dict[str, str]]: List of image results with these keys:
252
- - 'thumbnail': Small preview URL
253
- - 'full_url': Full resolution image URL
254
- - 'title': Image title/description
255
- - 'type': Always 'image'
256
-
257
- Example:
258
- >>> searcher = GoogleS()
259
- >>> # Find some cool nature pics
260
- >>> images = searcher.search_images(
261
- ... query="beautiful landscapes",
262
- ... size="large",
263
- ... color="color",
264
- ... max_results=5
265
- ... )
266
- >>> for img in images:
267
- ... print(f"Found: {img['title']}")
268
- ... print(f"URL: {img['full_url']}")
269
- """
270
- params = {
271
- "q": query,
272
- "tbm": "isch",
273
- "num": max_results
274
- }
275
-
276
- if size:
277
- params["tbs"] = f"isz:{size}"
278
- if color:
279
- params["tbs"] = f"ic:{color}"
280
- if type_filter:
281
- params["tbs"] = f"itp:{type_filter}"
282
-
283
- content = self._get_url("GET", self.SEARCH_TYPES["image"], params=params)
284
- soup = Scout(content) # Use Scout parser
285
-
286
- results = []
287
- for img in soup.find_all("img", class_="rg_i"):
288
- if len(results) >= max_results:
289
- break
290
-
291
- img_data = {
292
- "thumbnail": img.get("src", ""),
293
- "title": img.get("alt", ""),
294
- "type": "image"
295
- }
296
-
297
- # Extract full resolution image URL if available
298
- parent = img.parent
299
- if parent and parent.get("href"):
300
- img_data["full_url"] = urljoin("https://www.google.com", parent["href"])
301
-
302
- results.append(img_data)
303
-
304
- return results
305
-
306
- def search(
307
- self,
308
- query: str,
309
- region: str = "us-en",
310
- language: str = "en",
311
- safe: str = "off",
312
- time_period: Optional[str] = None,
313
- max_results: int = 10,
314
- extract_text: bool = False,
315
- max_text_length: Optional[int] = 100,
316
- site: Optional[str] = None, # Search within specific site
317
- file_type: Optional[str] = None, # Filter by file type
318
- sort_by: str = "relevance", # relevance, date
319
- exclude_terms: Optional[List[str]] = None, # Terms to exclude
320
- exact_phrase: Optional[str] = None, # Exact phrase match
321
- ) -> List[Dict[str, Union[str, int]]]:
322
- """
323
- Enhanced search with additional filters and options.
324
-
325
- Args:
326
- site: Limit search to specific website
327
- file_type: Filter by file type (pdf, doc, etc.)
328
- sort_by: Sort results by relevance or date
329
- exclude_terms: List of terms to exclude from search
330
- exact_phrase: Exact phrase to match
331
- """
332
- # Build advanced query
333
- advanced_query = query
334
- if site:
335
- advanced_query += f" site:{site}"
336
- if file_type:
337
- advanced_query += f" filetype:{file_type}"
338
- if exclude_terms:
339
- advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
340
- if exact_phrase:
341
- advanced_query = f'"{exact_phrase}"' + advanced_query
342
-
343
- # Check cache first
344
- cache_key = self._cache_key(advanced_query, region=region, language=language,
345
- safe=safe, time_period=time_period, sort_by=sort_by)
346
- cached_results = self._get_cached_results(cache_key)
347
- if cached_results:
348
- return cached_results[:max_results]
349
-
350
- # Continue with regular search implementation...
351
- results = []
352
- futures = []
353
- start = 0
354
-
355
- while len(results) < max_results:
356
- params = {
357
- "q": advanced_query,
358
- "num": 10,
359
- "hl": language,
360
- "start": start,
361
- "safe": safe,
362
- "gl": region,
363
- }
364
- if time_period:
365
- params["tbs"] = f"qdr:{time_period}"
366
-
367
- futures.append(self._executor.submit(self._get_url, "GET", self.SEARCH_TYPES["web"], params=params))
368
- start += 10
369
-
370
- for future in as_completed(futures):
371
- try:
372
- resp_content = future.result()
373
- soup = Scout(resp_content) # Use Scout parser
374
-
375
- result_blocks = soup.find_all("div", class_="g")
376
-
377
- if not result_blocks:
378
- break
379
-
380
- # Extract links and titles first
381
- for result_block in result_blocks:
382
- link = result_block.find("a", href=True)
383
- title = result_block.find("h3")
384
- description_box = result_block.find(
385
- "div", {"style": "-webkit-line-clamp:2"}
386
- )
387
-
388
- if link and title and description_box:
389
- url = link["href"]
390
- results.append({
391
- "title": title.text,
392
- "href": url,
393
- "abstract": description_box.text,
394
- "index": len(results),
395
- "type": "web",
396
- "visible_text": "" # Initialize visible_text as empty string
397
- })
398
-
399
- if len(results) >= max_results:
400
- break # Stop if we have enough results
401
-
402
- # Parallelize text extraction if needed
403
- if extract_text:
404
- with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
405
- extraction_futures = [
406
- text_extractor.submit(self._extract_text_from_webpage,
407
- self._get_url("GET", result['href']),
408
- max_characters=max_text_length)
409
- for result in results
410
- if 'href' in result
411
- ]
412
- for i, future in enumerate(as_completed(extraction_futures)):
413
- try:
414
- results[i]['visible_text'] = future.result()
415
- except Exception as e:
416
- print(f"Error extracting text: {e}")
417
-
418
- except Exception as e:
419
- print(f"Error: {e}")
420
-
421
- # Cache results before returning
422
- self._cache_results(cache_key, results)
423
- return results
424
-
425
- def get_search_suggestions(self, query: str) -> List[str]:
426
- """Get search suggestions for a query"""
427
- params = {
428
- "client": "chrome",
429
- "q": query
430
- }
431
- content = self._get_url("GET", "https://suggestqueries.google.com/complete/search",
432
- params=params)
433
- suggestions = json.loads(content.decode('utf-8'))[1]
434
- return suggestions
435
-
436
- def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
437
- """
438
- Extracts visible text from HTML content using Scout parser.
439
- """
440
- soup = Scout(html_content) # Use Scout parser
441
- for tag in soup(["script", "style", "header", "footer", "nav"]):
442
- tag.extract()
443
- visible_text = soup.get_text(strip=True)
444
- if max_characters:
445
- visible_text = visible_text[:max_characters]
446
- return visible_text
447
-
448
- def __enter__(self):
449
- return self
450
-
451
- def __exit__(self, exc_type, exc_val, exc_tb):
452
- self.client.close()
453
- self._executor.shutdown()
454
-
455
-
456
- if __name__ == "__main__":
457
- from rich import print
458
- searcher = GoogleS(rate_limit=3.0)
459
- results = searcher.search("HelpingAI-9B", max_results=5, extract_text=False, max_text_length=200)
460
- for result in results:
461
- print(result)
1
+ """
2
+ DWEBS - A Google search library with advanced features
3
+ """
4
+ import random
5
+ from time import sleep
6
+ from webscout.scout import Scout
7
+ from requests import get
8
+ from urllib.parse import unquote, urlencode
9
+ from typing import List, Dict, Optional, Union, Iterator, Any
10
+ from concurrent.futures import ThreadPoolExecutor
11
+
12
+
13
+ class SearchResult:
14
+ """Class to represent a search result with metadata."""
15
+
16
+ def __init__(self, url: str, title: str, description: str):
17
+ """
18
+ Initialize a search result.
19
+
20
+ Args:
21
+ url: The URL of the search result
22
+ title: The title of the search result
23
+ description: The description/snippet of the search result
24
+ """
25
+ self.url = url
26
+ self.title = title
27
+ self.description = description
28
+ # Additional metadata that can be populated
29
+ self.metadata: Dict[str, Any] = {}
30
+
31
+ def __repr__(self) -> str:
32
+ """Return string representation of search result."""
33
+ return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
34
+
35
+
36
+ class GoogleSearch:
37
+ """Google search implementation with configurable parameters and advanced features."""
38
+
39
+ _executor: ThreadPoolExecutor = ThreadPoolExecutor()
40
+
41
+ def __init__(
42
+ self,
43
+ timeout: int = 10,
44
+ proxies: Optional[Dict[str, str]] = None,
45
+ verify: bool = True,
46
+ lang: str = "en",
47
+ sleep_interval: float = 0.0
48
+ ):
49
+ """
50
+ Initialize GoogleSearch with custom settings.
51
+
52
+ Args:
53
+ timeout: Request timeout in seconds
54
+ proxies: Proxy configuration for requests
55
+ verify: Whether to verify SSL certificates
56
+ lang: Search language
57
+ sleep_interval: Sleep time between pagination requests
58
+ """
59
+ self.timeout = timeout
60
+ self.proxies = proxies if proxies else {}
61
+ self.verify = verify
62
+ self.lang = lang
63
+ self.sleep_interval = sleep_interval
64
+ self.base_url = "https://www.google.com/search"
65
+
66
+ def _get_useragent(self) -> str:
67
+ """
68
+ Generate a random user agent string.
69
+
70
+ Returns:
71
+ Random user agent string
72
+ """
73
+ lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
74
+ libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
75
+ ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
76
+ openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
77
+ return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"
78
+
79
+ def _make_request(self, term: str, results: int, start: int = 0, search_type: str = None) -> str:
80
+ """
81
+ Make a request to Google search.
82
+
83
+ Args:
84
+ term: Search query
85
+ results: Number of results to request
86
+ start: Start position for pagination
87
+ search_type: Type of search ('', 'nws', 'isch')
88
+
89
+ Returns:
90
+ HTML response content
91
+ """
92
+ params = {
93
+ "q": term,
94
+ "num": results + 2, # Request slightly more than needed
95
+ "hl": self.lang,
96
+ "start": start,
97
+ }
98
+
99
+ # Add search type if specified
100
+ if search_type:
101
+ params["tbm"] = search_type
102
+
103
+ try:
104
+ resp = get(
105
+ url=self.base_url,
106
+ headers={
107
+ "User-Agent": self._get_useragent(),
108
+ "Accept-Language": self.lang,
109
+ "Accept-Encoding": "gzip, deflate, br",
110
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
111
+ },
112
+ params=params,
113
+ proxies=self.proxies if any(self.proxies) else None,
114
+ timeout=self.timeout,
115
+ verify=self.verify,
116
+ cookies={
117
+ 'CONSENT': 'PENDING+987',
118
+ 'SOCS': 'CAESHAgBEhIaAB',
119
+ }
120
+ )
121
+ resp.raise_for_status()
122
+ return resp.text
123
+ except Exception as e:
124
+ raise RuntimeError(f"Search request failed: {str(e)}")
125
+
126
+ def _extract_url(self, raw_link: str) -> Optional[str]:
127
+ """
128
+ Extract actual URL from Google redirect URL.
129
+
130
+ Args:
131
+ raw_link: Raw link from Google search
132
+
133
+ Returns:
134
+ Actual URL or None if invalid
135
+ """
136
+ if not raw_link:
137
+ return None
138
+
139
+ if raw_link.startswith("/url?"):
140
+ try:
141
+ link = unquote(raw_link.split("&")[0].replace("/url?q=", ""))
142
+ return link
143
+ except Exception:
144
+ return None
145
+ elif raw_link.startswith("http"):
146
+ return unquote(raw_link)
147
+
148
+ return None
149
+
150
+ def _is_valid_result(self, link: str, fetched_links: set, unique: bool) -> bool:
151
+ """
152
+ Check if search result is valid.
153
+
154
+ Args:
155
+ link: URL to check
156
+ fetched_links: Set of already fetched links
157
+ unique: Whether to filter duplicate links
158
+
159
+ Returns:
160
+ Boolean indicating if result is valid
161
+ """
162
+ if any(x in link for x in ["google.", "/search?", "webcache."]):
163
+ return False
164
+
165
+ if link in fetched_links and unique:
166
+ return False
167
+
168
+ return True
169
+
170
+ def _parse_search_results(
171
+ self,
172
+ html: str,
173
+ num_results: int,
174
+ fetched_links: set,
175
+ unique: bool
176
+ ) -> List[SearchResult]:
177
+ """
178
+ Parse search results from HTML.
179
+
180
+ Args:
181
+ html: HTML content to parse
182
+ num_results: Maximum number of results to return
183
+ fetched_links: Set of already fetched links
184
+ unique: Filter duplicate links
185
+
186
+ Returns:
187
+ List of SearchResult objects
188
+ """
189
+ results = []
190
+ soup = Scout(html, features="html.parser")
191
+ result_blocks = soup.find_all("div", class_="ezO2md")
192
+
193
+ if not result_blocks:
194
+ # Try alternative class patterns if the main one doesn't match
195
+ result_blocks = soup.find_all("div", attrs={"class": lambda c: c and "g" in c.split()})
196
+
197
+ for result in result_blocks:
198
+ # Find the link - looking for various potential Google result classes
199
+ link_tag = result.find("a", class_=["fuLhoc", "ZWRArf"])
200
+ if not link_tag:
201
+ link_tag = result.find("a")
202
+ if not link_tag:
203
+ continue
204
+
205
+ raw_link = link_tag.get("href", "")
206
+ link = self._extract_url(raw_link)
207
+
208
+ if not link:
209
+ continue
210
+
211
+ if not self._is_valid_result(link, fetched_links, unique):
212
+ continue
213
+
214
+ # Get title - it's the text content of the link tag for these results
215
+ title = link_tag.get_text(strip=True)
216
+ if not title:
217
+ continue
218
+
219
+ # Get description - it's in a span with class FrIlee or potentially other classes
220
+ description_tag = result.find("span", class_="FrIlee")
221
+ if not description_tag:
222
+ description_tag = result.find(["div", "span"], class_=lambda c: c and any(x in c for x in ["snippet", "description", "VwiC3b"]))
223
+
224
+ description = description_tag.get_text(strip=True) if description_tag else ""
225
+
226
+ # Create result object
227
+ search_result = SearchResult(link, title, description)
228
+
229
+ # Add extra metadata if available
230
+ citation = result.find("cite")
231
+ if citation:
232
+ search_result.metadata["source"] = citation.get_text(strip=True)
233
+
234
+ timestamp = result.find("span", class_=lambda c: c and "ZE5qJf" in c)
235
+ if timestamp:
236
+ search_result.metadata["date"] = timestamp.get_text(strip=True)
237
+
238
+ fetched_links.add(link)
239
+ results.append(search_result)
240
+
241
+ if len(results) >= num_results:
242
+ break
243
+
244
+ return results
245
+
246
+ def text(
247
+ self,
248
+ keywords: str,
249
+ region: str = None,
250
+ safesearch: str = "moderate",
251
+ max_results: int = 10,
252
+ start_num: int = 0,
253
+ unique: bool = True
254
+ ) -> List[SearchResult]:
255
+ """
256
+ Search Google for web results.
257
+
258
+ Args:
259
+ keywords: Search query
260
+ region: Region for search results (ISO country code)
261
+ safesearch: SafeSearch setting ("on", "moderate", "off")
262
+ max_results: Maximum number of results to return
263
+ start_num: Starting position for pagination
264
+ unique: Filter duplicate results
265
+
266
+ Returns:
267
+ List of SearchResult objects with search results
268
+ """
269
+ if not keywords:
270
+ raise ValueError("Search keywords cannot be empty")
271
+
272
+ # Map safesearch values to Google's safe parameter
273
+ safe_map = {
274
+ "on": "active",
275
+ "moderate": "moderate",
276
+ "off": "off"
277
+ }
278
+ safe = safe_map.get(safesearch.lower(), "moderate")
279
+
280
+ # Keep track of unique results
281
+ fetched_results = []
282
+ fetched_links = set()
283
+ start = start_num
284
+
285
+ while len(fetched_results) < max_results:
286
+ response_html = self._make_request(
287
+ term=keywords,
288
+ results=max_results - len(fetched_results),
289
+ start=start
290
+ )
291
+
292
+ results = self._parse_search_results(
293
+ html=response_html,
294
+ num_results=max_results - len(fetched_results),
295
+ fetched_links=fetched_links,
296
+ unique=unique
297
+ )
298
+
299
+ if not results:
300
+ break
301
+
302
+ fetched_results.extend(results)
303
+
304
+ if len(fetched_results) >= max_results:
305
+ break
306
+
307
+ start += 10
308
+ sleep(self.sleep_interval)
309
+
310
+ return fetched_results[:max_results]
311
+
312
+ def news(
313
+ self,
314
+ keywords: str,
315
+ region: str = None,
316
+ safesearch: str = "moderate",
317
+ max_results: int = 10
318
+ ) -> List[SearchResult]:
319
+ """
320
+ Search Google News for news results.
321
+
322
+ Args:
323
+ keywords: Search query
324
+ region: Region for search results (ISO country code)
325
+ safesearch: SafeSearch setting ("on", "moderate", "off")
326
+ max_results: Maximum number of results to return
327
+
328
+ Returns:
329
+ List of SearchResult objects with news results
330
+ """
331
+ if not keywords:
332
+ raise ValueError("Search keywords cannot be empty")
333
+
334
+ # Map safesearch values to Google's safe parameter
335
+ safe_map = {
336
+ "on": "active",
337
+ "moderate": "moderate",
338
+ "off": "off"
339
+ }
340
+ safe = safe_map.get(safesearch.lower(), "moderate")
341
+
342
+ # Keep track of unique results
343
+ fetched_results = []
344
+ fetched_links = set()
345
+
346
+ response_html = self._make_request(
347
+ term=keywords,
348
+ results=max_results,
349
+ search_type="nws"
350
+ )
351
+
352
+ results = self._parse_search_results(
353
+ html=response_html,
354
+ num_results=max_results,
355
+ fetched_links=fetched_links,
356
+ unique=True
357
+ )
358
+
359
+ return results[:max_results]
360
+
361
+ def suggestions(self, query: str, region: str = None) -> List[str]:
362
+ """
363
+ Get search suggestions for a query term.
364
+
365
+ Args:
366
+ query: Search query
367
+ region: Region for suggestions (ISO country code)
368
+
369
+ Returns:
370
+ List of search suggestions
371
+ """
372
+ if not query:
373
+ raise ValueError("Search query cannot be empty")
374
+
375
+ try:
376
+ params = {
377
+ "client": "firefox",
378
+ "q": query,
379
+ }
380
+
381
+ # Add region if specified
382
+ if region and region.lower() != "all":
383
+ params["gl"] = region
384
+
385
+ url = f"https://www.google.com/complete/search?{urlencode(params)}"
386
+
387
+ headers = {
388
+ "User-Agent": self._get_useragent(),
389
+ "Accept": "application/json, text/javascript, */*",
390
+ "Accept-Language": self.lang,
391
+ }
392
+
393
+ response = get(
394
+ url=url,
395
+ headers=headers,
396
+ timeout=self.timeout,
397
+ verify=self.verify
398
+ )
399
+ response.raise_for_status()
400
+
401
+ # Response format is typically: ["original query", ["suggestion1", "suggestion2", ...]]
402
+ data = response.json()
403
+ if isinstance(data, list) and len(data) > 1 and isinstance(data[1], list):
404
+ return data[1]
405
+ return []
406
+
407
+ except Exception as e:
408
+ # Return empty list on error instead of raising exception
409
+ return []
410
+
411
+
412
+ # Legacy function support for backward compatibility
413
+ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=True, region=None, start_num=0, unique=False):
414
+ """Legacy function for backward compatibility."""
415
+ google_search = GoogleSearch(
416
+ timeout=timeout,
417
+ proxies={"https": proxy, "http": proxy} if proxy else None,
418
+ verify=ssl_verify,
419
+ lang=lang,
420
+ sleep_interval=sleep_interval
421
+ )
422
+
423
+ results = google_search.text(
424
+ keywords=term,
425
+ region=region,
426
+ safesearch="on" if safe == "active" else "moderate" if safe == "moderate" else "off",
427
+ max_results=num_results,
428
+ start_num=start_num,
429
+ unique=unique
430
+ )
431
+
432
+ # Convert to simple URLs if not advanced mode
433
+ if not advanced:
434
+ return [result.url for result in results]
435
+ return results
436
+
437
+
438
+ if __name__ == "__main__":
439
+ from rich import print
440
+ google = GoogleSearch(
441
+ timeout=10, # Optional: Set custom timeout
442
+ proxies=None, # Optional: Use proxies
443
+ verify=True # Optional: SSL verification
444
+ )
445
+
446
+ # Text Search
447
+ print("TEXT SEARCH RESULTS:")
448
+ text_results = google.text(
449
+ keywords="Python programming",
450
+ region="us", # Optional: Region for results
451
+ safesearch="moderate", # Optional: "on", "moderate", "off"
452
+ max_results=3 # Optional: Limit number of results
453
+ )
454
+ for result in text_results:
455
+ print(f"Title: {result.title}")
456
+ print(f"URL: {result.url}")
457
+ print(f"Description: {result.description}")
458
+ print("---")
459
+
460
+ # News Search
461
+ print("\nNEWS SEARCH RESULTS:")
462
+ news_results = google.news(
463
+ keywords="artificial intelligence",
464
+ region="us",
465
+ safesearch="moderate",
466
+ max_results=2
467
+ )
468
+ for result in news_results:
469
+ print(f"Title: {result.title}")
470
+ print(f"URL: {result.url}")
471
+ print(f"Description: {result.description}")
472
+ print("---")
473
+
474
+ # Search Suggestions
475
+ print("\nSEARCH SUGGESTIONS:")
476
+ suggestions = google.suggestions("how to")
477
+ print(suggestions)
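
For anyone upgrading from 7.7, here is a minimal migration sketch assuming a 7.9 install. The class, method, and attribute names come from the added DWEBS.py code above (GoogleSearch, text(), the legacy search() wrapper, and the SearchResult attributes); the GoogleS dict keys referenced in the comments come from the removed code. The query string and numeric arguments are only illustrative.

# Minimal migration sketch for webscout 7.9; query and numbers are placeholders.
from webscout.DWEBS import GoogleSearch, search

# New class-based API: text() returns SearchResult objects.
google = GoogleSearch(timeout=10, sleep_interval=0.5)
for result in google.text("Python programming", max_results=5):
    print(result.title)        # 7.7 GoogleS returned result['title']
    print(result.url)          # ... result['href']
    print(result.description)  # ... result['abstract']

# Legacy wrapper kept for backward compatibility: advanced=False returns
# plain URLs, advanced=True returns SearchResult objects.
urls = search("Python programming", num_results=5, advanced=False)
print(urls)

Note that GoogleSearch.text() takes the query as its first argument (keywords), where the removed GoogleS.search() took query, and that region/safesearch remain optional keyword arguments, as the new __main__ block above demonstrates.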