webscout-7.7-py3-none-any.whl → webscout-7.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of webscout might be problematic.

Files changed (121)
  1. webscout/AIutel.py +2 -1
  2. webscout/Bard.py +14 -11
  3. webscout/DWEBS.py +431 -415
  4. webscout/Extra/autocoder/__init__.py +9 -9
  5. webscout/Extra/autocoder/autocoder_utiles.py +332 -194
  6. webscout/Extra/autocoder/rawdog.py +68 -9
  7. webscout/Extra/gguf.py +682 -682
  8. webscout/Provider/AI21.py +1 -1
  9. webscout/Provider/AISEARCH/DeepFind.py +2 -2
  10. webscout/Provider/AISEARCH/ISou.py +2 -2
  11. webscout/Provider/AISEARCH/felo_search.py +6 -6
  12. webscout/Provider/AISEARCH/genspark_search.py +1 -1
  13. webscout/Provider/Aitopia.py +292 -0
  14. webscout/Provider/AllenAI.py +1 -1
  15. webscout/Provider/Andi.py +3 -3
  16. webscout/Provider/C4ai.py +1 -1
  17. webscout/Provider/ChatGPTES.py +3 -5
  18. webscout/Provider/ChatGPTGratis.py +4 -4
  19. webscout/Provider/Chatify.py +2 -2
  20. webscout/Provider/Cloudflare.py +3 -2
  21. webscout/Provider/DARKAI.py +3 -2
  22. webscout/Provider/DeepSeek.py +2 -2
  23. webscout/Provider/Deepinfra.py +1 -1
  24. webscout/Provider/EDITEE.py +1 -1
  25. webscout/Provider/ElectronHub.py +178 -96
  26. webscout/Provider/ExaChat.py +310 -0
  27. webscout/Provider/Free2GPT.py +2 -2
  28. webscout/Provider/Gemini.py +5 -19
  29. webscout/Provider/GithubChat.py +1 -1
  30. webscout/Provider/Glider.py +4 -4
  31. webscout/Provider/Groq.py +3 -3
  32. webscout/Provider/HF_space/qwen_qwen2.py +1 -1
  33. webscout/Provider/HeckAI.py +1 -1
  34. webscout/Provider/HuggingFaceChat.py +1 -1
  35. webscout/Provider/Hunyuan.py +1 -1
  36. webscout/Provider/Jadve.py +3 -3
  37. webscout/Provider/Koboldai.py +3 -3
  38. webscout/Provider/LambdaChat.py +1 -1
  39. webscout/Provider/Llama.py +3 -5
  40. webscout/Provider/Llama3.py +4 -12
  41. webscout/Provider/Marcus.py +3 -3
  42. webscout/Provider/OLLAMA.py +8 -8
  43. webscout/Provider/Openai.py +7 -3
  44. webscout/Provider/PI.py +1 -1
  45. webscout/Provider/Perplexitylabs.py +1 -1
  46. webscout/Provider/Phind.py +1 -1
  47. webscout/Provider/PizzaGPT.py +1 -1
  48. webscout/Provider/QwenLM.py +4 -7
  49. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +3 -1
  50. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +3 -3
  51. webscout/Provider/TTI/ImgSys/__init__.py +23 -0
  52. webscout/Provider/TTI/ImgSys/async_imgsys.py +202 -0
  53. webscout/Provider/TTI/ImgSys/sync_imgsys.py +195 -0
  54. webscout/Provider/TTI/__init__.py +3 -1
  55. webscout/Provider/TTI/artbit/async_artbit.py +1 -1
  56. webscout/Provider/TTI/artbit/sync_artbit.py +1 -1
  57. webscout/Provider/TTI/huggingface/async_huggingface.py +1 -1
  58. webscout/Provider/TTI/huggingface/sync_huggingface.py +1 -1
  59. webscout/Provider/TTI/piclumen/__init__.py +22 -22
  60. webscout/Provider/TTI/piclumen/sync_piclumen.py +232 -232
  61. webscout/Provider/TTI/pixelmuse/__init__.py +4 -0
  62. webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +249 -0
  63. webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +182 -0
  64. webscout/Provider/TTI/talkai/sync_talkai.py +1 -1
  65. webscout/Provider/TTS/utils.py +1 -1
  66. webscout/Provider/TeachAnything.py +1 -1
  67. webscout/Provider/TextPollinationsAI.py +4 -4
  68. webscout/Provider/TwoAI.py +1 -2
  69. webscout/Provider/Venice.py +4 -2
  70. webscout/Provider/VercelAI.py +234 -0
  71. webscout/Provider/WebSim.py +3 -2
  72. webscout/Provider/WiseCat.py +10 -12
  73. webscout/Provider/Youchat.py +1 -1
  74. webscout/Provider/__init__.py +10 -0
  75. webscout/Provider/ai4chat.py +1 -1
  76. webscout/Provider/aimathgpt.py +2 -6
  77. webscout/Provider/akashgpt.py +1 -1
  78. webscout/Provider/askmyai.py +4 -4
  79. webscout/Provider/asksteve.py +203 -0
  80. webscout/Provider/bagoodex.py +2 -2
  81. webscout/Provider/cerebras.py +1 -1
  82. webscout/Provider/chatglm.py +4 -4
  83. webscout/Provider/cleeai.py +1 -0
  84. webscout/Provider/copilot.py +427 -415
  85. webscout/Provider/elmo.py +1 -1
  86. webscout/Provider/flowith.py +1 -1
  87. webscout/Provider/freeaichat.py +57 -31
  88. webscout/Provider/gaurish.py +3 -5
  89. webscout/Provider/geminiprorealtime.py +1 -1
  90. webscout/Provider/granite.py +4 -4
  91. webscout/Provider/hermes.py +5 -5
  92. webscout/Provider/julius.py +1 -1
  93. webscout/Provider/koala.py +1 -1
  94. webscout/Provider/lepton.py +1 -1
  95. webscout/Provider/llama3mitril.py +4 -4
  96. webscout/Provider/llamatutor.py +1 -1
  97. webscout/Provider/llmchat.py +3 -3
  98. webscout/Provider/meta.py +1 -1
  99. webscout/Provider/multichat.py +10 -10
  100. webscout/Provider/promptrefine.py +1 -1
  101. webscout/Provider/searchchat.py +293 -0
  102. webscout/Provider/sonus.py +2 -2
  103. webscout/Provider/talkai.py +2 -2
  104. webscout/Provider/turboseek.py +1 -1
  105. webscout/Provider/tutorai.py +1 -1
  106. webscout/Provider/typegpt.py +5 -42
  107. webscout/Provider/uncovr.py +4 -2
  108. webscout/Provider/x0gpt.py +1 -1
  109. webscout/__init__.py +36 -36
  110. webscout/cli.py +293 -332
  111. webscout/tempid.py +11 -11
  112. webscout/utils.py +2 -2
  113. webscout/version.py +1 -1
  114. webscout/webscout_search.py +1282 -1223
  115. webscout/webscout_search_async.py +813 -692
  116. {webscout-7.7.dist-info → webscout-7.8.dist-info}/METADATA +50 -29
  117. {webscout-7.7.dist-info → webscout-7.8.dist-info}/RECORD +121 -110
  118. {webscout-7.7.dist-info → webscout-7.8.dist-info}/LICENSE.md +0 -0
  119. {webscout-7.7.dist-info → webscout-7.8.dist-info}/WHEEL +0 -0
  120. {webscout-7.7.dist-info → webscout-7.8.dist-info}/entry_points.txt +0 -0
  121. {webscout-7.7.dist-info → webscout-7.8.dist-info}/top_level.txt +0 -0
webscout/DWEBS.py CHANGED
@@ -1,461 +1,477 @@
- import requests
- from typing import Dict, List, Optional, Union, Any
- from concurrent.futures import ThreadPoolExecutor, as_completed
+ """
+ DWEBS - A Google search library with advanced features
+ """
+ import random
+ from time import sleep
  from webscout.scout import Scout
- from urllib.parse import urljoin
- from webscout.litagent import LitAgent
+ from requests import get
+ from urllib.parse import unquote, urlencode
+ from typing import List, Dict, Optional, Union, Iterator, Any
+ from concurrent.futures import ThreadPoolExecutor

- import time
- import json
- import os
- from datetime import datetime, timedelta
- from functools import lru_cache
- from webscout.Litlogger import Logger, LogFormat
- class GoogleS:
- """A Python interface for Google search with advanced features
-
- The GoogleS class provides a powerful interface to perform web searches, image searches,
- and advanced filtering on Google. Built with love by HAI to keep it
-
- Basic Usage:
- >>> from webscout.DWEBS import GoogleS
- >>> searcher = GoogleS()
- >>> # Simple web search
- >>> results = searcher.search("Python programming")
- >>> for result in results:
- ... print(f"Title: {result['title']}")
- ... print(f"URL: {result['href']}")
- ... print(f"Description: {result['abstract']}")
-
- Advanced Web Search:
- >>> # Search with filters
- >>> results = searcher.search(
- ... query="Python tutorials",
- ... site="github.com",
- ... file_type="pdf",
- ... time_period="month",
- ... max_results=5
- ... )
- >>> # Example response format:
- >>> {
- ... 'title': 'Python Tutorial',
- ... 'href': 'https://example.com/python-tutorial',
- ... 'abstract': 'Comprehensive Python tutorial covering basics to advanced topics',
- ... 'index': 0,
- ... 'type': 'web',
- ... 'visible_text': '' # Optional: Contains webpage text if extract_text=True
- ... }
-
- Image Search:
- >>> # Search for images
- >>> images = searcher.search_images(
- ... query="cute puppies",
- ... size="large",
- ... color="color",
- ... type_filter="photo",
- ... max_results=5
- ... )
- >>> # Example response format:
- >>> {
- ... 'title': 'Cute Puppy Image',
- ... 'thumbnail': 'https://example.com/puppy-thumb.jpg',
- ... 'full_url': 'https://example.com/puppy-full.jpg',
- ... 'type': 'image'
- ... }
-
- Features:
- - Web Search: Get detailed web results with title, URL, and description
- - Image Search: Find images with thumbnails and full-resolution URLs
- - Advanced Filters: Site-specific search, file types, time periods
- - Rate Limiting: Smart request handling to avoid blocks
- - Caching: Save results for faster repeat searches
- - Retry Logic: Automatic retry on temporary failures
- - Logging: Optional LitLogger integration for beautiful console output
- - Proxy Support: Use custom proxies for requests
- - Concurrent Processing: Multi-threaded requests for better performance
-
- Response Format:
- Web Search Results:
- {
- 'title': str, # Title of the webpage
- 'href': str, # URL of the webpage
- 'abstract': str, # Brief description or snippet
- 'index': int, # Result position
- 'type': 'web', # Result type identifier
- 'visible_text': str # Full page text (if extract_text=True)
- }
-
- Image Search Results:
- {
- 'title': str, # Image title or description
- 'thumbnail': str, # Thumbnail image URL
- 'full_url': str, # Full resolution image URL
- 'type': 'image' # Result type identifier
- }
- """
-
- SEARCH_TYPES = {
- "web": "https://www.google.com/search",
- "image": "https://www.google.com/images",
- "news": "https://www.google.com/news",
- }

+ class SearchResult:
+ """Class to represent a search result with metadata."""
+
+ def __init__(self, url: str, title: str, description: str):
+ """
+ Initialize a search result.
+
+ Args:
+ url: The URL of the search result
+ title: The title of the search result
+ description: The description/snippet of the search result
+ """
+ self.url = url
+ self.title = title
+ self.description = description
+ # Additional metadata that can be populated
+ self.metadata: Dict[str, Any] = {}
+
+ def __repr__(self) -> str:
+ """Return string representation of search result."""
+ return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
+
+
+ class GoogleSearch:
+ """Google search implementation with configurable parameters and advanced features."""
+
+ _executor: ThreadPoolExecutor = ThreadPoolExecutor()
+
  def __init__(
  self,
- headers: Optional[Dict[str, str]] = None,
- proxy: Optional[str] = None,
- timeout: Optional[int] = 10,
- max_workers: int = 20,
- cache_dir: Optional[str] = None,
- rate_limit: float = 2.0,
+ timeout: int = 10,
+ proxies: Optional[Dict[str, str]] = None,
+ verify: bool = True,
+ lang: str = "en",
+ sleep_interval: float = 0.0
  ):
  """
- Initialize the GoogleS object with enhanced features.
+ Initialize GoogleSearch with custom settings.

  Args:
- cache_dir: Directory to store search result cache
- rate_limit: Minimum time between requests in seconds
+ timeout: Request timeout in seconds
+ proxies: Proxy configuration for requests
+ verify: Whether to verify SSL certificates
+ lang: Search language
+ sleep_interval: Sleep time between pagination requests
  """
- self.proxy = proxy
- self.headers = headers if headers else {
- "User-Agent": LitAgent().random() # Use LitAgent to generate user agent
- }
- self.headers["Referer"] = "https://www.google.com/"
- self.client = requests.Session()
- self.client.headers.update(self.headers)
- if proxy:
- self.client.proxies.update({"http": proxy, "https": proxy})
  self.timeout = timeout
- self._executor = ThreadPoolExecutor(max_workers=max_workers)
- self.cache_dir = cache_dir
- if cache_dir and not os.path.exists(cache_dir):
- os.makedirs(cache_dir)
- self.last_request_time = 0
- self.rate_limit = rate_limit
-
- def _respect_rate_limit(self):
- """Ensure minimum time between requests"""
- current_time = time.time()
- time_since_last = current_time - self.last_request_time
- if time_since_last < self.rate_limit:
- sleep_time = self.rate_limit - time_since_last
- time.sleep(sleep_time)
- self.last_request_time = time.time()
-
- def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
- data: Optional[Union[Dict[str, str], bytes]] = None, max_retries: int = 3) -> bytes:
+ self.proxies = proxies if proxies else {}
+ self.verify = verify
+ self.lang = lang
+ self.sleep_interval = sleep_interval
+ self.base_url = "https://www.google.com/search"
+
+ def _get_useragent(self) -> str:
  """
- Makes an HTTP request with manual retry logic and rate limiting.
-
- Args:
- method (str): HTTP method (GET, POST, etc.)
- url (str): Target URL
- params (Optional[Dict[str, str]]): Query parameters
- data (Optional[Union[Dict[str, str], bytes]]): Request payload
- max_retries (int): Maximum number of retry attempts
+ Generate a random user agent string.

  Returns:
- bytes: Response content
+ Random user agent string
+ """
+ lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
+ libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
+ ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
+ openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
+ return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"
+
+ def _make_request(self, term: str, results: int, start: int = 0, search_type: str = None) -> str:
  """
- retry_count = 0
- base_delay = 5 # Base delay in seconds
+ Make a request to Google search.

- while retry_count < max_retries:
- try:
- self._respect_rate_limit()
- response = self.client.request(
- method=method,
- url=url,
- params=params,
- data=data,
- timeout=self.timeout
- )
-
- if response.status_code == 429:
- retry_delay = base_delay * (2 ** retry_count) # Exponential backoff
- time.sleep(retry_delay)
- retry_count += 1
- continue
-
- response.raise_for_status()
- return response.content
-
- except requests.exceptions.RequestException as e:
- if retry_count == max_retries - 1:
- raise
-
- retry_delay = base_delay * (2 ** retry_count)
- time.sleep(retry_delay)
- retry_count += 1
-
- raise Exception("Max retries reached")
-
- @lru_cache(maxsize=100)
- def _cache_key(self, query: str, **kwargs) -> str:
- """Generate a cache key from search parameters"""
- cache_data = {'query': query, **kwargs}
- return json.dumps(cache_data, sort_keys=True)
-
- def _get_cached_results(self, cache_key: str) -> Optional[List[Dict[str, Any]]]:
- """Retrieve cached results if they exist and are not expired"""
- if not self.cache_dir:
- return None
- cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
- if os.path.exists(cache_file):
- with open(cache_file, 'r') as f:
- cached_data = json.load(f)
- if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
- return cached_data['results']
- return None
-
- def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
- """Cache search results"""
- if not self.cache_dir:
- return
- cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
- with open(cache_file, 'w') as f:
- json.dump({
- 'timestamp': datetime.now().isoformat(),
- 'results': results
- }, f)
-
- def search_images(
- self,
- query: str,
- max_results: int = 10,
- size: Optional[str] = None,
- color: Optional[str] = None,
- type_filter: Optional[str] = None,
- **kwargs
- ) -> List[Dict[str, str]]:
- """Search for images on Google with style!
-
  Args:
- query (str): What you're looking for fam
- max_results (int): How many results you want (default: 10)
- size (Optional[str]): Image size filter
- - 'large': Big pics
- - 'medium': Medium sized
- - 'icon': Small icons
- color (Optional[str]): Color filter
- - 'color': Full color
- - 'gray': Black and white
- - 'transparent': Transparent background
- type_filter (Optional[str]): Type of image
- - 'face': Just faces
- - 'photo': Real photos
- - 'clipart': Vector art
- - 'lineart': Line drawings
-
+ term: Search query
+ results: Number of results to request
+ start: Start position for pagination
+ search_type: Type of search ('', 'nws', 'isch')
+
  Returns:
- List[Dict[str, str]]: List of image results with these keys:
- - 'thumbnail': Small preview URL
- - 'full_url': Full resolution image URL
- - 'title': Image title/description
- - 'type': Always 'image'
-
- Example:
- >>> searcher = GoogleS()
- >>> # Find some cool nature pics
- >>> images = searcher.search_images(
- ... query="beautiful landscapes",
- ... size="large",
- ... color="color",
- ... max_results=5
- ... )
- >>> for img in images:
- ... print(f"Found: {img['title']}")
- ... print(f"URL: {img['full_url']}")
+ HTML response content
  """
  params = {
- "q": query,
- "tbm": "isch",
- "num": max_results
+ "q": term,
+ "num": results + 2, # Request slightly more than needed
+ "hl": self.lang,
+ "start": start,
  }

- if size:
- params["tbs"] = f"isz:{size}"
- if color:
- params["tbs"] = f"ic:{color}"
- if type_filter:
- params["tbs"] = f"itp:{type_filter}"
-
- content = self._get_url("GET", self.SEARCH_TYPES["image"], params=params)
- soup = Scout(content) # Use Scout parser
+ # Add search type if specified
+ if search_type:
+ params["tbm"] = search_type
+
+ try:
+ resp = get(
+ url=self.base_url,
+ headers={
+ "User-Agent": self._get_useragent(),
+ "Accept-Language": self.lang,
+ "Accept-Encoding": "gzip, deflate, br",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ },
+ params=params,
+ proxies=self.proxies if any(self.proxies) else None,
+ timeout=self.timeout,
+ verify=self.verify,
+ cookies={
+ 'CONSENT': 'PENDING+987',
+ 'SOCS': 'CAESHAgBEhIaAB',
+ }
+ )
+ resp.raise_for_status()
+ return resp.text
+ except Exception as e:
+ raise RuntimeError(f"Search request failed: {str(e)}")
+
+ def _extract_url(self, raw_link: str) -> Optional[str]:
+ """
+ Extract actual URL from Google redirect URL.

- results = []
- for img in soup.find_all("img", class_="rg_i"):
- if len(results) >= max_results:
- break
+ Args:
+ raw_link: Raw link from Google search

- img_data = {
- "thumbnail": img.get("src", ""),
- "title": img.get("alt", ""),
- "type": "image"
- }
+ Returns:
+ Actual URL or None if invalid
+ """
+ if not raw_link:
+ return None
+
+ if raw_link.startswith("/url?"):
+ try:
+ link = unquote(raw_link.split("&")[0].replace("/url?q=", ""))
+ return link
+ except Exception:
+ return None
+ elif raw_link.startswith("http"):
+ return unquote(raw_link)
+
+ return None
+
+ def _is_valid_result(self, link: str, fetched_links: set, unique: bool) -> bool:
+ """
+ Check if search result is valid.
+
+ Args:
+ link: URL to check
+ fetched_links: Set of already fetched links
+ unique: Whether to filter duplicate links

- # Extract full resolution image URL if available
- parent = img.parent
- if parent and parent.get("href"):
- img_data["full_url"] = urljoin("https://www.google.com", parent["href"])
+ Returns:
+ Boolean indicating if result is valid
+ """
+ if any(x in link for x in ["google.", "/search?", "webcache."]):
+ return False

- results.append(img_data)
+ if link in fetched_links and unique:
+ return False

- return results
-
- def search(
+ return True
+
+ def _parse_search_results(
  self,
- query: str,
- region: str = "us-en",
- language: str = "en",
- safe: str = "off",
- time_period: Optional[str] = None,
- max_results: int = 10,
- extract_text: bool = False,
- max_text_length: Optional[int] = 100,
- site: Optional[str] = None, # Search within specific site
- file_type: Optional[str] = None, # Filter by file type
- sort_by: str = "relevance", # relevance, date
- exclude_terms: Optional[List[str]] = None, # Terms to exclude
- exact_phrase: Optional[str] = None, # Exact phrase match
- ) -> List[Dict[str, Union[str, int]]]:
+ html: str,
+ num_results: int,
+ fetched_links: set,
+ unique: bool
+ ) -> List[SearchResult]:
  """
- Enhanced search with additional filters and options.
+ Parse search results from HTML.

  Args:
- site: Limit search to specific website
- file_type: Filter by file type (pdf, doc, etc.)
- sort_by: Sort results by relevance or date
- exclude_terms: List of terms to exclude from search
- exact_phrase: Exact phrase to match
+ html: HTML content to parse
+ num_results: Maximum number of results to return
+ fetched_links: Set of already fetched links
+ unique: Filter duplicate links
+
+ Returns:
+ List of SearchResult objects
  """
- # Build advanced query
- advanced_query = query
- if site:
- advanced_query += f" site:{site}"
- if file_type:
- advanced_query += f" filetype:{file_type}"
- if exclude_terms:
- advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
- if exact_phrase:
- advanced_query = f'"{exact_phrase}"' + advanced_query
-
- # Check cache first
- cache_key = self._cache_key(advanced_query, region=region, language=language,
- safe=safe, time_period=time_period, sort_by=sort_by)
- cached_results = self._get_cached_results(cache_key)
- if cached_results:
- return cached_results[:max_results]
-
- # Continue with regular search implementation...
  results = []
- futures = []
- start = 0
-
- while len(results) < max_results:
- params = {
- "q": advanced_query,
- "num": 10,
- "hl": language,
- "start": start,
- "safe": safe,
- "gl": region,
- }
- if time_period:
- params["tbs"] = f"qdr:{time_period}"
-
- futures.append(self._executor.submit(self._get_url, "GET", self.SEARCH_TYPES["web"], params=params))
- start += 10
-
- for future in as_completed(futures):
- try:
- resp_content = future.result()
- soup = Scout(resp_content) # Use Scout parser
-
- result_blocks = soup.find_all("div", class_="g")
-
- if not result_blocks:
- break
-
- # Extract links and titles first
- for result_block in result_blocks:
- link = result_block.find("a", href=True)
- title = result_block.find("h3")
- description_box = result_block.find(
- "div", {"style": "-webkit-line-clamp:2"}
- )
+ soup = Scout(html, features="html.parser")
+ result_blocks = soup.find_all("div", class_="ezO2md")
+
+ if not result_blocks:
+ # Try alternative class patterns if the main one doesn't match
+ result_blocks = soup.find_all("div", attrs={"class": lambda c: c and "g" in c.split()})
+
+ for result in result_blocks:
+ # Find the link - looking for various potential Google result classes
+ link_tag = result.find("a", class_=["fuLhoc", "ZWRArf"])
+ if not link_tag:
+ link_tag = result.find("a")
+ if not link_tag:
+ continue
+
+ raw_link = link_tag.get("href", "")
+ link = self._extract_url(raw_link)
+
+ if not link:
+ continue

- if link and title and description_box:
- url = link["href"]
- results.append({
- "title": title.text,
- "href": url,
- "abstract": description_box.text,
- "index": len(results),
- "type": "web",
- "visible_text": "" # Initialize visible_text as empty string
- })
+ if not self._is_valid_result(link, fetched_links, unique):
+ continue

- if len(results) >= max_results:
- break # Stop if we have enough results
+ # Get title - it's the text content of the link tag for these results
+ title = link_tag.get_text(strip=True)
+ if not title:
+ continue

- # Parallelize text extraction if needed
- if extract_text:
- with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
- extraction_futures = [
- text_extractor.submit(self._extract_text_from_webpage,
- self._get_url("GET", result['href']),
- max_characters=max_text_length)
- for result in results
- if 'href' in result
- ]
- for i, future in enumerate(as_completed(extraction_futures)):
- try:
- results[i]['visible_text'] = future.result()
- except Exception as e:
- print(f"Error extracting text: {e}")
+ # Get description - it's in a span with class FrIlee or potentially other classes
+ description_tag = result.find("span", class_="FrIlee")
+ if not description_tag:
+ description_tag = result.find(["div", "span"], class_=lambda c: c and any(x in c for x in ["snippet", "description", "VwiC3b"]))
+
+ description = description_tag.get_text(strip=True) if description_tag else ""

- except Exception as e:
- print(f"Error: {e}")
+ # Create result object
+ search_result = SearchResult(link, title, description)
+
+ # Add extra metadata if available
+ citation = result.find("cite")
+ if citation:
+ search_result.metadata["source"] = citation.get_text(strip=True)
+
+ timestamp = result.find("span", class_=lambda c: c and "ZE5qJf" in c)
+ if timestamp:
+ search_result.metadata["date"] = timestamp.get_text(strip=True)

- # Cache results before returning
- self._cache_results(cache_key, results)
+ fetched_links.add(link)
+ results.append(search_result)
+
+ if len(results) >= num_results:
+ break
+
  return results
-
- def get_search_suggestions(self, query: str) -> List[str]:
- """Get search suggestions for a query"""
- params = {
- "client": "chrome",
- "q": query
+
+ def text(
+ self,
+ keywords: str,
+ region: str = None,
+ safesearch: str = "moderate",
+ max_results: int = 10,
+ start_num: int = 0,
+ unique: bool = True
+ ) -> List[SearchResult]:
+ """
+ Search Google for web results.
+
+ Args:
+ keywords: Search query
+ region: Region for search results (ISO country code)
+ safesearch: SafeSearch setting ("on", "moderate", "off")
+ max_results: Maximum number of results to return
+ start_num: Starting position for pagination
+ unique: Filter duplicate results
+
+ Returns:
+ List of SearchResult objects with search results
+ """
+ if not keywords:
+ raise ValueError("Search keywords cannot be empty")
+
+ # Map safesearch values to Google's safe parameter
+ safe_map = {
+ "on": "active",
+ "moderate": "moderate",
+ "off": "off"
  }
- content = self._get_url("GET", "https://suggestqueries.google.com/complete/search",
- params=params)
- suggestions = json.loads(content.decode('utf-8'))[1]
- return suggestions
-
- def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
+ safe = safe_map.get(safesearch.lower(), "moderate")
+
+ # Keep track of unique results
+ fetched_results = []
+ fetched_links = set()
+ start = start_num
+
+ while len(fetched_results) < max_results:
+ response_html = self._make_request(
+ term=keywords,
+ results=max_results - len(fetched_results),
+ start=start
+ )
+
+ results = self._parse_search_results(
+ html=response_html,
+ num_results=max_results - len(fetched_results),
+ fetched_links=fetched_links,
+ unique=unique
+ )
+
+ if not results:
+ break
+
+ fetched_results.extend(results)
+
+ if len(fetched_results) >= max_results:
+ break
+
+ start += 10
+ sleep(self.sleep_interval)
+
+ return fetched_results[:max_results]
+
+ def news(
+ self,
+ keywords: str,
+ region: str = None,
+ safesearch: str = "moderate",
+ max_results: int = 10
+ ) -> List[SearchResult]:
  """
- Extracts visible text from HTML content using Scout parser.
+ Search Google News for news results.
+
+ Args:
+ keywords: Search query
+ region: Region for search results (ISO country code)
+ safesearch: SafeSearch setting ("on", "moderate", "off")
+ max_results: Maximum number of results to return
+
+ Returns:
+ List of SearchResult objects with news results
  """
- soup = Scout(html_content) # Use Scout parser
- for tag in soup(["script", "style", "header", "footer", "nav"]):
- tag.extract()
- visible_text = soup.get_text(strip=True)
- if max_characters:
- visible_text = visible_text[:max_characters]
- return visible_text
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- self.client.close()
- self._executor.shutdown()
+ if not keywords:
+ raise ValueError("Search keywords cannot be empty")
+
+ # Map safesearch values to Google's safe parameter
+ safe_map = {
+ "on": "active",
+ "moderate": "moderate",
+ "off": "off"
+ }
+ safe = safe_map.get(safesearch.lower(), "moderate")
+
+ # Keep track of unique results
+ fetched_results = []
+ fetched_links = set()
+
+ response_html = self._make_request(
+ term=keywords,
+ results=max_results,
+ search_type="nws"
+ )
+
+ results = self._parse_search_results(
+ html=response_html,
+ num_results=max_results,
+ fetched_links=fetched_links,
+ unique=True
+ )
+
+ return results[:max_results]
+
+ def suggestions(self, query: str, region: str = None) -> List[str]:
+ """
+ Get search suggestions for a query term.
+
+ Args:
+ query: Search query
+ region: Region for suggestions (ISO country code)
+
+ Returns:
+ List of search suggestions
+ """
+ if not query:
+ raise ValueError("Search query cannot be empty")
+
+ try:
+ params = {
+ "client": "firefox",
+ "q": query,
+ }
+
+ # Add region if specified
+ if region and region.lower() != "all":
+ params["gl"] = region
+
+ url = f"https://www.google.com/complete/search?{urlencode(params)}"
+
+ headers = {
+ "User-Agent": self._get_useragent(),
+ "Accept": "application/json, text/javascript, */*",
+ "Accept-Language": self.lang,
+ }
+
+ response = get(
+ url=url,
+ headers=headers,
+ timeout=self.timeout,
+ verify=self.verify
+ )
+ response.raise_for_status()
+
+ # Response format is typically: ["original query", ["suggestion1", "suggestion2", ...]]
+ data = response.json()
+ if isinstance(data, list) and len(data) > 1 and isinstance(data[1], list):
+ return data[1]
+ return []
+
+ except Exception as e:
+ # Return empty list on error instead of raising exception
+ return []
+
+
+ # Legacy function support for backward compatibility
+ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=True, region=None, start_num=0, unique=False):
+ """Legacy function for backward compatibility."""
+ google_search = GoogleSearch(
+ timeout=timeout,
+ proxies={"https": proxy, "http": proxy} if proxy else None,
+ verify=ssl_verify,
+ lang=lang,
+ sleep_interval=sleep_interval
+ )
+
+ results = google_search.text(
+ keywords=term,
+ region=region,
+ safesearch="on" if safe == "active" else "moderate" if safe == "moderate" else "off",
+ max_results=num_results,
+ start_num=start_num,
+ unique=unique
+ )
+
+ # Convert to simple URLs if not advanced mode
+ if not advanced:
+ return [result.url for result in results]
+ return results


  if __name__ == "__main__":
  from rich import print
- searcher = GoogleS(rate_limit=3.0)
- results = searcher.search("HelpingAI-9B", max_results=5, extract_text=False, max_text_length=200)
- for result in results:
- print(result)
+ google = GoogleSearch(
+ timeout=10, # Optional: Set custom timeout
+ proxies=None, # Optional: Use proxies
+ verify=True # Optional: SSL verification
+ )
+
+ # Text Search
+ print("TEXT SEARCH RESULTS:")
+ text_results = google.text(
+ keywords="Python programming",
+ region="us", # Optional: Region for results
+ safesearch="moderate", # Optional: "on", "moderate", "off"
+ max_results=3 # Optional: Limit number of results
+ )
+ for result in text_results:
+ print(f"Title: {result.title}")
+ print(f"URL: {result.url}")
+ print(f"Description: {result.description}")
+ print("---")
+
+ # News Search
+ print("\nNEWS SEARCH RESULTS:")
+ news_results = google.news(
+ keywords="artificial intelligence",
+ region="us",
+ safesearch="moderate",
+ max_results=2
+ )
+ for result in news_results:
+ print(f"Title: {result.title}")
+ print(f"URL: {result.url}")
+ print(f"Description: {result.description}")
+ print("---")
+
+ # Search Suggestions
+ print("\nSEARCH SUGGESTIONS:")
+ suggestions = google.suggestions("how to")
+ print(suggestions)
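
For callers migrating from the removed GoogleS class, the rewritten module keeps an object-oriented GoogleSearch interface plus a module-level search() wrapper for the old functional style. A minimal usage sketch based on the definitions in the diff above (query strings and result counts here are illustrative only):

from webscout.DWEBS import GoogleSearch, search

# Object-oriented interface: returns SearchResult objects with .url, .title, .description
engine = GoogleSearch(timeout=10, sleep_interval=1.0)
for hit in engine.text(keywords="Python programming", max_results=5):
    print(hit.url, "-", hit.title)

# Legacy wrapper: returns plain URL strings unless advanced=True
urls = search("Python programming", num_results=5)
print(urls)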