webscout-7.7-py3-none-any.whl → webscout-7.8-py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of webscout might be problematic.
- webscout/AIutel.py +2 -1
- webscout/Bard.py +14 -11
- webscout/DWEBS.py +431 -415
- webscout/Extra/autocoder/__init__.py +9 -9
- webscout/Extra/autocoder/autocoder_utiles.py +332 -194
- webscout/Extra/autocoder/rawdog.py +68 -9
- webscout/Extra/gguf.py +682 -682
- webscout/Provider/AI21.py +1 -1
- webscout/Provider/AISEARCH/DeepFind.py +2 -2
- webscout/Provider/AISEARCH/ISou.py +2 -2
- webscout/Provider/AISEARCH/felo_search.py +6 -6
- webscout/Provider/AISEARCH/genspark_search.py +1 -1
- webscout/Provider/Aitopia.py +292 -0
- webscout/Provider/AllenAI.py +1 -1
- webscout/Provider/Andi.py +3 -3
- webscout/Provider/C4ai.py +1 -1
- webscout/Provider/ChatGPTES.py +3 -5
- webscout/Provider/ChatGPTGratis.py +4 -4
- webscout/Provider/Chatify.py +2 -2
- webscout/Provider/Cloudflare.py +3 -2
- webscout/Provider/DARKAI.py +3 -2
- webscout/Provider/DeepSeek.py +2 -2
- webscout/Provider/Deepinfra.py +1 -1
- webscout/Provider/EDITEE.py +1 -1
- webscout/Provider/ElectronHub.py +178 -96
- webscout/Provider/ExaChat.py +310 -0
- webscout/Provider/Free2GPT.py +2 -2
- webscout/Provider/Gemini.py +5 -19
- webscout/Provider/GithubChat.py +1 -1
- webscout/Provider/Glider.py +4 -4
- webscout/Provider/Groq.py +3 -3
- webscout/Provider/HF_space/qwen_qwen2.py +1 -1
- webscout/Provider/HeckAI.py +1 -1
- webscout/Provider/HuggingFaceChat.py +1 -1
- webscout/Provider/Hunyuan.py +1 -1
- webscout/Provider/Jadve.py +3 -3
- webscout/Provider/Koboldai.py +3 -3
- webscout/Provider/LambdaChat.py +1 -1
- webscout/Provider/Llama.py +3 -5
- webscout/Provider/Llama3.py +4 -12
- webscout/Provider/Marcus.py +3 -3
- webscout/Provider/OLLAMA.py +8 -8
- webscout/Provider/Openai.py +7 -3
- webscout/Provider/PI.py +1 -1
- webscout/Provider/Perplexitylabs.py +1 -1
- webscout/Provider/Phind.py +1 -1
- webscout/Provider/PizzaGPT.py +1 -1
- webscout/Provider/QwenLM.py +4 -7
- webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +3 -1
- webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +3 -3
- webscout/Provider/TTI/ImgSys/__init__.py +23 -0
- webscout/Provider/TTI/ImgSys/async_imgsys.py +202 -0
- webscout/Provider/TTI/ImgSys/sync_imgsys.py +195 -0
- webscout/Provider/TTI/__init__.py +3 -1
- webscout/Provider/TTI/artbit/async_artbit.py +1 -1
- webscout/Provider/TTI/artbit/sync_artbit.py +1 -1
- webscout/Provider/TTI/huggingface/async_huggingface.py +1 -1
- webscout/Provider/TTI/huggingface/sync_huggingface.py +1 -1
- webscout/Provider/TTI/piclumen/__init__.py +22 -22
- webscout/Provider/TTI/piclumen/sync_piclumen.py +232 -232
- webscout/Provider/TTI/pixelmuse/__init__.py +4 -0
- webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +249 -0
- webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +182 -0
- webscout/Provider/TTI/talkai/sync_talkai.py +1 -1
- webscout/Provider/TTS/utils.py +1 -1
- webscout/Provider/TeachAnything.py +1 -1
- webscout/Provider/TextPollinationsAI.py +4 -4
- webscout/Provider/TwoAI.py +1 -2
- webscout/Provider/Venice.py +4 -2
- webscout/Provider/VercelAI.py +234 -0
- webscout/Provider/WebSim.py +3 -2
- webscout/Provider/WiseCat.py +10 -12
- webscout/Provider/Youchat.py +1 -1
- webscout/Provider/__init__.py +10 -0
- webscout/Provider/ai4chat.py +1 -1
- webscout/Provider/aimathgpt.py +2 -6
- webscout/Provider/akashgpt.py +1 -1
- webscout/Provider/askmyai.py +4 -4
- webscout/Provider/asksteve.py +203 -0
- webscout/Provider/bagoodex.py +2 -2
- webscout/Provider/cerebras.py +1 -1
- webscout/Provider/chatglm.py +4 -4
- webscout/Provider/cleeai.py +1 -0
- webscout/Provider/copilot.py +427 -415
- webscout/Provider/elmo.py +1 -1
- webscout/Provider/flowith.py +1 -1
- webscout/Provider/freeaichat.py +57 -31
- webscout/Provider/gaurish.py +3 -5
- webscout/Provider/geminiprorealtime.py +1 -1
- webscout/Provider/granite.py +4 -4
- webscout/Provider/hermes.py +5 -5
- webscout/Provider/julius.py +1 -1
- webscout/Provider/koala.py +1 -1
- webscout/Provider/lepton.py +1 -1
- webscout/Provider/llama3mitril.py +4 -4
- webscout/Provider/llamatutor.py +1 -1
- webscout/Provider/llmchat.py +3 -3
- webscout/Provider/meta.py +1 -1
- webscout/Provider/multichat.py +10 -10
- webscout/Provider/promptrefine.py +1 -1
- webscout/Provider/searchchat.py +293 -0
- webscout/Provider/sonus.py +2 -2
- webscout/Provider/talkai.py +2 -2
- webscout/Provider/turboseek.py +1 -1
- webscout/Provider/tutorai.py +1 -1
- webscout/Provider/typegpt.py +5 -42
- webscout/Provider/uncovr.py +4 -2
- webscout/Provider/x0gpt.py +1 -1
- webscout/__init__.py +36 -36
- webscout/cli.py +293 -332
- webscout/tempid.py +11 -11
- webscout/utils.py +2 -2
- webscout/version.py +1 -1
- webscout/webscout_search.py +1282 -1223
- webscout/webscout_search_async.py +813 -692
- {webscout-7.7.dist-info → webscout-7.8.dist-info}/METADATA +50 -29
- {webscout-7.7.dist-info → webscout-7.8.dist-info}/RECORD +121 -110
- {webscout-7.7.dist-info → webscout-7.8.dist-info}/LICENSE.md +0 -0
- {webscout-7.7.dist-info → webscout-7.8.dist-info}/WHEEL +0 -0
- {webscout-7.7.dist-info → webscout-7.8.dist-info}/entry_points.txt +0 -0
- {webscout-7.7.dist-info → webscout-7.8.dist-info}/top_level.txt +0 -0
webscout/DWEBS.py
CHANGED
@@ -1,461 +1,477 @@
-
-
-
+"""
+DWEBS - A Google search library with advanced features
+"""
+import random
+from time import sleep
 from webscout.scout import Scout
-from
-from
+from requests import get
+from urllib.parse import unquote, urlencode
+from typing import List, Dict, Optional, Union, Iterator, Any
+from concurrent.futures import ThreadPoolExecutor
 
-import time
-import json
-import os
-from datetime import datetime, timedelta
-from functools import lru_cache
-from webscout.Litlogger import Logger, LogFormat
-class GoogleS:
-    """A Python interface for Google search with advanced features
-
-    The GoogleS class provides a powerful interface to perform web searches, image searches,
-    and advanced filtering on Google. Built with love by HAI to keep it
-
-    Basic Usage:
-        >>> from webscout.DWEBS import GoogleS
-        >>> searcher = GoogleS()
-        >>> # Simple web search
-        >>> results = searcher.search("Python programming")
-        >>> for result in results:
-        ...     print(f"Title: {result['title']}")
-        ...     print(f"URL: {result['href']}")
-        ...     print(f"Description: {result['abstract']}")
-
-    Advanced Web Search:
-        >>> # Search with filters
-        >>> results = searcher.search(
-        ...     query="Python tutorials",
-        ...     site="github.com",
-        ...     file_type="pdf",
-        ...     time_period="month",
-        ...     max_results=5
-        ... )
-        >>> # Example response format:
-        >>> {
-        ...     'title': 'Python Tutorial',
-        ...     'href': 'https://example.com/python-tutorial',
-        ...     'abstract': 'Comprehensive Python tutorial covering basics to advanced topics',
-        ...     'index': 0,
-        ...     'type': 'web',
-        ...     'visible_text': ''  # Optional: Contains webpage text if extract_text=True
-        ... }
-
-    Image Search:
-        >>> # Search for images
-        >>> images = searcher.search_images(
-        ...     query="cute puppies",
-        ...     size="large",
-        ...     color="color",
-        ...     type_filter="photo",
-        ...     max_results=5
-        ... )
-        >>> # Example response format:
-        >>> {
-        ...     'title': 'Cute Puppy Image',
-        ...     'thumbnail': 'https://example.com/puppy-thumb.jpg',
-        ...     'full_url': 'https://example.com/puppy-full.jpg',
-        ...     'type': 'image'
-        ... }
-
-    Features:
-        - Web Search: Get detailed web results with title, URL, and description
-        - Image Search: Find images with thumbnails and full-resolution URLs
-        - Advanced Filters: Site-specific search, file types, time periods
-        - Rate Limiting: Smart request handling to avoid blocks
-        - Caching: Save results for faster repeat searches
-        - Retry Logic: Automatic retry on temporary failures
-        - Logging: Optional LitLogger integration for beautiful console output
-        - Proxy Support: Use custom proxies for requests
-        - Concurrent Processing: Multi-threaded requests for better performance
-
-    Response Format:
-        Web Search Results:
-        {
-            'title': str,        # Title of the webpage
-            'href': str,         # URL of the webpage
-            'abstract': str,     # Brief description or snippet
-            'index': int,        # Result position
-            'type': 'web',       # Result type identifier
-            'visible_text': str  # Full page text (if extract_text=True)
-        }
-
-        Image Search Results:
-        {
-            'title': str,      # Image title or description
-            'thumbnail': str,  # Thumbnail image URL
-            'full_url': str,   # Full resolution image URL
-            'type': 'image'    # Result type identifier
-        }
-    """
-
-    SEARCH_TYPES = {
-        "web": "https://www.google.com/search",
-        "image": "https://www.google.com/images",
-        "news": "https://www.google.com/news",
-    }
 
+class SearchResult:
+    """Class to represent a search result with metadata."""
+
+    def __init__(self, url: str, title: str, description: str):
+        """
+        Initialize a search result.
+
+        Args:
+            url: The URL of the search result
+            title: The title of the search result
+            description: The description/snippet of the search result
+        """
+        self.url = url
+        self.title = title
+        self.description = description
+        # Additional metadata that can be populated
+        self.metadata: Dict[str, Any] = {}
+
+    def __repr__(self) -> str:
+        """Return string representation of search result."""
+        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
+
+
+class GoogleSearch:
+    """Google search implementation with configurable parameters and advanced features."""
+
+    _executor: ThreadPoolExecutor = ThreadPoolExecutor()
+
     def __init__(
         self,
-
-
-
-
-
-        rate_limit: float = 2.0,
+        timeout: int = 10,
+        proxies: Optional[Dict[str, str]] = None,
+        verify: bool = True,
+        lang: str = "en",
+        sleep_interval: float = 0.0
     ):
         """
-        Initialize
+        Initialize GoogleSearch with custom settings.
 
         Args:
-
-
+            timeout: Request timeout in seconds
+            proxies: Proxy configuration for requests
+            verify: Whether to verify SSL certificates
+            lang: Search language
+            sleep_interval: Sleep time between pagination requests
         """
-        self.proxy = proxy
-        self.headers = headers if headers else {
-            "User-Agent": LitAgent().random()  # Use LitAgent to generate user agent
-        }
-        self.headers["Referer"] = "https://www.google.com/"
-        self.client = requests.Session()
-        self.client.headers.update(self.headers)
-        if proxy:
-            self.client.proxies.update({"http": proxy, "https": proxy})
         self.timeout = timeout
-        self.
-        self.
-
-
-        self.
-
-
-    def _respect_rate_limit(self):
-        """Ensure minimum time between requests"""
-        current_time = time.time()
-        time_since_last = current_time - self.last_request_time
-        if time_since_last < self.rate_limit:
-            sleep_time = self.rate_limit - time_since_last
-            time.sleep(sleep_time)
-        self.last_request_time = time.time()
-
-    def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
-                 data: Optional[Union[Dict[str, str], bytes]] = None, max_retries: int = 3) -> bytes:
+        self.proxies = proxies if proxies else {}
+        self.verify = verify
+        self.lang = lang
+        self.sleep_interval = sleep_interval
+        self.base_url = "https://www.google.com/search"
+
+    def _get_useragent(self) -> str:
         """
-
-
-        Args:
-            method (str): HTTP method (GET, POST, etc.)
-            url (str): Target URL
-            params (Optional[Dict[str, str]]): Query parameters
-            data (Optional[Union[Dict[str, str], bytes]]): Request payload
-            max_retries (int): Maximum number of retry attempts
+        Generate a random user agent string.
 
         Returns:
-
+            Random user agent string
+        """
+        lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
+        libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
+        ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
+        openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
+        return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"
+
+    def _make_request(self, term: str, results: int, start: int = 0, search_type: str = None) -> str:
         """
-
-        base_delay = 5  # Base delay in seconds
+        Make a request to Google search.
 
-        while retry_count < max_retries:
-            try:
-                self._respect_rate_limit()
-                response = self.client.request(
-                    method=method,
-                    url=url,
-                    params=params,
-                    data=data,
-                    timeout=self.timeout
-                )
-
-                if response.status_code == 429:
-                    retry_delay = base_delay * (2 ** retry_count)  # Exponential backoff
-                    time.sleep(retry_delay)
-                    retry_count += 1
-                    continue
-
-                response.raise_for_status()
-                return response.content
-
-            except requests.exceptions.RequestException as e:
-                if retry_count == max_retries - 1:
-                    raise
-
-                retry_delay = base_delay * (2 ** retry_count)
-                time.sleep(retry_delay)
-                retry_count += 1
-
-        raise Exception("Max retries reached")
-
-    @lru_cache(maxsize=100)
-    def _cache_key(self, query: str, **kwargs) -> str:
-        """Generate a cache key from search parameters"""
-        cache_data = {'query': query, **kwargs}
-        return json.dumps(cache_data, sort_keys=True)
-
-    def _get_cached_results(self, cache_key: str) -> Optional[List[Dict[str, Any]]]:
-        """Retrieve cached results if they exist and are not expired"""
-        if not self.cache_dir:
-            return None
-        cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
-        if os.path.exists(cache_file):
-            with open(cache_file, 'r') as f:
-                cached_data = json.load(f)
-                if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
-                    return cached_data['results']
-        return None
-
-    def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
-        """Cache search results"""
-        if not self.cache_dir:
-            return
-        cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
-        with open(cache_file, 'w') as f:
-            json.dump({
-                'timestamp': datetime.now().isoformat(),
-                'results': results
-            }, f)
-
-    def search_images(
-        self,
-        query: str,
-        max_results: int = 10,
-        size: Optional[str] = None,
-        color: Optional[str] = None,
-        type_filter: Optional[str] = None,
-        **kwargs
-    ) -> List[Dict[str, str]]:
-        """Search for images on Google with style!
-
         Args:
-
-
-
-
-
-                - 'icon': Small icons
-            color (Optional[str]): Color filter
-                - 'color': Full color
-                - 'gray': Black and white
-                - 'transparent': Transparent background
-            type_filter (Optional[str]): Type of image
-                - 'face': Just faces
-                - 'photo': Real photos
-                - 'clipart': Vector art
-                - 'lineart': Line drawings
-
+            term: Search query
+            results: Number of results to request
+            start: Start position for pagination
+            search_type: Type of search ('', 'nws', 'isch')
+
         Returns:
-
-                - 'thumbnail': Small preview URL
-                - 'full_url': Full resolution image URL
-                - 'title': Image title/description
-                - 'type': Always 'image'
-
-        Example:
-            >>> searcher = GoogleS()
-            >>> # Find some cool nature pics
-            >>> images = searcher.search_images(
-            ...     query="beautiful landscapes",
-            ...     size="large",
-            ...     color="color",
-            ...     max_results=5
-            ... )
-            >>> for img in images:
-            ...     print(f"Found: {img['title']}")
-            ...     print(f"URL: {img['full_url']}")
+            HTML response content
         """
         params = {
-            "q":
-            "
-            "
+            "q": term,
+            "num": results + 2,  # Request slightly more than needed
+            "hl": self.lang,
+            "start": start,
         }
 
-        if
-
-
-
-
-
-
-
+        # Add search type if specified
+        if search_type:
+            params["tbm"] = search_type
+
+        try:
+            resp = get(
+                url=self.base_url,
+                headers={
+                    "User-Agent": self._get_useragent(),
+                    "Accept-Language": self.lang,
+                    "Accept-Encoding": "gzip, deflate, br",
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+                },
+                params=params,
+                proxies=self.proxies if any(self.proxies) else None,
+                timeout=self.timeout,
+                verify=self.verify,
+                cookies={
+                    'CONSENT': 'PENDING+987',
+                    'SOCS': 'CAESHAgBEhIaAB',
+                }
+            )
+            resp.raise_for_status()
+            return resp.text
+        except Exception as e:
+            raise RuntimeError(f"Search request failed: {str(e)}")
+
+    def _extract_url(self, raw_link: str) -> Optional[str]:
+        """
+        Extract actual URL from Google redirect URL.
 
-
-
-        if len(results) >= max_results:
-            break
+        Args:
+            raw_link: Raw link from Google search
 
-
-
-
-
-
+        Returns:
+            Actual URL or None if invalid
+        """
+        if not raw_link:
+            return None
+
+        if raw_link.startswith("/url?"):
+            try:
+                link = unquote(raw_link.split("&")[0].replace("/url?q=", ""))
+                return link
+            except Exception:
+                return None
+        elif raw_link.startswith("http"):
+            return unquote(raw_link)
+
+        return None
+
+    def _is_valid_result(self, link: str, fetched_links: set, unique: bool) -> bool:
+        """
+        Check if search result is valid.
+
+        Args:
+            link: URL to check
+            fetched_links: Set of already fetched links
+            unique: Whether to filter duplicate links
 
-
-
-
-
+        Returns:
+            Boolean indicating if result is valid
+        """
+        if any(x in link for x in ["google.", "/search?", "webcache."]):
+            return False
 
-
+        if link in fetched_links and unique:
+            return False
 
-        return
-
-    def
+        return True
+
+    def _parse_search_results(
         self,
-
-
-
-
-
-        max_results: int = 10,
-        extract_text: bool = False,
-        max_text_length: Optional[int] = 100,
-        site: Optional[str] = None,  # Search within specific site
-        file_type: Optional[str] = None,  # Filter by file type
-        sort_by: str = "relevance",  # relevance, date
-        exclude_terms: Optional[List[str]] = None,  # Terms to exclude
-        exact_phrase: Optional[str] = None,  # Exact phrase match
-    ) -> List[Dict[str, Union[str, int]]]:
+        html: str,
+        num_results: int,
+        fetched_links: set,
+        unique: bool
+    ) -> List[SearchResult]:
         """
-
+        Parse search results from HTML.
 
         Args:
-
-
-
-
-
+            html: HTML content to parse
+            num_results: Maximum number of results to return
+            fetched_links: Set of already fetched links
+            unique: Filter duplicate links
+
+        Returns:
+            List of SearchResult objects
         """
-        # Build advanced query
-        advanced_query = query
-        if site:
-            advanced_query += f" site:{site}"
-        if file_type:
-            advanced_query += f" filetype:{file_type}"
-        if exclude_terms:
-            advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
-        if exact_phrase:
-            advanced_query = f'"{exact_phrase}"' + advanced_query
-
-        # Check cache first
-        cache_key = self._cache_key(advanced_query, region=region, language=language,
-                                    safe=safe, time_period=time_period, sort_by=sort_by)
-        cached_results = self._get_cached_results(cache_key)
-        if cached_results:
-            return cached_results[:max_results]
-
-        # Continue with regular search implementation...
         results = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                resp_content = future.result()
-                soup = Scout(resp_content)  # Use Scout parser
-
-                result_blocks = soup.find_all("div", class_="g")
-
-                if not result_blocks:
-                    break
-
-                # Extract links and titles first
-                for result_block in result_blocks:
-                    link = result_block.find("a", href=True)
-                    title = result_block.find("h3")
-                    description_box = result_block.find(
-                        "div", {"style": "-webkit-line-clamp:2"}
-                    )
+        soup = Scout(html, features="html.parser")
+        result_blocks = soup.find_all("div", class_="ezO2md")
+
+        if not result_blocks:
+            # Try alternative class patterns if the main one doesn't match
+            result_blocks = soup.find_all("div", attrs={"class": lambda c: c and "g" in c.split()})
+
+        for result in result_blocks:
+            # Find the link - looking for various potential Google result classes
+            link_tag = result.find("a", class_=["fuLhoc", "ZWRArf"])
+            if not link_tag:
+                link_tag = result.find("a")
+            if not link_tag:
+                continue
+
+            raw_link = link_tag.get("href", "")
+            link = self._extract_url(raw_link)
+
+            if not link:
+                continue
 
-
-
-                    results.append({
-                        "title": title.text,
-                        "href": url,
-                        "abstract": description_box.text,
-                        "index": len(results),
-                        "type": "web",
-                        "visible_text": ""  # Initialize visible_text as empty string
-                    })
+            if not self._is_valid_result(link, fetched_links, unique):
+                continue
 
-
-
+            # Get title - it's the text content of the link tag for these results
+            title = link_tag.get_text(strip=True)
+            if not title:
+                continue
 
-
-
-
-
-
-
-                                   max_characters=max_text_length)
-                for result in results
-                if 'href' in result
-            ]
-            for i, future in enumerate(as_completed(extraction_futures)):
-                try:
-                    results[i]['visible_text'] = future.result()
-                except Exception as e:
-                    print(f"Error extracting text: {e}")
+            # Get description - it's in a span with class FrIlee or potentially other classes
+            description_tag = result.find("span", class_="FrIlee")
+            if not description_tag:
+                description_tag = result.find(["div", "span"], class_=lambda c: c and any(x in c for x in ["snippet", "description", "VwiC3b"]))
+
+            description = description_tag.get_text(strip=True) if description_tag else ""
 
-
-
+            # Create result object
+            search_result = SearchResult(link, title, description)
+
+            # Add extra metadata if available
+            citation = result.find("cite")
+            if citation:
+                search_result.metadata["source"] = citation.get_text(strip=True)
+
+            timestamp = result.find("span", class_=lambda c: c and "ZE5qJf" in c)
+            if timestamp:
+                search_result.metadata["date"] = timestamp.get_text(strip=True)
 
-
-
+            fetched_links.add(link)
+            results.append(search_result)
+
+            if len(results) >= num_results:
+                break
+
         return results
-
-    def
-
-
-
-
+
+    def text(
+        self,
+        keywords: str,
+        region: str = None,
+        safesearch: str = "moderate",
+        max_results: int = 10,
+        start_num: int = 0,
+        unique: bool = True
+    ) -> List[SearchResult]:
+        """
+        Search Google for web results.
+
+        Args:
+            keywords: Search query
+            region: Region for search results (ISO country code)
+            safesearch: SafeSearch setting ("on", "moderate", "off")
+            max_results: Maximum number of results to return
+            start_num: Starting position for pagination
+            unique: Filter duplicate results
+
+        Returns:
+            List of SearchResult objects with search results
+        """
+        if not keywords:
+            raise ValueError("Search keywords cannot be empty")
+
+        # Map safesearch values to Google's safe parameter
+        safe_map = {
+            "on": "active",
+            "moderate": "moderate",
+            "off": "off"
         }
-
-
-
-
-
-
+        safe = safe_map.get(safesearch.lower(), "moderate")
+
+        # Keep track of unique results
+        fetched_results = []
+        fetched_links = set()
+        start = start_num
+
+        while len(fetched_results) < max_results:
+            response_html = self._make_request(
+                term=keywords,
+                results=max_results - len(fetched_results),
+                start=start
+            )
+
+            results = self._parse_search_results(
+                html=response_html,
+                num_results=max_results - len(fetched_results),
+                fetched_links=fetched_links,
+                unique=unique
+            )
+
+            if not results:
+                break
+
+            fetched_results.extend(results)
+
+            if len(fetched_results) >= max_results:
+                break
+
+            start += 10
+            sleep(self.sleep_interval)
+
+        return fetched_results[:max_results]
+
+    def news(
+        self,
+        keywords: str,
+        region: str = None,
+        safesearch: str = "moderate",
+        max_results: int = 10
+    ) -> List[SearchResult]:
         """
-
+        Search Google News for news results.
+
+        Args:
+            keywords: Search query
+            region: Region for search results (ISO country code)
+            safesearch: SafeSearch setting ("on", "moderate", "off")
+            max_results: Maximum number of results to return
+
+        Returns:
+            List of SearchResult objects with news results
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not keywords:
+            raise ValueError("Search keywords cannot be empty")
+
+        # Map safesearch values to Google's safe parameter
+        safe_map = {
+            "on": "active",
+            "moderate": "moderate",
+            "off": "off"
+        }
+        safe = safe_map.get(safesearch.lower(), "moderate")
+
+        # Keep track of unique results
+        fetched_results = []
+        fetched_links = set()
+
+        response_html = self._make_request(
+            term=keywords,
+            results=max_results,
+            search_type="nws"
+        )
+
+        results = self._parse_search_results(
+            html=response_html,
+            num_results=max_results,
+            fetched_links=fetched_links,
+            unique=True
+        )
+
+        return results[:max_results]
+
+    def suggestions(self, query: str, region: str = None) -> List[str]:
+        """
+        Get search suggestions for a query term.
+
+        Args:
+            query: Search query
+            region: Region for suggestions (ISO country code)
+
+        Returns:
+            List of search suggestions
+        """
+        if not query:
+            raise ValueError("Search query cannot be empty")
+
+        try:
+            params = {
+                "client": "firefox",
+                "q": query,
+            }
+
+            # Add region if specified
+            if region and region.lower() != "all":
+                params["gl"] = region
+
+            url = f"https://www.google.com/complete/search?{urlencode(params)}"
+
+            headers = {
+                "User-Agent": self._get_useragent(),
+                "Accept": "application/json, text/javascript, */*",
+                "Accept-Language": self.lang,
+            }
+
+            response = get(
+                url=url,
+                headers=headers,
+                timeout=self.timeout,
+                verify=self.verify
+            )
+            response.raise_for_status()
+
+            # Response format is typically: ["original query", ["suggestion1", "suggestion2", ...]]
+            data = response.json()
+            if isinstance(data, list) and len(data) > 1 and isinstance(data[1], list):
+                return data[1]
+            return []
+
+        except Exception as e:
+            # Return empty list on error instead of raising exception
+            return []
+
+
+# Legacy function support for backward compatibility
+def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=True, region=None, start_num=0, unique=False):
+    """Legacy function for backward compatibility."""
+    google_search = GoogleSearch(
+        timeout=timeout,
+        proxies={"https": proxy, "http": proxy} if proxy else None,
+        verify=ssl_verify,
+        lang=lang,
+        sleep_interval=sleep_interval
+    )
+
+    results = google_search.text(
+        keywords=term,
+        region=region,
+        safesearch="on" if safe == "active" else "moderate" if safe == "moderate" else "off",
+        max_results=num_results,
+        start_num=start_num,
+        unique=unique
+    )
+
+    # Convert to simple URLs if not advanced mode
+    if not advanced:
+        return [result.url for result in results]
+    return results
 
 
 if __name__ == "__main__":
     from rich import print
-
-
-
-
+    google = GoogleSearch(
+        timeout=10,  # Optional: Set custom timeout
+        proxies=None,  # Optional: Use proxies
+        verify=True  # Optional: SSL verification
+    )
+
+    # Text Search
+    print("TEXT SEARCH RESULTS:")
+    text_results = google.text(
+        keywords="Python programming",
+        region="us",  # Optional: Region for results
+        safesearch="moderate",  # Optional: "on", "moderate", "off"
+        max_results=3  # Optional: Limit number of results
+    )
+    for result in text_results:
+        print(f"Title: {result.title}")
+        print(f"URL: {result.url}")
+        print(f"Description: {result.description}")
+        print("---")
+
+    # News Search
+    print("\nNEWS SEARCH RESULTS:")
+    news_results = google.news(
+        keywords="artificial intelligence",
+        region="us",
+        safesearch="moderate",
+        max_results=2
+    )
+    for result in news_results:
+        print(f"Title: {result.title}")
+        print(f"URL: {result.url}")
+        print(f"Description: {result.description}")
+        print("---")
+
+    # Search Suggestions
+    print("\nSEARCH SUGGESTIONS:")
+    suggestions = google.suggestions("how to")
+    print(suggestions)
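
Net effect of this rewrite: the session-based GoogleS class (rate limiting, on-disk caching, retry logic, image search) is removed, and 7.8 ships a leaner GoogleSearch that fetches result pages with a plain requests get(), parses them with Scout, and returns SearchResult objects instead of dicts; a module-level search() wrapper preserves the old call style. A minimal usage sketch against the 7.8 API, based only on the added lines above (the try/except around the call is illustrative, not part of the diff):

# Sketch of the webscout 7.8 DWEBS call surface, reconstructed from the diff.
from webscout.DWEBS import GoogleSearch, search  # search() is the legacy wrapper

google = GoogleSearch(timeout=10, lang="en")

# New-style call: returns SearchResult objects with .url / .title / .description.
try:
    for result in google.text(keywords="Python programming", max_results=3):
        print(result.title, "->", result.url)
except RuntimeError as exc:  # _make_request wraps request failures in RuntimeError
    print(f"Search failed: {exc}")

# Legacy wrapper: returns bare URL strings unless advanced=True.
urls = search("Python programming", num_results=3)
print(urls)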
|