webscout 6.2b0__py3-none-any.whl → 6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic; consult the registry's advisory listing for details.

webscout/DWEBS.py CHANGED
@@ -1,179 +1,323 @@
1
- from bs4 import BeautifulSoup
2
- import requests
3
- from typing import Dict, List, Optional, Union
4
- from concurrent.futures import ThreadPoolExecutor, as_completed
5
- from urllib.parse import quote
6
- from termcolor import colored
7
- import time
8
- import random
9
-
10
- class GoogleS:
11
- """
12
- Class to perform Google searches and retrieve results.
13
- """
14
-
15
- def __init__(
16
- self,
17
- headers: Optional[Dict[str, str]] = None,
18
- proxy: Optional[str] = None,
19
- timeout: Optional[int] = 10,
20
- max_workers: int = 20 # Increased max workers for thread pool
21
- ):
22
- """Initializes the GoogleS object."""
23
- self.proxy = proxy
24
- self.headers = headers if headers else {
25
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"
26
- }
27
- self.headers["Referer"] = "https://www.google.com/"
28
- self.client = requests.Session()
29
- self.client.headers.update(self.headers)
30
- self.client.proxies.update({"http": self.proxy, "https": self.proxy})
31
- self.timeout = timeout
32
- self._executor = ThreadPoolExecutor(max_workers=max_workers)
33
-
34
- def __enter__(self):
35
- return self
36
-
37
- def __exit__(self, exc_type, exc_val, exc_tb):
38
- self.client.close()
39
-
40
- def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
41
- data: Optional[Union[Dict[str, str], bytes]] = None) -> bytes:
42
- """
43
- Makes an HTTP request and returns the response content.
44
- """
45
- try:
46
- resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
47
- except Exception as ex:
48
- raise Exception(f"{url} {type(ex).__name__}: {ex}") from ex
49
- if resp.status_code == 200:
50
- return resp.content
51
- raise Exception(f"{resp.url} returned status code {resp.status_code}. {params=} {data=}")
52
-
53
- def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
54
- """
55
- Extracts visible text from HTML content using lxml parser.
56
- """
57
- soup = BeautifulSoup(html_content, 'lxml') # Use lxml parser
58
- for tag in soup(["script", "style", "header", "footer", "nav"]):
59
- tag.extract()
60
- visible_text = soup.get_text(strip=True)
61
- if max_characters:
62
- visible_text = visible_text[:max_characters]
63
- return visible_text
64
-
65
- def search(
66
- self,
67
- query: str,
68
- region: str = "us-en",
69
- language: str = "en",
70
- safe: str = "off",
71
- time_period: Optional[str] = None,
72
- max_results: int = 10,
73
- extract_text: bool = False,
74
- max_text_length: Optional[int] = 100,
75
- ) -> List[Dict[str, Union[str, int]]]:
76
- """
77
- Performs a Google search and returns the results.
78
-
79
- Args:
80
- query (str): The search query.
81
- region (str, optional): The region to search in (e.g., "us-en"). Defaults to "us-en".
82
- language (str, optional): The language of the search results (e.g., "en"). Defaults to "en".
83
- safe (str, optional): Safe search setting ("off", "active"). Defaults to "off".
84
- time_period (Optional[str], optional): Time period filter (e.g., "h" for past hour, "d" for past day).
85
- Defaults to None.
86
- max_results (int, optional): The maximum number of results to retrieve. Defaults to 10.
87
- extract_text (bool, optional): Whether to extract text from the linked web pages. Defaults to False.
88
- max_text_length (Optional[int], optional): The maximum length of the extracted text (in characters).
89
- Defaults to 100.
90
-
91
- Returns:
92
- List[Dict[str, Union[str, int]]]: A list of dictionaries, each representing a search result, containing:
93
- - 'title': The title of the result.
94
- - 'href': The URL of the result.
95
- - 'abstract': The description snippet of the result.
96
- - 'index': The index of the result in the list.
97
- - 'type': The type of result (currently always "web").
98
- - 'visible_text': The extracted text from the web page (if `extract_text` is True).
99
- """
100
- assert query, "Query cannot be empty."
101
-
102
- results = []
103
- futures = []
104
- start = 0
105
-
106
- while len(results) < max_results:
107
- params = {
108
- "q": query,
109
- "num": 10,
110
- "hl": language,
111
- "start": start,
112
- "safe": safe,
113
- "gl": region,
114
- }
115
- if time_period:
116
- params["tbs"] = f"qdr:{time_period}"
117
-
118
- futures.append(self._executor.submit(self._get_url, "GET", "https://www.google.com/search", params=params))
119
- start += 10
120
-
121
- for future in as_completed(futures):
122
- try:
123
- resp_content = future.result()
124
- soup = BeautifulSoup(resp_content, 'lxml') # Use lxml parser
125
- result_blocks = soup.find_all("div", class_="g")
126
-
127
- if not result_blocks:
128
- break
129
-
130
- # Extract links and titles first
131
- for result_block in result_blocks:
132
- link = result_block.find("a", href=True)
133
- title = result_block.find("h3")
134
- description_box = result_block.find(
135
- "div", {"style": "-webkit-line-clamp:2"}
136
- )
137
-
138
- if link and title and description_box:
139
- url = link["href"]
140
- results.append({
141
- "title": title.text,
142
- "href": url,
143
- "abstract": description_box.text,
144
- "index": len(results),
145
- "type": "web",
146
- "visible_text": "" # Initialize visible_text as empty string
147
- })
148
-
149
- if len(results) >= max_results:
150
- break # Stop if we have enough results
151
-
152
- # Parallelize text extraction if needed
153
- if extract_text:
154
- with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
155
- extraction_futures = [
156
- text_extractor.submit(self._extract_text_from_webpage,
157
- self._get_url("GET", result['href']),
158
- max_characters=max_text_length)
159
- for result in results
160
- if 'href' in result
161
- ]
162
- for i, future in enumerate(as_completed(extraction_futures)):
163
- try:
164
- results[i]['visible_text'] = future.result()
165
- except Exception as e:
166
- print(f"Error extracting text: {e}")
167
-
168
- except Exception as e:
169
- print(f"Error: {e}")
170
-
171
- return results
172
-
173
-
174
- if __name__ == "__main__":
175
- from rich import print
176
- searcher = GoogleS()
177
- results = searcher.search("HelpingAI-9B", max_results=20, extract_text=False, max_text_length=200)
178
- for result in results:
1
+ from bs4 import BeautifulSoup
2
+ import requests
3
+ from typing import Dict, List, Optional, Union, Any
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+ from urllib.parse import quote, urljoin
6
+ from termcolor import colored
7
+ import time
8
+ import random
9
+ import json
10
+ import os
11
+ from datetime import datetime, timedelta
12
+ from functools import lru_cache
13
+ import logging
14
+ from tenacity import retry, stop_after_attempt, wait_exponential
15
+
16
class GoogleS:
    """
    Enhanced Google Search client supporting web search, image search,
    search suggestions, on-disk result caching, rate limiting, and
    retried HTTP requests.
    """

    # Endpoint map for the supported search verticals.
    SEARCH_TYPES = {
        "web": "https://www.google.com/search",
        "image": "https://www.google.com/images",
        "news": "https://www.google.com/news",
    }

    def __init__(
        self,
        headers: Optional[Dict[str, str]] = None,
        proxy: Optional[str] = None,
        timeout: Optional[int] = 10,
        max_workers: int = 20,
        cache_dir: Optional[str] = None,
        rate_limit: float = 0.01
    ):
        """
        Initialize the GoogleS object.

        Args:
            headers: Optional HTTP headers; a desktop User-Agent is used by default.
            proxy: Optional proxy URL applied to both http and https.
            timeout: Per-request timeout in seconds.
            max_workers: Thread-pool size for parallel page fetches.
            cache_dir: Directory to store search-result cache (created if missing).
            rate_limit: Minimum time between requests, in seconds.
        """
        self.proxy = proxy
        self.headers = headers if headers else {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
        }
        self.headers["Referer"] = "https://www.google.com/"
        self.client = requests.Session()
        self.client.headers.update(self.headers)
        if proxy:
            self.client.proxies.update({"http": proxy, "https": proxy})
        self.timeout = timeout
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self.cache_dir = cache_dir
        if cache_dir:
            # exist_ok avoids a race with concurrent instances.
            os.makedirs(cache_dir, exist_ok=True)
        self.last_request_time = 0
        self.rate_limit = rate_limit

        # NOTE(review): basicConfig has process-wide effect; kept for
        # backward compatibility, but a library would normally configure
        # only its own logger.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def _respect_rate_limit(self):
        """Sleep just long enough to keep requests ``rate_limit`` seconds apart."""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        if time_since_last < self.rate_limit:
            time.sleep(self.rate_limit - time_since_last)
        self.last_request_time = time.time()

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
                 data: Optional[Union[Dict[str, str], bytes]] = None) -> bytes:
        """
        Make an HTTP request with retry logic and rate limiting.

        Returns:
            The raw response body.

        Raises:
            requests.exceptions.RequestException: on network or HTTP errors
                (after the tenacity retries are exhausted).
        """
        self._respect_rate_limit()
        try:
            resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
            resp.raise_for_status()
            return resp.content
        except requests.exceptions.RequestException as ex:
            self.logger.error(f"Request failed: {url} - {str(ex)}")
            raise

    def _cache_key(self, query: str, **kwargs) -> str:
        """
        Generate a filesystem-safe cache key from the search parameters.

        The previous version returned raw JSON (which contains characters
        that are invalid in filenames, e.g. ``"`` and ``:``) and wrapped
        the method in ``lru_cache``, which keeps every instance alive for
        the cache's lifetime (ruff B019).  Hashing the canonical JSON
        fixes both.
        """
        import hashlib  # local import: keeps the module import block untouched
        cache_data = json.dumps({'query': query, **kwargs}, sort_keys=True)
        return hashlib.sha256(cache_data.encode('utf-8')).hexdigest()

    def _get_cached_results(self, cache_key: str) -> Optional[List[Dict[str, Any]]]:
        """Return cached results if present and younger than 24 hours, else None."""
        if not self.cache_dir:
            return None
        cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
        if os.path.exists(cache_file):
            try:
                with open(cache_file, 'r') as f:
                    cached_data = json.load(f)
                if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
                    return cached_data['results']
            except (json.JSONDecodeError, KeyError, ValueError) as e:
                # A corrupt cache entry should fall through to a live search,
                # not crash the caller.
                self.logger.warning(f"Ignoring unreadable cache file {cache_file}: {e}")
        return None

    def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
        """Persist search results (with a timestamp) for later reuse."""
        if not self.cache_dir:
            return
        cache_file = os.path.join(self.cache_dir, f"{cache_key}.json")
        with open(cache_file, 'w') as f:
            json.dump({
                'timestamp': datetime.now().isoformat(),
                'results': results
            }, f)

    def search_images(
        self,
        query: str,
        max_results: int = 10,
        size: Optional[str] = None,  # large, medium, icon
        color: Optional[str] = None,  # color, gray, transparent
        type_filter: Optional[str] = None,  # face, photo, clipart, lineart
        **kwargs
    ) -> List[Dict[str, str]]:
        """
        Perform an image search and return thumbnail/title records.

        Args:
            size: Filter by image size.
            color: Filter by color.
            type_filter: Filter by image type.
        """
        params = {
            "q": query,
            "tbm": "isch",
            "num": max_results
        }

        # Combine the tbs filters.  The previous version assigned
        # params["tbs"] once per filter, so each filter clobbered the
        # one before it; Google expects them comma-joined.
        tbs_filters = []
        if size:
            tbs_filters.append(f"isz:{size}")
        if color:
            tbs_filters.append(f"ic:{color}")
        if type_filter:
            tbs_filters.append(f"itp:{type_filter}")
        if tbs_filters:
            params["tbs"] = ",".join(tbs_filters)

        content = self._get_url("GET", self.SEARCH_TYPES["image"], params=params)
        soup = BeautifulSoup(content, 'lxml')

        results = []
        for img in soup.find_all("img", class_="rg_i"):
            if len(results) >= max_results:
                break

            img_data = {
                "thumbnail": img.get("src", ""),
                "title": img.get("alt", ""),
                "type": "image"
            }

            # Extract full-resolution image URL if the thumbnail is wrapped
            # in a link.
            parent = img.parent
            if parent and parent.get("href"):
                img_data["full_url"] = urljoin("https://www.google.com", parent["href"])

            results.append(img_data)

        return results

    def search(
        self,
        query: str,
        region: str = "us-en",
        language: str = "en",
        safe: str = "off",
        time_period: Optional[str] = None,
        max_results: int = 10,
        extract_text: bool = False,
        max_text_length: Optional[int] = 100,
        site: Optional[str] = None,  # Search within specific site
        file_type: Optional[str] = None,  # Filter by file type
        sort_by: str = "relevance",  # relevance, date
        exclude_terms: Optional[List[str]] = None,  # Terms to exclude
        exact_phrase: Optional[str] = None,  # Exact phrase match
    ) -> List[Dict[str, Union[str, int]]]:
        """
        Perform a web search with advanced query filters.

        Args:
            site: Limit search to a specific website.
            file_type: Filter by file type (pdf, doc, etc.).
            sort_by: Sort results by relevance or date.
            exclude_terms: List of terms to exclude from the search.
            exact_phrase: Exact phrase to match.

        Returns:
            A list of dicts with keys 'title', 'href', 'abstract', 'index',
            'type', and 'visible_text' (filled only when extract_text=True).
        """
        # Build the advanced query string.
        advanced_query = query
        if site:
            advanced_query += f" site:{site}"
        if file_type:
            advanced_query += f" filetype:{file_type}"
        if exclude_terms:
            advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
        if exact_phrase:
            # Fix: the previous version concatenated with no separator,
            # producing e.g. '"phrase"query site:example.com'.
            advanced_query = f'"{exact_phrase}" {advanced_query}'

        # Serve from cache when possible.
        cache_key = self._cache_key(advanced_query, region=region, language=language,
                                    safe=safe, time_period=time_period, sort_by=sort_by)
        cached_results = self._get_cached_results(cache_key)
        if cached_results:
            return cached_results[:max_results]

        results: List[Dict[str, Union[str, int]]] = []

        # Submit one request per 10-result page up front.  The previous
        # version looped `while len(results) < max_results` around the
        # submissions, but `results` was only populated afterwards, so
        # that loop could spin submitting requests forever.
        num_pages = (max_results + 9) // 10
        futures = []
        for page in range(num_pages):
            params = {
                "q": advanced_query,
                "num": 10,
                "hl": language,
                "start": page * 10,
                "safe": safe,
                "gl": region,
            }
            if time_period:
                params["tbs"] = f"qdr:{time_period}"
            futures.append(self._executor.submit(
                self._get_url, "GET", self.SEARCH_TYPES["web"], params=params))

        for future in as_completed(futures):
            if len(results) >= max_results:
                break
            try:
                soup = BeautifulSoup(future.result(), 'lxml')  # lxml parser
                for result_block in soup.find_all("div", class_="g"):
                    link = result_block.find("a", href=True)
                    title = result_block.find("h3")
                    description_box = result_block.find(
                        "div", {"style": "-webkit-line-clamp:2"}
                    )
                    if link and title and description_box:
                        results.append({
                            "title": title.text,
                            "href": link["href"],
                            "abstract": description_box.text,
                            "index": len(results),
                            "type": "web",
                            "visible_text": ""  # filled below when extract_text=True
                        })
                        if len(results) >= max_results:
                            break
            except Exception as e:
                self.logger.error(f"Error processing search page: {e}")

        if extract_text:
            self._populate_visible_text(results, max_text_length)

        # Cache results before returning.
        self._cache_results(cache_key, results)
        return results

    def _fetch_and_extract(self, url: str, max_characters: Optional[int]) -> str:
        """Worker: fetch *url* and return its visible text."""
        return self._extract_text_from_webpage(
            self._get_url("GET", url), max_characters=max_characters)

    def _populate_visible_text(self, results: List[Dict[str, Any]],
                               max_text_length: Optional[int]) -> None:
        """
        Fill each result's 'visible_text' in parallel.

        Fixes two defects of the previous version: the page fetch ran
        synchronously while *building* the futures list (serializing all
        downloads), and completion order was zipped against the result
        index, attaching extracted text to the wrong results.
        """
        future_to_result = {
            self._executor.submit(self._fetch_and_extract, result['href'], max_text_length): result
            for result in results
            if result.get('href')
        }
        for future in as_completed(future_to_result):
            target = future_to_result[future]
            try:
                target['visible_text'] = future.result()
            except Exception as e:
                self.logger.error(f"Error extracting text: {e}")

    def get_search_suggestions(self, query: str) -> List[str]:
        """Return Google's autocomplete suggestions for *query*."""
        params = {
            "client": "chrome",
            "q": query
        }
        content = self._get_url("GET", "https://suggestqueries.google.com/complete/search",
                                params=params)
        # Response shape: [query, [suggestion, ...], ...]
        suggestions = json.loads(content.decode('utf-8'))[1]
        return suggestions

    def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
        """
        Extract visible text from HTML content, dropping script/style/chrome tags.
        """
        soup = BeautifulSoup(html_content, 'lxml')  # lxml parser
        for tag in soup(["script", "style", "header", "footer", "nav"]):
            tag.extract()
        visible_text = soup.get_text(strip=True)
        if max_characters:
            visible_text = visible_text[:max_characters]
        return visible_text

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Release both the HTTP session and the worker threads.
        self.client.close()
        self._executor.shutdown()
316
+
317
+
318
+ if __name__ == "__main__":
319
+ from rich import print
320
+ searcher = GoogleS()
321
+ results = searcher.search("HelpingAI-9B", max_results=200, extract_text=False, max_text_length=200)
322
+ for result in results:
179
323
  print(result)