webscout-6.3-py3-none-any.whl → webscout-6.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/AIauto.py +191 -176
- webscout/AIbase.py +0 -197
- webscout/AIutel.py +441 -1130
- webscout/DWEBS.py +189 -35
- webscout/{YTdownloader.py → Extra/YTToolkit/YTdownloader.py} +990 -1103
- webscout/Extra/YTToolkit/__init__.py +3 -0
- webscout/{transcriber.py → Extra/YTToolkit/transcriber.py} +479 -551
- webscout/Extra/YTToolkit/ytapi/__init__.py +6 -0
- webscout/Extra/YTToolkit/ytapi/channel.py +307 -0
- webscout/Extra/YTToolkit/ytapi/errors.py +13 -0
- webscout/Extra/YTToolkit/ytapi/extras.py +45 -0
- webscout/Extra/YTToolkit/ytapi/https.py +88 -0
- webscout/Extra/YTToolkit/ytapi/patterns.py +61 -0
- webscout/Extra/YTToolkit/ytapi/playlist.py +59 -0
- webscout/Extra/YTToolkit/ytapi/pool.py +8 -0
- webscout/Extra/YTToolkit/ytapi/query.py +37 -0
- webscout/Extra/YTToolkit/ytapi/stream.py +60 -0
- webscout/Extra/YTToolkit/ytapi/utils.py +62 -0
- webscout/Extra/YTToolkit/ytapi/video.py +102 -0
- webscout/Extra/__init__.py +3 -1
- webscout/Extra/autocoder/__init__.py +9 -0
- webscout/Extra/autocoder/autocoder_utiles.py +121 -0
- webscout/Extra/autocoder/rawdog.py +680 -0
- webscout/Extra/autollama.py +246 -195
- webscout/Extra/gguf.py +81 -56
- webscout/Extra/markdownlite/__init__.py +862 -0
- webscout/Extra/weather_ascii.py +2 -2
- webscout/LLM.py +206 -43
- webscout/Litlogger/__init__.py +681 -0
- webscout/Provider/DARKAI.py +1 -1
- webscout/Provider/EDITEE.py +1 -1
- webscout/Provider/NinjaChat.py +1 -1
- webscout/Provider/PI.py +120 -35
- webscout/Provider/Perplexity.py +590 -598
- webscout/Provider/Reka.py +0 -1
- webscout/Provider/RoboCoders.py +206 -0
- webscout/Provider/TTI/AiForce/__init__.py +22 -0
- webscout/Provider/TTI/AiForce/async_aiforce.py +257 -0
- webscout/Provider/TTI/AiForce/sync_aiforce.py +242 -0
- webscout/Provider/TTI/Nexra/__init__.py +22 -0
- webscout/Provider/TTI/Nexra/async_nexra.py +286 -0
- webscout/Provider/TTI/Nexra/sync_nexra.py +258 -0
- webscout/Provider/TTI/PollinationsAI/__init__.py +23 -0
- webscout/Provider/TTI/PollinationsAI/async_pollinations.py +330 -0
- webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +285 -0
- webscout/Provider/TTI/__init__.py +2 -4
- webscout/Provider/TTI/artbit/__init__.py +22 -0
- webscout/Provider/TTI/artbit/async_artbit.py +184 -0
- webscout/Provider/TTI/artbit/sync_artbit.py +176 -0
- webscout/Provider/TTI/blackbox/__init__.py +4 -0
- webscout/Provider/TTI/blackbox/async_blackbox.py +212 -0
- webscout/Provider/TTI/{blackboximage.py → blackbox/sync_blackbox.py} +199 -153
- webscout/Provider/TTI/deepinfra/__init__.py +4 -0
- webscout/Provider/TTI/deepinfra/async_deepinfra.py +227 -0
- webscout/Provider/TTI/deepinfra/sync_deepinfra.py +199 -0
- webscout/Provider/TTI/huggingface/__init__.py +22 -0
- webscout/Provider/TTI/huggingface/async_huggingface.py +199 -0
- webscout/Provider/TTI/huggingface/sync_huggingface.py +195 -0
- webscout/Provider/TTI/imgninza/__init__.py +4 -0
- webscout/Provider/TTI/imgninza/async_ninza.py +214 -0
- webscout/Provider/TTI/{imgninza.py → imgninza/sync_ninza.py} +209 -136
- webscout/Provider/TTI/talkai/__init__.py +4 -0
- webscout/Provider/TTI/talkai/async_talkai.py +229 -0
- webscout/Provider/TTI/talkai/sync_talkai.py +207 -0
- webscout/Provider/TTS/__init__.py +5 -1
- webscout/Provider/TTS/deepgram.py +183 -0
- webscout/Provider/TTS/elevenlabs.py +137 -0
- webscout/Provider/TTS/gesserit.py +151 -0
- webscout/Provider/TTS/murfai.py +139 -0
- webscout/Provider/TTS/parler.py +134 -107
- webscout/Provider/TTS/streamElements.py +360 -275
- webscout/Provider/TTS/utils.py +280 -0
- webscout/Provider/TTS/voicepod.py +116 -116
- webscout/Provider/__init__.py +8 -1
- webscout/Provider/askmyai.py +2 -2
- webscout/Provider/cerebras.py +227 -219
- webscout/Provider/llama3mitril.py +0 -1
- webscout/Provider/meta.py +794 -779
- webscout/Provider/mhystical.py +176 -0
- webscout/Provider/perplexitylabs.py +265 -0
- webscout/Provider/twitterclone.py +251 -245
- webscout/Provider/typegpt.py +358 -0
- webscout/__init__.py +9 -8
- webscout/__main__.py +5 -5
- webscout/cli.py +252 -280
- webscout/conversation.py +227 -0
- webscout/exceptions.py +161 -29
- webscout/litagent/__init__.py +172 -0
- webscout/litprinter/__init__.py +832 -0
- webscout/optimizers.py +270 -0
- webscout/prompt_manager.py +279 -0
- webscout/scout/__init__.py +11 -0
- webscout/scout/core.py +884 -0
- webscout/scout/element.py +459 -0
- webscout/scout/parsers/__init__.py +69 -0
- webscout/scout/parsers/html5lib_parser.py +172 -0
- webscout/scout/parsers/html_parser.py +236 -0
- webscout/scout/parsers/lxml_parser.py +178 -0
- webscout/scout/utils.py +38 -0
- webscout/swiftcli/__init__.py +810 -0
- webscout/update_checker.py +125 -0
- webscout/version.py +1 -1
- webscout/zeroart/__init__.py +55 -0
- webscout/zeroart/base.py +61 -0
- webscout/zeroart/effects.py +99 -0
- webscout/zeroart/fonts.py +816 -0
- webscout/zerodir/__init__.py +225 -0
- {webscout-6.3.dist-info → webscout-6.5.dist-info}/METADATA +37 -112
- webscout-6.5.dist-info/RECORD +179 -0
- webscout/Agents/Onlinesearcher.py +0 -182
- webscout/Agents/__init__.py +0 -2
- webscout/Agents/functioncall.py +0 -248
- webscout/Bing_search.py +0 -154
- webscout/Provider/TTI/AIuncensoredimage.py +0 -103
- webscout/Provider/TTI/Nexra.py +0 -120
- webscout/Provider/TTI/PollinationsAI.py +0 -138
- webscout/Provider/TTI/WebSimAI.py +0 -142
- webscout/Provider/TTI/aiforce.py +0 -160
- webscout/Provider/TTI/artbit.py +0 -141
- webscout/Provider/TTI/deepinfra.py +0 -148
- webscout/Provider/TTI/huggingface.py +0 -155
- webscout/Provider/TTI/talkai.py +0 -116
- webscout/g4f.py +0 -666
- webscout/models.py +0 -23
- webscout/requestsHTMLfix.py +0 -775
- webscout/webai.py +0 -2590
- webscout-6.3.dist-info/RECORD +0 -124
- {webscout-6.3.dist-info → webscout-6.5.dist-info}/LICENSE.md +0 -0
- {webscout-6.3.dist-info → webscout-6.5.dist-info}/WHEEL +0 -0
- {webscout-6.3.dist-info → webscout-6.5.dist-info}/entry_points.txt +0 -0
- {webscout-6.3.dist-info → webscout-6.5.dist-info}/top_level.txt +0 -0
webscout/DWEBS.py
CHANGED
|
@@ -1,21 +1,99 @@
|
|
|
1
|
-
from bs4 import BeautifulSoup
|
|
2
1
|
import requests
|
|
3
2
|
from typing import Dict, List, Optional, Union, Any
|
|
4
3
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
4
|
+
from webscout.scout import Scout
|
|
5
5
|
from urllib.parse import quote, urljoin
|
|
6
|
-
from
|
|
6
|
+
from webscout.litagent import LitAgent
|
|
7
|
+
|
|
7
8
|
import time
|
|
8
9
|
import random
|
|
9
10
|
import json
|
|
10
11
|
import os
|
|
11
12
|
from datetime import datetime, timedelta
|
|
12
13
|
from functools import lru_cache
|
|
13
|
-
import
|
|
14
|
-
from tenacity import retry, stop_after_attempt, wait_exponential
|
|
14
|
+
from webscout.Litlogger import LitLogger, LogFormat, ColorScheme
|
|
15
15
|
|
|
16
16
|
class GoogleS:
|
|
17
|
-
"""
|
|
18
|
-
|
|
17
|
+
"""A Python interface for Google search with advanced features
|
|
18
|
+
|
|
19
|
+
The GoogleS class provides a powerful interface to perform web searches, image searches,
|
|
20
|
+
and advanced filtering on Google. Built with love by HAI.
|
|
21
|
+
|
|
22
|
+
Basic Usage:
|
|
23
|
+
>>> from webscout.DWEBS import GoogleS
|
|
24
|
+
>>> searcher = GoogleS()
|
|
25
|
+
>>> # Simple web search
|
|
26
|
+
>>> results = searcher.search("Python programming")
|
|
27
|
+
>>> for result in results:
|
|
28
|
+
... print(f"Title: {result['title']}")
|
|
29
|
+
... print(f"URL: {result['href']}")
|
|
30
|
+
... print(f"Description: {result['abstract']}")
|
|
31
|
+
|
|
32
|
+
Advanced Web Search:
|
|
33
|
+
>>> # Search with filters
|
|
34
|
+
>>> results = searcher.search(
|
|
35
|
+
... query="Python tutorials",
|
|
36
|
+
... site="github.com",
|
|
37
|
+
... file_type="pdf",
|
|
38
|
+
... time_period="month",
|
|
39
|
+
... max_results=5
|
|
40
|
+
... )
|
|
41
|
+
>>> # Example response format:
|
|
42
|
+
>>> {
|
|
43
|
+
... 'title': 'Python Tutorial',
|
|
44
|
+
... 'href': 'https://example.com/python-tutorial',
|
|
45
|
+
... 'abstract': 'Comprehensive Python tutorial covering basics to advanced topics',
|
|
46
|
+
... 'index': 0,
|
|
47
|
+
... 'type': 'web',
|
|
48
|
+
... 'visible_text': '' # Optional: Contains webpage text if extract_text=True
|
|
49
|
+
... }
|
|
50
|
+
|
|
51
|
+
Image Search:
|
|
52
|
+
>>> # Search for images
|
|
53
|
+
>>> images = searcher.search_images(
|
|
54
|
+
... query="cute puppies",
|
|
55
|
+
... size="large",
|
|
56
|
+
... color="color",
|
|
57
|
+
... type_filter="photo",
|
|
58
|
+
... max_results=5
|
|
59
|
+
... )
|
|
60
|
+
>>> # Example response format:
|
|
61
|
+
>>> {
|
|
62
|
+
... 'title': 'Cute Puppy Image',
|
|
63
|
+
... 'thumbnail': 'https://example.com/puppy-thumb.jpg',
|
|
64
|
+
... 'full_url': 'https://example.com/puppy-full.jpg',
|
|
65
|
+
... 'type': 'image'
|
|
66
|
+
... }
|
|
67
|
+
|
|
68
|
+
Features:
|
|
69
|
+
- Web Search: Get detailed web results with title, URL, and description
|
|
70
|
+
- Image Search: Find images with thumbnails and full-resolution URLs
|
|
71
|
+
- Advanced Filters: Site-specific search, file types, time periods
|
|
72
|
+
- Rate Limiting: Smart request handling to avoid blocks
|
|
73
|
+
- Caching: Save results for faster repeat searches
|
|
74
|
+
- Retry Logic: Automatic retry on temporary failures
|
|
75
|
+
- Logging: Optional LitLogger integration for beautiful console output
|
|
76
|
+
- Proxy Support: Use custom proxies for requests
|
|
77
|
+
- Concurrent Processing: Multi-threaded requests for better performance
|
|
78
|
+
|
|
79
|
+
Response Format:
|
|
80
|
+
Web Search Results:
|
|
81
|
+
{
|
|
82
|
+
'title': str, # Title of the webpage
|
|
83
|
+
'href': str, # URL of the webpage
|
|
84
|
+
'abstract': str, # Brief description or snippet
|
|
85
|
+
'index': int, # Result position
|
|
86
|
+
'type': 'web', # Result type identifier
|
|
87
|
+
'visible_text': str # Full page text (if extract_text=True)
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
Image Search Results:
|
|
91
|
+
{
|
|
92
|
+
'title': str, # Image title or description
|
|
93
|
+
'thumbnail': str, # Thumbnail image URL
|
|
94
|
+
'full_url': str, # Full resolution image URL
|
|
95
|
+
'type': 'image' # Result type identifier
|
|
96
|
+
}
|
|
19
97
|
"""
|
|
20
98
|
|
|
21
99
|
SEARCH_TYPES = {
|
|
@@ -31,7 +109,8 @@ class GoogleS:
|
|
|
31
109
|
timeout: Optional[int] = 10,
|
|
32
110
|
max_workers: int = 20,
|
|
33
111
|
cache_dir: Optional[str] = None,
|
|
34
|
-
rate_limit: float = 0.01
|
|
112
|
+
rate_limit: float = 0.01,
|
|
113
|
+
use_litlogger: bool = False
|
|
35
114
|
):
|
|
36
115
|
"""
|
|
37
116
|
Initialize the GoogleS object with enhanced features.
|
|
@@ -39,10 +118,11 @@ class GoogleS:
|
|
|
39
118
|
Args:
|
|
40
119
|
cache_dir: Directory to store search result cache
|
|
41
120
|
rate_limit: Minimum time between requests in seconds
|
|
121
|
+
use_litlogger: Whether to use LitLogger for logging (default: False)
|
|
42
122
|
"""
|
|
43
123
|
self.proxy = proxy
|
|
44
124
|
self.headers = headers if headers else {
|
|
45
|
-
"User-Agent":
|
|
125
|
+
"User-Agent": LitAgent().random() # Use LitAgent to generate user agent
|
|
46
126
|
}
|
|
47
127
|
self.headers["Referer"] = "https://www.google.com/"
|
|
48
128
|
self.client = requests.Session()
|
|
@@ -56,10 +136,16 @@ class GoogleS:
|
|
|
56
136
|
os.makedirs(cache_dir)
|
|
57
137
|
self.last_request_time = 0
|
|
58
138
|
self.rate_limit = rate_limit
|
|
139
|
+
self.use_litlogger = use_litlogger
|
|
59
140
|
|
|
60
|
-
# Setup logging
|
|
61
|
-
|
|
62
|
-
|
|
141
|
+
# Setup enhanced logging with LitLogger if enabled
|
|
142
|
+
if self.use_litlogger:
|
|
143
|
+
self.logger = LitLogger(
|
|
144
|
+
name="GoogleS",
|
|
145
|
+
format=LogFormat.MODERN_EMOJI,
|
|
146
|
+
color_scheme=ColorScheme.CYBERPUNK,
|
|
147
|
+
console_output=True
|
|
148
|
+
)
|
|
63
149
|
|
|
64
150
|
def _respect_rate_limit(self):
|
|
65
151
|
"""Ensure minimum time between requests"""
|
|
@@ -69,20 +155,46 @@ class GoogleS:
|
|
|
69
155
|
time.sleep(self.rate_limit - time_since_last)
|
|
70
156
|
self.last_request_time = time.time()
|
|
71
157
|
|
|
72
|
-
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
|
73
158
|
def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
|
|
74
|
-
data: Optional[Union[Dict[str, str], bytes]] = None) -> bytes:
|
|
159
|
+
data: Optional[Union[Dict[str, str], bytes]] = None, max_retries: int = 3) -> bytes:
|
|
75
160
|
"""
|
|
76
|
-
Makes an HTTP request with retry logic and rate limiting.
|
|
161
|
+
Makes an HTTP request with manual retry logic and rate limiting.
|
|
162
|
+
|
|
163
|
+
Args:
|
|
164
|
+
method (str): HTTP method (GET, POST, etc.)
|
|
165
|
+
url (str): Target URL
|
|
166
|
+
params (Optional[Dict[str, str]]): Query parameters
|
|
167
|
+
data (Optional[Union[Dict[str, str], bytes]]): Request payload
|
|
168
|
+
max_retries (int): Maximum number of retry attempts
|
|
169
|
+
|
|
170
|
+
Returns:
|
|
171
|
+
bytes: Response content
|
|
77
172
|
"""
|
|
78
173
|
self._respect_rate_limit()
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
174
|
+
|
|
175
|
+
for attempt in range(max_retries):
|
|
176
|
+
try:
|
|
177
|
+
if self.use_litlogger:
|
|
178
|
+
self.logger.debug(f"Making {method} request to {url} (Attempt {attempt + 1})")
|
|
179
|
+
|
|
180
|
+
resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
|
|
181
|
+
resp.raise_for_status()
|
|
182
|
+
|
|
183
|
+
if self.use_litlogger:
|
|
184
|
+
self.logger.success(f"Request successful: {resp.status_code}")
|
|
185
|
+
|
|
186
|
+
return resp.content
|
|
187
|
+
|
|
188
|
+
except requests.exceptions.RequestException as ex:
|
|
189
|
+
if self.use_litlogger:
|
|
190
|
+
self.logger.error(f"Request failed: {url} - {str(ex)}")
|
|
191
|
+
|
|
192
|
+
# Exponential backoff
|
|
193
|
+
if attempt < max_retries - 1:
|
|
194
|
+
wait_time = (2 ** attempt) + random.random()
|
|
195
|
+
time.sleep(wait_time)
|
|
196
|
+
else:
|
|
197
|
+
raise
|
|
86
198
|
|
|
87
199
|
@lru_cache(maxsize=100)
|
|
88
200
|
def _cache_key(self, query: str, **kwargs) -> str:
|
|
@@ -99,7 +211,11 @@ class GoogleS:
|
|
|
99
211
|
with open(cache_file, 'r') as f:
|
|
100
212
|
cached_data = json.load(f)
|
|
101
213
|
if datetime.fromisoformat(cached_data['timestamp']) + timedelta(hours=24) > datetime.now():
|
|
214
|
+
if self.use_litlogger:
|
|
215
|
+
self.logger.info(f"Using cached results for: {cache_key}")
|
|
102
216
|
return cached_data['results']
|
|
217
|
+
if self.use_litlogger:
|
|
218
|
+
self.logger.debug(f"No valid cache found for: {cache_key}")
|
|
103
219
|
return None
|
|
104
220
|
|
|
105
221
|
def _cache_results(self, cache_key: str, results: List[Dict[str, Any]]):
|
|
@@ -117,18 +233,49 @@ class GoogleS:
|
|
|
117
233
|
self,
|
|
118
234
|
query: str,
|
|
119
235
|
max_results: int = 10,
|
|
120
|
-
size: Optional[str] = None,
|
|
121
|
-
color: Optional[str] = None,
|
|
122
|
-
type_filter: Optional[str] = None,
|
|
236
|
+
size: Optional[str] = None,
|
|
237
|
+
color: Optional[str] = None,
|
|
238
|
+
type_filter: Optional[str] = None,
|
|
123
239
|
**kwargs
|
|
124
240
|
) -> List[Dict[str, str]]:
|
|
125
|
-
"""
|
|
126
|
-
|
|
127
|
-
|
|
241
|
+
"""Search for images on Google with style!
|
|
242
|
+
|
|
128
243
|
Args:
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
244
|
+
query (str): What you're looking for fam
|
|
245
|
+
max_results (int): How many results you want (default: 10)
|
|
246
|
+
size (Optional[str]): Image size filter
|
|
247
|
+
- 'large': Big pics
|
|
248
|
+
- 'medium': Medium sized
|
|
249
|
+
- 'icon': Small icons
|
|
250
|
+
color (Optional[str]): Color filter
|
|
251
|
+
- 'color': Full color
|
|
252
|
+
- 'gray': Black and white
|
|
253
|
+
- 'transparent': Transparent background
|
|
254
|
+
type_filter (Optional[str]): Type of image
|
|
255
|
+
- 'face': Just faces
|
|
256
|
+
- 'photo': Real photos
|
|
257
|
+
- 'clipart': Vector art
|
|
258
|
+
- 'lineart': Line drawings
|
|
259
|
+
|
|
260
|
+
Returns:
|
|
261
|
+
List[Dict[str, str]]: List of image results with these keys:
|
|
262
|
+
- 'thumbnail': Small preview URL
|
|
263
|
+
- 'full_url': Full resolution image URL
|
|
264
|
+
- 'title': Image title/description
|
|
265
|
+
- 'type': Always 'image'
|
|
266
|
+
|
|
267
|
+
Example:
|
|
268
|
+
>>> searcher = GoogleS()
|
|
269
|
+
>>> # Find some cool nature pics
|
|
270
|
+
>>> images = searcher.search_images(
|
|
271
|
+
... query="beautiful landscapes",
|
|
272
|
+
... size="large",
|
|
273
|
+
... color="color",
|
|
274
|
+
... max_results=5
|
|
275
|
+
... )
|
|
276
|
+
>>> for img in images:
|
|
277
|
+
... print(f"Found: {img['title']}")
|
|
278
|
+
... print(f"URL: {img['full_url']}")
|
|
132
279
|
"""
|
|
133
280
|
params = {
|
|
134
281
|
"q": query,
|
|
@@ -144,7 +291,7 @@ class GoogleS:
|
|
|
144
291
|
params["tbs"] = f"itp:{type_filter}"
|
|
145
292
|
|
|
146
293
|
content = self._get_url("GET", self.SEARCH_TYPES["image"], params=params)
|
|
147
|
-
soup =
|
|
294
|
+
soup = Scout(content) # Use Scout parser
|
|
148
295
|
|
|
149
296
|
results = []
|
|
150
297
|
for img in soup.find_all("img", class_="rg_i"):
|
|
@@ -192,6 +339,9 @@ class GoogleS:
|
|
|
192
339
|
exclude_terms: List of terms to exclude from search
|
|
193
340
|
exact_phrase: Exact phrase to match
|
|
194
341
|
"""
|
|
342
|
+
if self.use_litlogger:
|
|
343
|
+
self.logger.info(f"Starting search for: {query}")
|
|
344
|
+
|
|
195
345
|
# Build advanced query
|
|
196
346
|
advanced_query = query
|
|
197
347
|
if site:
|
|
@@ -202,7 +352,10 @@ class GoogleS:
|
|
|
202
352
|
advanced_query += " " + " ".join(f"-{term}" for term in exclude_terms)
|
|
203
353
|
if exact_phrase:
|
|
204
354
|
advanced_query = f'"{exact_phrase}"' + advanced_query
|
|
205
|
-
|
|
355
|
+
|
|
356
|
+
if self.use_litlogger:
|
|
357
|
+
self.logger.debug(f"Advanced query: {advanced_query}")
|
|
358
|
+
|
|
206
359
|
# Check cache first
|
|
207
360
|
cache_key = self._cache_key(advanced_query, region=region, language=language,
|
|
208
361
|
safe=safe, time_period=time_period, sort_by=sort_by)
|
|
@@ -233,7 +386,8 @@ class GoogleS:
|
|
|
233
386
|
for future in as_completed(futures):
|
|
234
387
|
try:
|
|
235
388
|
resp_content = future.result()
|
|
236
|
-
soup =
|
|
389
|
+
soup = Scout(resp_content) # Use Scout parser
|
|
390
|
+
|
|
237
391
|
result_blocks = soup.find_all("div", class_="g")
|
|
238
392
|
|
|
239
393
|
if not result_blocks:
|
|
@@ -297,9 +451,9 @@ class GoogleS:
|
|
|
297
451
|
|
|
298
452
|
def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
|
|
299
453
|
"""
|
|
300
|
-
Extracts visible text from HTML content using
|
|
454
|
+
Extracts visible text from HTML content using Scout parser.
|
|
301
455
|
"""
|
|
302
|
-
soup =
|
|
456
|
+
soup = Scout(html_content) # Use Scout parser
|
|
303
457
|
for tag in soup(["script", "style", "header", "footer", "nav"]):
|
|
304
458
|
tag.extract()
|
|
305
459
|
visible_text = soup.get_text(strip=True)
|