webscout 8.3.1__py3-none-any.whl → 8.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/AIutel.py +180 -78
- webscout/Bing_search.py +417 -0
- webscout/Extra/gguf.py +706 -177
- webscout/Provider/AISEARCH/__init__.py +1 -0
- webscout/Provider/AISEARCH/genspark_search.py +7 -7
- webscout/Provider/AISEARCH/stellar_search.py +132 -0
- webscout/Provider/ExaChat.py +84 -58
- webscout/Provider/GeminiProxy.py +140 -0
- webscout/Provider/HeckAI.py +85 -80
- webscout/Provider/Jadve.py +56 -50
- webscout/Provider/MCPCore.py +78 -75
- webscout/Provider/MiniMax.py +207 -0
- webscout/Provider/Nemotron.py +41 -13
- webscout/Provider/Netwrck.py +34 -51
- webscout/Provider/OPENAI/BLACKBOXAI.py +0 -4
- webscout/Provider/OPENAI/GeminiProxy.py +328 -0
- webscout/Provider/OPENAI/MiniMax.py +298 -0
- webscout/Provider/OPENAI/README.md +32 -29
- webscout/Provider/OPENAI/README_AUTOPROXY.md +238 -0
- webscout/Provider/OPENAI/TogetherAI.py +4 -17
- webscout/Provider/OPENAI/__init__.py +17 -1
- webscout/Provider/OPENAI/autoproxy.py +1067 -39
- webscout/Provider/OPENAI/base.py +17 -76
- webscout/Provider/OPENAI/deepinfra.py +42 -108
- webscout/Provider/OPENAI/e2b.py +0 -1
- webscout/Provider/OPENAI/flowith.py +179 -166
- webscout/Provider/OPENAI/friendli.py +233 -0
- webscout/Provider/OPENAI/mcpcore.py +109 -70
- webscout/Provider/OPENAI/monochat.py +329 -0
- webscout/Provider/OPENAI/pydantic_imports.py +1 -172
- webscout/Provider/OPENAI/scirachat.py +59 -51
- webscout/Provider/OPENAI/toolbaz.py +3 -9
- webscout/Provider/OPENAI/typegpt.py +1 -1
- webscout/Provider/OPENAI/utils.py +19 -42
- webscout/Provider/OPENAI/x0gpt.py +14 -2
- webscout/Provider/OPENAI/xenai.py +514 -0
- webscout/Provider/OPENAI/yep.py +8 -2
- webscout/Provider/OpenGPT.py +54 -32
- webscout/Provider/PI.py +58 -84
- webscout/Provider/StandardInput.py +32 -13
- webscout/Provider/TTI/README.md +9 -9
- webscout/Provider/TTI/__init__.py +3 -1
- webscout/Provider/TTI/aiarta.py +92 -78
- webscout/Provider/TTI/bing.py +231 -0
- webscout/Provider/TTI/infip.py +212 -0
- webscout/Provider/TTI/monochat.py +220 -0
- webscout/Provider/TTS/speechma.py +45 -39
- webscout/Provider/TeachAnything.py +11 -3
- webscout/Provider/TextPollinationsAI.py +78 -70
- webscout/Provider/TogetherAI.py +350 -0
- webscout/Provider/Venice.py +37 -46
- webscout/Provider/VercelAI.py +27 -24
- webscout/Provider/WiseCat.py +35 -35
- webscout/Provider/WrDoChat.py +22 -26
- webscout/Provider/WritingMate.py +26 -22
- webscout/Provider/XenAI.py +324 -0
- webscout/Provider/__init__.py +10 -5
- webscout/Provider/deepseek_assistant.py +378 -0
- webscout/Provider/granite.py +48 -57
- webscout/Provider/koala.py +51 -39
- webscout/Provider/learnfastai.py +49 -64
- webscout/Provider/llmchat.py +79 -93
- webscout/Provider/llmchatco.py +63 -78
- webscout/Provider/multichat.py +51 -40
- webscout/Provider/oivscode.py +1 -1
- webscout/Provider/scira_chat.py +159 -96
- webscout/Provider/scnet.py +13 -13
- webscout/Provider/searchchat.py +13 -13
- webscout/Provider/sonus.py +12 -11
- webscout/Provider/toolbaz.py +25 -8
- webscout/Provider/turboseek.py +41 -42
- webscout/Provider/typefully.py +27 -12
- webscout/Provider/typegpt.py +41 -46
- webscout/Provider/uncovr.py +55 -90
- webscout/Provider/x0gpt.py +33 -17
- webscout/Provider/yep.py +79 -96
- webscout/auth/__init__.py +55 -0
- webscout/auth/api_key_manager.py +189 -0
- webscout/auth/auth_system.py +100 -0
- webscout/auth/config.py +76 -0
- webscout/auth/database.py +400 -0
- webscout/auth/exceptions.py +67 -0
- webscout/auth/middleware.py +248 -0
- webscout/auth/models.py +130 -0
- webscout/auth/providers.py +279 -0
- webscout/auth/rate_limiter.py +254 -0
- webscout/auth/request_models.py +127 -0
- webscout/auth/request_processing.py +226 -0
- webscout/auth/routes.py +550 -0
- webscout/auth/schemas.py +103 -0
- webscout/auth/server.py +367 -0
- webscout/client.py +121 -70
- webscout/litagent/Readme.md +68 -55
- webscout/litagent/agent.py +99 -9
- webscout/scout/core/scout.py +104 -26
- webscout/scout/element.py +139 -18
- webscout/swiftcli/core/cli.py +14 -3
- webscout/swiftcli/decorators/output.py +59 -9
- webscout/update_checker.py +31 -49
- webscout/version.py +1 -1
- webscout/webscout_search.py +4 -12
- webscout/webscout_search_async.py +3 -10
- webscout/yep_search.py +2 -11
- {webscout-8.3.1.dist-info → webscout-8.3.3.dist-info}/METADATA +141 -99
- {webscout-8.3.1.dist-info → webscout-8.3.3.dist-info}/RECORD +109 -83
- {webscout-8.3.1.dist-info → webscout-8.3.3.dist-info}/entry_points.txt +1 -1
- webscout/Provider/HF_space/__init__.py +0 -0
- webscout/Provider/HF_space/qwen_qwen2.py +0 -206
- webscout/Provider/OPENAI/api.py +0 -1320
- webscout/Provider/TTI/fastflux.py +0 -233
- webscout/Provider/Writecream.py +0 -246
- {webscout-8.3.1.dist-info → webscout-8.3.3.dist-info}/WHEEL +0 -0
- {webscout-8.3.1.dist-info → webscout-8.3.3.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.3.1.dist-info → webscout-8.3.3.dist-info}/top_level.txt +0 -0
webscout/litagent/Readme.md
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# 🔥 LitAgent - The Lit User Agent Generator
|
|
2
2
|
|
|
3
|
-
LitAgent is a powerful
|
|
3
|
+
LitAgent is a powerful, modern user agent generator that keeps your requests fresh and undetectable! Built for web scraping, it helps you manage user agents with ease and style.
|
|
4
|
+
|
|
5
|
+
---
|
|
4
6
|
|
|
5
7
|
## 🚀 Quick Start
|
|
6
8
|
|
|
@@ -15,41 +17,46 @@ ua = agent.random()
|
|
|
15
17
|
print(ua) # Mozilla/5.0 (Windows NT 11.0) AppleWebKit/537.36 ...
|
|
16
18
|
```
|
|
17
19
|
|
|
20
|
+
---
|
|
21
|
+
|
|
18
22
|
## 🎯 Features
|
|
19
23
|
|
|
20
24
|
### Browser-Specific Agents
|
|
21
25
|
|
|
22
26
|
```python
|
|
23
27
|
# Get agents for specific browsers
|
|
24
|
-
chrome_ua = agent.chrome()
|
|
25
|
-
firefox_ua = agent.firefox()
|
|
26
|
-
safari_ua = agent.safari()
|
|
27
|
-
edge_ua = agent.edge()
|
|
28
|
-
opera_ua = agent.opera()
|
|
28
|
+
chrome_ua = agent.chrome() # Latest Chrome agent
|
|
29
|
+
firefox_ua = agent.firefox() # Latest Firefox agent
|
|
30
|
+
safari_ua = agent.safari() # Latest Safari agent
|
|
31
|
+
edge_ua = agent.edge() # Latest Edge agent
|
|
32
|
+
opera_ua = agent.opera() # Latest Opera agent
|
|
29
33
|
```
|
|
30
34
|
|
|
31
35
|
### Device-Specific Agents
|
|
32
36
|
|
|
33
37
|
```python
|
|
34
38
|
# Get mobile or desktop agents
|
|
35
|
-
mobile_ua = agent.mobile()
|
|
36
|
-
desktop_ua = agent.desktop()
|
|
39
|
+
mobile_ua = agent.mobile() # Mobile device agent
|
|
40
|
+
desktop_ua = agent.desktop() # Desktop device agent
|
|
37
41
|
|
|
38
42
|
# New - Get agents for specific device types
|
|
39
|
-
tablet_ua = agent.tablet()
|
|
40
|
-
tv_ua = agent.smart_tv()
|
|
41
|
-
console_ua = agent.gaming()
|
|
43
|
+
tablet_ua = agent.tablet() # Tablet device agent
|
|
44
|
+
tv_ua = agent.smart_tv() # Smart TV agent
|
|
45
|
+
console_ua = agent.gaming() # Gaming console agent
|
|
46
|
+
|
|
47
|
+
# Wearable device user agent support
|
|
48
|
+
wearable_ua = agent.wearable() # Get a wearable device user agent
|
|
42
49
|
```
|
|
43
50
|
|
|
44
51
|
### OS-Specific Agents
|
|
45
52
|
|
|
46
53
|
```python
|
|
47
54
|
# New - Get agents for specific operating systems
|
|
48
|
-
windows_ua = agent.windows()
|
|
49
|
-
mac_ua = agent.macos()
|
|
50
|
-
linux_ua = agent.linux()
|
|
51
|
-
android_ua = agent.android()
|
|
52
|
-
ios_ua = agent.ios()
|
|
55
|
+
windows_ua = agent.windows() # Windows agent
|
|
56
|
+
mac_ua = agent.macos() # macOS agent
|
|
57
|
+
linux_ua = agent.linux() # Linux agent
|
|
58
|
+
android_ua = agent.android() # Android agent
|
|
59
|
+
ios_ua = agent.ios() # iOS agent
|
|
53
60
|
```
|
|
54
61
|
|
|
55
62
|
### Custom Agent Generation
|
|
@@ -83,6 +90,15 @@ ip = agent.rotate_ip()
|
|
|
83
90
|
print(ip) # 192.168.1.10 (example)
|
|
84
91
|
```
|
|
85
92
|
|
|
93
|
+
### Proxy Pool & Rotation
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
agent.set_proxy_pool(["http://proxy1:8080", "http://proxy2:8080"])
|
|
97
|
+
proxy = agent.rotate_proxy() # Get next proxy from pool
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
86
102
|
## 💫 Real-World Examples
|
|
87
103
|
|
|
88
104
|
### With Requests
|
|
@@ -152,47 +168,30 @@ driver = create_driver()
|
|
|
152
168
|
driver.get('https://example.com')
|
|
153
169
|
```
|
|
154
170
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
```python
|
|
158
|
-
from playwright.sync_api import sync_playwright
|
|
159
|
-
from webscout import LitAgent
|
|
160
|
-
|
|
161
|
-
agent = LitAgent()
|
|
162
|
-
|
|
163
|
-
def browse_with_playwright():
|
|
164
|
-
with sync_playwright() as p:
|
|
165
|
-
browser_options = {
|
|
166
|
-
"user_agent": agent.chrome(),
|
|
167
|
-
"viewport": {"width": 1280, "height": 720}
|
|
168
|
-
}
|
|
169
|
-
browser = p.chromium.launch()
|
|
170
|
-
context = browser.new_context(**browser_options)
|
|
171
|
-
page = context.new_page()
|
|
172
|
-
page.goto('https://example.com')
|
|
173
|
-
# Continue with your scraping logic
|
|
174
|
-
browser.close()
|
|
175
|
-
```
|
|
171
|
+
---
|
|
176
172
|
|
|
177
173
|
## 🌟 Pro Tips
|
|
178
174
|
|
|
179
|
-
1. **Rotate Agents
|
|
175
|
+
1. **Rotate Agents Regularly**
|
|
176
|
+
Refresh your agent pool periodically to avoid detection.
|
|
177
|
+
|
|
180
178
|
```python
|
|
181
179
|
agent = LitAgent()
|
|
182
|
-
for
|
|
180
|
+
for i in range(10):
|
|
183
181
|
response = requests.get(url, headers={'User-Agent': agent.random()})
|
|
184
|
-
if
|
|
182
|
+
if i % 3 == 0:
|
|
185
183
|
agent.refresh()
|
|
186
184
|
```
|
|
187
185
|
|
|
188
|
-
2. **Device-Specific Scraping
|
|
186
|
+
2. **Device-Specific Scraping**
|
|
187
|
+
Use device-specific agents for different platforms.
|
|
188
|
+
|
|
189
189
|
```python
|
|
190
190
|
# Mobile site scraping
|
|
191
191
|
mobile_response = requests.get(
|
|
192
192
|
'https://m.example.com',
|
|
193
193
|
headers={'User-Agent': agent.mobile()}
|
|
194
194
|
)
|
|
195
|
-
|
|
196
195
|
# Desktop site scraping
|
|
197
196
|
desktop_response = requests.get(
|
|
198
197
|
'https://example.com',
|
|
@@ -200,7 +199,9 @@ def browse_with_playwright():
|
|
|
200
199
|
)
|
|
201
200
|
```
|
|
202
201
|
|
|
203
|
-
3. **Browser Consistency
|
|
202
|
+
3. **Browser Consistency**
|
|
203
|
+
Stick to one browser type per session for realism.
|
|
204
|
+
|
|
204
205
|
```python
|
|
205
206
|
chrome_agent = agent.chrome()
|
|
206
207
|
headers = {
|
|
@@ -210,11 +211,11 @@ def browse_with_playwright():
|
|
|
210
211
|
# Use these headers for all requests in this session
|
|
211
212
|
```
|
|
212
213
|
|
|
213
|
-
4. **
|
|
214
|
+
4. **Browser Fingerprinting & IP Rotation**
|
|
215
|
+
Generate consistent browser fingerprints and rotate IPs.
|
|
216
|
+
|
|
214
217
|
```python
|
|
215
|
-
# Create consistent browser fingerprinting
|
|
216
218
|
fingerprint = agent.generate_fingerprint(browser="chrome")
|
|
217
|
-
|
|
218
219
|
headers = {
|
|
219
220
|
'User-Agent': fingerprint['user_agent'],
|
|
220
221
|
'Accept-Language': fingerprint['accept_language'],
|
|
@@ -223,33 +224,37 @@ def browse_with_playwright():
|
|
|
223
224
|
'Sec-Ch-Ua-Platform': fingerprint['platform'],
|
|
224
225
|
'X-Forwarded-For': fingerprint['x-forwarded-for']
|
|
225
226
|
}
|
|
226
|
-
|
|
227
|
-
# Use this consistent set for all session requests while rotating IPs
|
|
228
227
|
```
|
|
229
228
|
|
|
230
|
-
5. **
|
|
229
|
+
5. **Multi-threading Support**
|
|
230
|
+
Use thread-safe mode for concurrent requests.
|
|
231
|
+
|
|
231
232
|
```python
|
|
232
233
|
import concurrent.futures
|
|
233
234
|
from webscout import LitAgent
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
235
|
+
import requests
|
|
236
|
+
|
|
237
|
+
agent = LitAgent(thread_safe=True)
|
|
238
|
+
|
|
237
239
|
def fetch_url(url):
|
|
238
240
|
headers = {'User-Agent': agent.random()}
|
|
239
241
|
return requests.get(url, headers=headers).text
|
|
240
|
-
|
|
242
|
+
|
|
241
243
|
urls = ['https://example1.com', 'https://example2.com', 'https://example3.com']
|
|
242
|
-
|
|
243
244
|
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
|
|
244
245
|
results = list(executor.map(fetch_url, urls))
|
|
245
246
|
```
|
|
246
247
|
|
|
248
|
+
---
|
|
249
|
+
|
|
247
250
|
## 🔧 Supported Browsers & Devices
|
|
248
251
|
|
|
249
252
|
- **Browsers**: Chrome, Firefox, Safari, Edge, Opera, Brave, Vivaldi
|
|
250
253
|
- **Operating Systems**: Windows, macOS, Linux, Android, iOS, Chrome OS
|
|
251
254
|
- **Devices**: Mobile phones, Tablets, Desktops, Game consoles, Smart TVs, Wearables
|
|
252
255
|
|
|
256
|
+
---
|
|
257
|
+
|
|
253
258
|
## 🎨 Why LitAgent?
|
|
254
259
|
|
|
255
260
|
- 🚀 Modern and up-to-date user agents
|
|
@@ -263,7 +268,11 @@ def browse_with_playwright():
|
|
|
263
268
|
- 🧵 Thread-safe operation
|
|
264
269
|
- 🕰️ Automatic refresh scheduling
|
|
265
270
|
|
|
266
|
-
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## 📊 Analytics and Reporting
|
|
274
|
+
|
|
275
|
+
Get statistics on your agent usage and export your data.
|
|
267
276
|
|
|
268
277
|
```python
|
|
269
278
|
# Get statistics on your agent usage
|
|
@@ -276,10 +285,14 @@ print(f"Detection avoidance rate: {stats.avoidance_rate}%")
|
|
|
276
285
|
agent.export_stats('agent_usage.json')
|
|
277
286
|
```
|
|
278
287
|
|
|
288
|
+
---
|
|
289
|
+
|
|
279
290
|
## 📋 Installation
|
|
280
291
|
|
|
281
292
|
```bash
|
|
282
293
|
pip install webscout
|
|
283
294
|
```
|
|
284
295
|
|
|
296
|
+
---
|
|
297
|
+
|
|
285
298
|
Made with 💖 by the HelpingAI team
|
webscout/litagent/agent.py
CHANGED
|
@@ -93,15 +93,23 @@ class LitAgent:
|
|
|
93
93
|
self._stats["device_usage"][device_type] = self._stats["device_usage"].get(device_type, 0) + 1
|
|
94
94
|
|
|
95
95
|
def random(self) -> str:
|
|
96
|
-
"""Get a random user agent! 🎲"""
|
|
96
|
+
"""Get a random user agent! 🎲 (with blacklist/whitelist support)"""
|
|
97
|
+
if hasattr(self, '_whitelist') and self._whitelist:
|
|
98
|
+
pool = list(self._whitelist)
|
|
99
|
+
else:
|
|
100
|
+
pool = [a for a in self.agents if not hasattr(self, '_blacklist') or a not in self._blacklist]
|
|
101
|
+
if not pool:
|
|
102
|
+
pool = self.agents
|
|
97
103
|
if self.thread_safe and self.lock:
|
|
98
104
|
with self.lock:
|
|
99
|
-
agent = random.choice(
|
|
105
|
+
agent = random.choice(pool)
|
|
100
106
|
self._update_stats()
|
|
107
|
+
self._add_to_history(agent)
|
|
101
108
|
return agent
|
|
102
109
|
else:
|
|
103
|
-
agent = random.choice(
|
|
110
|
+
agent = random.choice(pool)
|
|
104
111
|
self._update_stats()
|
|
112
|
+
self._add_to_history(agent)
|
|
105
113
|
return agent
|
|
106
114
|
|
|
107
115
|
def browser(self, name: str) -> str:
|
|
@@ -330,7 +338,8 @@ class LitAgent:
|
|
|
330
338
|
self._update_stats(browser_type=browser, device_type=device_type)
|
|
331
339
|
return agent
|
|
332
340
|
|
|
333
|
-
|
|
341
|
+
@staticmethod
|
|
342
|
+
def generate_fingerprint(browser: Optional[str] = None) -> Dict[str, str]:
|
|
334
343
|
"""
|
|
335
344
|
Generate a consistent browser fingerprint for anti-fingerprinting purposes.
|
|
336
345
|
|
|
@@ -347,13 +356,14 @@ class LitAgent:
|
|
|
347
356
|
Dict[str, str]: A dictionary containing fingerprinting headers and values.
|
|
348
357
|
"""
|
|
349
358
|
# Get a random user agent using the random() method
|
|
350
|
-
|
|
359
|
+
agent = LitAgent()
|
|
360
|
+
user_agent = agent.random()
|
|
351
361
|
|
|
352
362
|
# If browser is specified, try to get a matching one
|
|
353
363
|
if browser:
|
|
354
364
|
browser = browser.lower()
|
|
355
365
|
if browser in BROWSERS:
|
|
356
|
-
user_agent =
|
|
366
|
+
user_agent = agent.browser(browser)
|
|
357
367
|
|
|
358
368
|
accept_language = random.choice(FINGERPRINTS["accept_language"])
|
|
359
369
|
accept = random.choice(FINGERPRINTS["accept"])
|
|
@@ -367,7 +377,7 @@ class LitAgent:
|
|
|
367
377
|
sec_ch_ua = FINGERPRINTS["sec_ch_ua"][browser_name].format(version, version)
|
|
368
378
|
break
|
|
369
379
|
|
|
370
|
-
ip =
|
|
380
|
+
ip = agent.rotate_ip()
|
|
371
381
|
fingerprint = {
|
|
372
382
|
"user_agent": user_agent,
|
|
373
383
|
"accept_language": accept_language,
|
|
@@ -379,10 +389,10 @@ class LitAgent:
|
|
|
379
389
|
"x-client-ip": ip,
|
|
380
390
|
"forwarded": f"for={ip};proto=https",
|
|
381
391
|
"x-forwarded-proto": "https",
|
|
382
|
-
"x-request-id":
|
|
392
|
+
"x-request-id": agent.random_id(8) if hasattr(agent, 'random_id') else ''.join(random.choices('0123456789abcdef', k=8)),
|
|
383
393
|
}
|
|
384
394
|
|
|
385
|
-
|
|
395
|
+
agent._update_stats(browser_type=browser)
|
|
386
396
|
return fingerprint
|
|
387
397
|
|
|
388
398
|
def refresh(self) -> None:
|
|
@@ -472,6 +482,63 @@ class LitAgent:
|
|
|
472
482
|
"""Generate a random identifier string."""
|
|
473
483
|
return ''.join(random.choices('0123456789abcdef', k=length)).lower()
|
|
474
484
|
|
|
485
|
+
def wearable(self) -> str:
|
|
486
|
+
"""Get a wearable device agent! ⌚"""
|
|
487
|
+
wearable_type = random.choice(DEVICES['wearable'])
|
|
488
|
+
# Example user agent for wearables (simplified)
|
|
489
|
+
if 'Apple Watch' in wearable_type:
|
|
490
|
+
agent = f"Mozilla/5.0 (AppleWatch; CPU WatchOS like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/9.0 Mobile/13S344 Safari/602.1"
|
|
491
|
+
elif 'Samsung' in wearable_type:
|
|
492
|
+
agent = f"Mozilla/5.0 (Linux; Tizen 3.0; {wearable_type}) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/1.0"
|
|
493
|
+
elif 'Fitbit' in wearable_type:
|
|
494
|
+
agent = f"Mozilla/5.0 (Linux; {wearable_type}) AppleWebKit/537.36 (KHTML, like Gecko)"
|
|
495
|
+
elif 'Garmin' in wearable_type:
|
|
496
|
+
agent = f"Mozilla/5.0 (Linux; {wearable_type}) AppleWebKit/537.36 (KHTML, like Gecko)"
|
|
497
|
+
else:
|
|
498
|
+
agent = self.random()
|
|
499
|
+
self._update_stats(device_type="wearable")
|
|
500
|
+
return agent
|
|
501
|
+
|
|
502
|
+
def set_proxy_pool(self, proxies: List[str]):
|
|
503
|
+
"""Set a pool of proxies for rotation."""
|
|
504
|
+
self._proxy_pool = proxies
|
|
505
|
+
self._proxy_index = 0
|
|
506
|
+
|
|
507
|
+
def rotate_proxy(self) -> Optional[str]:
|
|
508
|
+
"""Rotate through the proxy pool and return the next proxy."""
|
|
509
|
+
if not hasattr(self, '_proxy_pool') or not self._proxy_pool:
|
|
510
|
+
return None
|
|
511
|
+
proxy = self._proxy_pool[self._proxy_index]
|
|
512
|
+
self._proxy_index = (self._proxy_index + 1) % len(self._proxy_pool)
|
|
513
|
+
return proxy
|
|
514
|
+
|
|
515
|
+
def add_to_blacklist(self, agent: str):
|
|
516
|
+
"""Add a user agent to the blacklist."""
|
|
517
|
+
if not hasattr(self, '_blacklist'):
|
|
518
|
+
self._blacklist = set()
|
|
519
|
+
self._blacklist.add(agent)
|
|
520
|
+
|
|
521
|
+
def add_to_whitelist(self, agent: str):
|
|
522
|
+
"""Add a user agent to the whitelist."""
|
|
523
|
+
if not hasattr(self, '_whitelist'):
|
|
524
|
+
self._whitelist = set()
|
|
525
|
+
self._whitelist.add(agent)
|
|
526
|
+
|
|
527
|
+
def _add_to_history(self, agent: str):
|
|
528
|
+
if not hasattr(self, '_history'):
|
|
529
|
+
self._history = []
|
|
530
|
+
self._history.append(agent)
|
|
531
|
+
if len(self._history) > 50:
|
|
532
|
+
self._history.pop(0)
|
|
533
|
+
|
|
534
|
+
def get_history(self) -> List[str]:
|
|
535
|
+
"""Get the last 50 user agents served."""
|
|
536
|
+
return getattr(self, '_history', [])
|
|
537
|
+
|
|
538
|
+
def validate_agent(self, agent: str) -> bool:
|
|
539
|
+
"""Validate if a user agent string is realistic (basic check)."""
|
|
540
|
+
return agent.startswith("Mozilla/5.0") and any(b in agent for b in BROWSERS.keys())
|
|
541
|
+
|
|
475
542
|
if __name__ == "__main__":
|
|
476
543
|
# Test it out! 🧪
|
|
477
544
|
agent = LitAgent()
|
|
@@ -484,9 +551,32 @@ if __name__ == "__main__":
|
|
|
484
551
|
print("Tablet:", agent.tablet())
|
|
485
552
|
print("Smart TV:", agent.smart_tv())
|
|
486
553
|
print("Gaming:", agent.gaming())
|
|
554
|
+
print("Wearable:", agent.wearable())
|
|
487
555
|
|
|
488
556
|
# Test custom agent
|
|
489
557
|
print("Custom:", agent.custom(browser="chrome", os="windows", os_version="10.0"))
|
|
490
558
|
|
|
491
559
|
# Test fingerprinting
|
|
492
560
|
print("Fingerprint:", agent.generate_fingerprint("chrome"))
|
|
561
|
+
|
|
562
|
+
# Test proxy rotation
|
|
563
|
+
agent.set_proxy_pool(["http://proxy1.com", "http://proxy2.com"])
|
|
564
|
+
print("Proxy 1:", agent.rotate_proxy())
|
|
565
|
+
print("Proxy 2:", agent.rotate_proxy())
|
|
566
|
+
|
|
567
|
+
# Test blacklist/whitelist
|
|
568
|
+
agent.add_to_blacklist("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
|
|
569
|
+
agent.add_to_whitelist("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
|
570
|
+
print("Blacklisted:", agent.random())
|
|
571
|
+
print("Whitelisted:", agent.random())
|
|
572
|
+
|
|
573
|
+
# Test agent history
|
|
574
|
+
for _ in range(55):
|
|
575
|
+
agent.random()
|
|
576
|
+
print("History:", agent.get_history())
|
|
577
|
+
|
|
578
|
+
# Test agent validation
|
|
579
|
+
print("Valid agent:", agent.validate_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"))
|
|
580
|
+
print("Invalid agent:", agent.validate_agent("InvalidUserAgentString"))
|
|
581
|
+
ip = agent.rotate_ip()
|
|
582
|
+
print(ip) # 192.168.1.10 (example)
|
webscout/scout/core/scout.py
CHANGED
|
@@ -265,7 +265,7 @@ class Scout:
|
|
|
265
265
|
|
|
266
266
|
return json.dumps(_tag_to_dict(self._soup), indent=indent)
|
|
267
267
|
|
|
268
|
-
def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs) -> ScoutSearchResult:
|
|
268
|
+
def find(self, name=None, attrs={}, recursive=True, text=None, class_=None, **kwargs) -> ScoutSearchResult:
|
|
269
269
|
"""
|
|
270
270
|
Find the first matching element.
|
|
271
271
|
|
|
@@ -278,10 +278,10 @@ class Scout:
|
|
|
278
278
|
Returns:
|
|
279
279
|
ScoutSearchResult: First matching element
|
|
280
280
|
"""
|
|
281
|
-
result = self._soup.find(name, attrs, recursive, text, **kwargs)
|
|
281
|
+
result = self._soup.find(name, attrs, recursive, text, limit=1, class_=class_, **kwargs)
|
|
282
282
|
return ScoutSearchResult([result]) if result else ScoutSearchResult([])
|
|
283
283
|
|
|
284
|
-
def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) -> ScoutSearchResult:
|
|
284
|
+
def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, class_=None, **kwargs) -> ScoutSearchResult:
|
|
285
285
|
"""
|
|
286
286
|
Find all matching elements.
|
|
287
287
|
|
|
@@ -295,7 +295,7 @@ class Scout:
|
|
|
295
295
|
Returns:
|
|
296
296
|
ScoutSearchResult: List of matching elements
|
|
297
297
|
"""
|
|
298
|
-
results = self._soup.find_all(name, attrs, recursive, text, limit, **kwargs)
|
|
298
|
+
results = self._soup.find_all(name, attrs, recursive, text, limit, class_=class_, **kwargs)
|
|
299
299
|
return ScoutSearchResult(results)
|
|
300
300
|
|
|
301
301
|
def find_parent(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
|
|
@@ -474,6 +474,19 @@ class Scout:
|
|
|
474
474
|
sentences = tokenizer.tokenize(text)
|
|
475
475
|
return "\n\n".join(sentences)
|
|
476
476
|
|
|
477
|
+
def get_text_robust(self, separator=' ', strip=False, types=None, encoding_fallbacks=None) -> str:
|
|
478
|
+
"""Extract text robustly, trying multiple encodings if needed."""
|
|
479
|
+
try:
|
|
480
|
+
return self.get_text(separator, strip, types)
|
|
481
|
+
except UnicodeDecodeError:
|
|
482
|
+
if encoding_fallbacks:
|
|
483
|
+
for enc in encoding_fallbacks:
|
|
484
|
+
try:
|
|
485
|
+
return self._soup.get_text(separator, strip, types).encode(enc).decode(enc)
|
|
486
|
+
except Exception:
|
|
487
|
+
continue
|
|
488
|
+
raise
|
|
489
|
+
|
|
477
490
|
def remove_tags(self, tags: List[str]) -> None:
|
|
478
491
|
"""
|
|
479
492
|
Remove specified tags and their contents from the document.
|
|
@@ -543,29 +556,19 @@ class Scout:
|
|
|
543
556
|
"""
|
|
544
557
|
old_tag.replace_with(new_tag)
|
|
545
558
|
|
|
546
|
-
def encode(self, encoding='utf-8') -> bytes:
|
|
547
|
-
"""
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
Returns:
|
|
554
|
-
bytes: Encoded document
|
|
555
|
-
"""
|
|
556
|
-
return str(self._soup).encode(encoding)
|
|
557
|
-
|
|
558
|
-
def decode(self, encoding='utf-8') -> str:
|
|
559
|
-
"""
|
|
560
|
-
Decode the document from a specific encoding.
|
|
561
|
-
|
|
562
|
-
Args:
|
|
563
|
-
encoding (str, optional): Encoding to use
|
|
559
|
+
def encode(self, encoding='utf-8', errors='strict') -> bytes:
|
|
560
|
+
"""Encode the document to a specific encoding with error handling."""
|
|
561
|
+
try:
|
|
562
|
+
return str(self._soup).encode(encoding, errors)
|
|
563
|
+
except Exception:
|
|
564
|
+
return str(self._soup).encode('utf-8', errors)
|
|
564
565
|
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
566
|
+
def decode(self, encoding='utf-8', errors='strict') -> str:
|
|
567
|
+
"""Decode the document from a specific encoding with error handling."""
|
|
568
|
+
try:
|
|
569
|
+
return str(self._soup).decode(encoding, errors)
|
|
570
|
+
except Exception:
|
|
571
|
+
return str(self._soup)
|
|
569
572
|
|
|
570
573
|
def __str__(self) -> str:
|
|
571
574
|
"""
|
|
@@ -605,3 +608,78 @@ class Scout:
|
|
|
605
608
|
decoded_markup = re.sub(r'\s+', ' ', decoded_markup)
|
|
606
609
|
|
|
607
610
|
return decoded_markup
|
|
611
|
+
|
|
612
|
+
def wrap(self, wrapper_tag: Tag) -> Tag:
|
|
613
|
+
"""Wrap the root tag in another tag with error handling."""
|
|
614
|
+
try:
|
|
615
|
+
return self._soup.wrap(wrapper_tag)
|
|
616
|
+
except Exception:
|
|
617
|
+
return wrapper_tag
|
|
618
|
+
|
|
619
|
+
def unwrap(self) -> None:
|
|
620
|
+
"""Unwrap the root tag, keeping its contents in the parent, with error handling."""
|
|
621
|
+
try:
|
|
622
|
+
self._soup.unwrap()
|
|
623
|
+
except Exception:
|
|
624
|
+
pass
|
|
625
|
+
|
|
626
|
+
def insert_before(self, new_element: Tag) -> None:
|
|
627
|
+
"""Insert a tag or string immediately before the root tag with error handling."""
|
|
628
|
+
try:
|
|
629
|
+
self._soup.insert_before(new_element)
|
|
630
|
+
except Exception:
|
|
631
|
+
pass
|
|
632
|
+
|
|
633
|
+
def insert_after(self, new_element: Tag) -> None:
|
|
634
|
+
"""Insert a tag or string immediately after the root tag with error handling."""
|
|
635
|
+
try:
|
|
636
|
+
self._soup.insert_after(new_element)
|
|
637
|
+
except Exception:
|
|
638
|
+
pass
|
|
639
|
+
|
|
640
|
+
def append(self, tag: Tag) -> None:
|
|
641
|
+
"""Append a tag to the root tag with error handling."""
|
|
642
|
+
try:
|
|
643
|
+
self._soup.append(tag)
|
|
644
|
+
except Exception:
|
|
645
|
+
pass
|
|
646
|
+
|
|
647
|
+
@property
|
|
648
|
+
def descendants(self):
|
|
649
|
+
"""Yield all descendants of the root tag in document order."""
|
|
650
|
+
return self._soup.descendants
|
|
651
|
+
|
|
652
|
+
@property
|
|
653
|
+
def parents(self):
|
|
654
|
+
"""Yield all parents of the root tag up the tree."""
|
|
655
|
+
return self._soup.parents
|
|
656
|
+
|
|
657
|
+
@property
|
|
658
|
+
def next_element(self):
|
|
659
|
+
"""Return the next element in document order after the root tag."""
|
|
660
|
+
return self._soup.next_element
|
|
661
|
+
|
|
662
|
+
@property
|
|
663
|
+
def previous_element(self):
|
|
664
|
+
"""Return the previous element in document order before the root tag."""
|
|
665
|
+
return self._soup.previous_element
|
|
666
|
+
|
|
667
|
+
def fetch_and_parse(self, url: str, requests_session=None, **kwargs) -> 'Scout':
|
|
668
|
+
"""Fetch HTML from a URL using requests and parse it with Scout."""
|
|
669
|
+
import requests
|
|
670
|
+
session = requests_session or requests.Session()
|
|
671
|
+
resp = session.get(url, **kwargs)
|
|
672
|
+
return Scout(resp.content, features=self.features)
|
|
673
|
+
|
|
674
|
+
def tables_to_dataframe(self, table_index=0, pandas_module=None):
|
|
675
|
+
"""Convert the nth table in the document to a pandas DataFrame."""
|
|
676
|
+
import pandas as pd
|
|
677
|
+
if pandas_module:
|
|
678
|
+
pd = pandas_module
|
|
679
|
+
tables = self.find_all('table')
|
|
680
|
+
if not tables or table_index >= len(tables):
|
|
681
|
+
return None
|
|
682
|
+
table = tables[table_index]
|
|
683
|
+
rows = table.find_all('tr')
|
|
684
|
+
data = [[cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])] for row in rows]
|
|
685
|
+
return pd.DataFrame(data)
|