webscout 8.3.5__py3-none-any.whl → 8.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release of webscout has been marked as potentially problematic.
- webscout/Bard.py +12 -6
- webscout/DWEBS.py +66 -57
- webscout/Provider/{UNFINISHED → AISEARCH}/PERPLEXED_search.py +34 -74
- webscout/Provider/AISEARCH/__init__.py +1 -1
- webscout/Provider/Deepinfra.py +6 -0
- webscout/Provider/Flowith.py +6 -1
- webscout/Provider/GithubChat.py +1 -0
- webscout/Provider/GptOss.py +207 -0
- webscout/Provider/Kimi.py +445 -0
- webscout/Provider/Netwrck.py +3 -6
- webscout/Provider/OPENAI/README.md +2 -1
- webscout/Provider/OPENAI/TogetherAI.py +50 -55
- webscout/Provider/OPENAI/__init__.py +4 -2
- webscout/Provider/OPENAI/copilot.py +20 -4
- webscout/Provider/OPENAI/deepinfra.py +6 -0
- webscout/Provider/OPENAI/e2b.py +60 -8
- webscout/Provider/OPENAI/flowith.py +4 -3
- webscout/Provider/OPENAI/generate_api_key.py +48 -0
- webscout/Provider/OPENAI/gptoss.py +288 -0
- webscout/Provider/OPENAI/kimi.py +469 -0
- webscout/Provider/OPENAI/netwrck.py +8 -12
- webscout/Provider/OPENAI/refact.py +274 -0
- webscout/Provider/OPENAI/textpollinations.py +3 -6
- webscout/Provider/OPENAI/toolbaz.py +1 -0
- webscout/Provider/TTI/bing.py +14 -2
- webscout/Provider/TTI/together.py +10 -9
- webscout/Provider/TTS/README.md +0 -1
- webscout/Provider/TTS/__init__.py +0 -1
- webscout/Provider/TTS/base.py +479 -159
- webscout/Provider/TTS/deepgram.py +409 -156
- webscout/Provider/TTS/elevenlabs.py +425 -111
- webscout/Provider/TTS/freetts.py +317 -140
- webscout/Provider/TTS/gesserit.py +192 -128
- webscout/Provider/TTS/murfai.py +248 -113
- webscout/Provider/TTS/openai_fm.py +347 -129
- webscout/Provider/TTS/speechma.py +620 -586
- webscout/Provider/TextPollinationsAI.py +3 -6
- webscout/Provider/TogetherAI.py +50 -55
- webscout/Provider/UNFINISHED/VercelAIGateway.py +339 -0
- webscout/Provider/__init__.py +2 -90
- webscout/Provider/cerebras.py +83 -33
- webscout/Provider/copilot.py +42 -23
- webscout/Provider/toolbaz.py +1 -0
- webscout/conversation.py +22 -20
- webscout/sanitize.py +14 -10
- webscout/scout/README.md +20 -23
- webscout/scout/core/crawler.py +125 -38
- webscout/scout/core/scout.py +26 -5
- webscout/version.py +1 -1
- webscout/webscout_search.py +13 -6
- webscout/webscout_search_async.py +10 -8
- webscout/yep_search.py +13 -5
- {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/METADATA +2 -1
- {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/RECORD +59 -56
- webscout/Provider/Glider.py +0 -225
- webscout/Provider/OPENAI/c4ai.py +0 -394
- webscout/Provider/OPENAI/glider.py +0 -330
- webscout/Provider/TTS/sthir.py +0 -94
- /webscout/Provider/{samurai.py → UNFINISHED/samurai.py} +0 -0
- {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/WHEEL +0 -0
- {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/entry_points.txt +0 -0
- {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/top_level.txt +0 -0
webscout/Provider/cerebras.py
CHANGED

```diff
@@ -1,34 +1,48 @@
 
 import re
+
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
+import json
+from typing import Any, Dict, Generator, List, Optional, Union
+
 import curl_cffi
 from curl_cffi.requests import Session
-
-import os
-from typing import Any, Dict, Optional, Generator, List, Union
-from webscout.AIutel import Optimizers, Conversation, AwesomePrompts, sanitize_stream # Import sanitize_stream
-from webscout.AIbase import Provider
+
 from webscout import exceptions
+from webscout.AIbase import Provider
+from webscout.AIutel import ( # Import sanitize_stream
+    AwesomePrompts,
+    Conversation,
+    Optimizers,
+    sanitize_stream,
+)
 from webscout.litagent import LitAgent as UserAgent
 
+
 class Cerebras(Provider):
     """
     A class to interact with the Cerebras API using a cookie for authentication.
     """
-
+
     AVAILABLE_MODELS = [
-        "
-        "
-        "
-        "llama-4-scout-17b-16e-instruct",
+        "qwen-3-coder-480b",
+        "qwen-3-235b-a22b-instruct-2507",
+        "qwen-3-235b-a22b-thinking-2507",
         "qwen-3-32b",
-
-
+        "llama-3.3-70b",
+        "llama-4-maverick-17b-128e-instruct"
    ]
 
     def __init__(
         self,
+        cookie_path: str = None,
         is_conversation: bool = True,
-        max_tokens: int =
+        max_tokens: int = 40000,
         timeout: int = 30,
         intro: str = None,
         filepath: str = None,
@@ -36,9 +50,11 @@ class Cerebras(Provider):
         proxies: dict = {},
         history_offset: int = 10250,
         act: str = None,
-
-        model: str = "
+        api_key: str = None,
+        model: str = "qwen-3-coder-480b",
         system_prompt: str = "You are a helpful assistant.",
+        temperature: float = 0.7,
+        top_p: float = 0.8,
     ):
         # Validate model choice
         if model not in self.AVAILABLE_MODELS:
@@ -52,15 +68,26 @@ class Cerebras(Provider):
         self.system_prompt = system_prompt
         self.is_conversation = is_conversation
         self.max_tokens_to_sample = max_tokens
+        self.temperature = temperature
+        self.top_p = top_p
         self.last_response = {}
 
         self.session = Session() # Initialize curl_cffi session
 
-        #
-
-        self.api_key =
-
-
+        # Handle API key - either provided directly or retrieved from cookies
+        if api_key:
+            self.api_key = api_key.strip()
+            # Basic validation for API key format
+            if not self.api_key or len(self.api_key) < 10:
+                raise ValueError("Invalid API key format. API key must be at least 10 characters long.")
+        elif cookie_path:
+            # Get API key from cookies
+            try:
+                self.api_key = self.get_demo_api_key(cookie_path)
+            except Exception as e:
+                raise exceptions.APIConnectionError(f"Failed to initialize Cerebras client: {e}")
+        else:
+            raise ValueError("Either api_key must be provided or cookie_path must be specified")
 
         # Initialize optimizers
         self.__available_optimizers = (
@@ -72,16 +99,16 @@ class Cerebras(Provider):
         # Initialize conversation settings
         Conversation.intro = (
             AwesomePrompts().get_act(
-                act, raise_not_found=True, default=
+                act, raise_not_found=True, default="You are a helpful assistant.", case_insensitive=True
             )
             if act
-            else
+            else "You are a helpful assistant."
         )
         self.conversation = Conversation(
             is_conversation, self.max_tokens_to_sample, filepath, update_file
         )
         self.conversation.history_offset = history_offset
-
+
         # Apply proxies to the session
         self.session.proxies = proxies
 
@@ -105,8 +132,10 @@ class Cerebras(Provider):
             return chunk.get("choices", [{}])[0].get("delta", {}).get("content")
         return None
 
-    def get_demo_api_key(self, cookie_path: str) -> str: # Keep this using requests or switch to curl_cffi
+    def get_demo_api_key(self, cookie_path: str = None) -> str: # Keep this using requests or switch to curl_cffi
         """Retrieves the demo API key using the provided cookie."""
+        if not cookie_path:
+            raise ValueError("cookie_path must be provided when using cookie-based authentication")
         try:
             with open(cookie_path, "r") as file:
                 cookies = {item["name"]: item["value"] for item in json.load(file)}
@@ -159,7 +188,10 @@ class Cerebras(Provider):
         payload = {
             "model": self.model,
             "messages": messages,
-            "stream": stream
+            "stream": stream,
+            "max_tokens": self.max_tokens_to_sample,
+            "temperature": self.temperature,
+            "top_p": self.top_p
         }
 
         try:
@@ -197,8 +229,26 @@ class Cerebras(Provider):
 
         except curl_cffi.CurlError as e:
             raise exceptions.APIConnectionError(f"Request failed (CurlError): {e}") from e
-        except Exception as e:
-
+        except Exception as e:
+            # Check if it's an HTTP error with status code
+            if hasattr(e, 'response') and hasattr(e.response, 'status_code'):
+                status_code = e.response.status_code
+                if status_code == 401:
+                    raise exceptions.APIConnectionError(
+                        "Authentication failed (401): Invalid API key. Please check your API key and try again."
+                    ) from e
+                elif status_code == 403:
+                    raise exceptions.APIConnectionError(
+                        "Access forbidden (403): Your API key may not have permission to access this resource."
+                    ) from e
+                elif status_code == 429:
+                    raise exceptions.APIConnectionError(
+                        "Rate limit exceeded (429): Too many requests. Please wait and try again."
+                    ) from e
+                else:
+                    raise exceptions.APIConnectionError(f"HTTP {status_code} error: {e}") from e
+            else:
+                raise exceptions.APIConnectionError(f"Request failed: {e}") from e
 
     def ask(
         self,
@@ -225,7 +275,7 @@ class Cerebras(Provider):
 
         try:
             response = self._make_request(messages, stream)
-
+
             if stream:
                 # Wrap the generator to yield dicts or raw strings
                 def stream_wrapper():
@@ -256,7 +306,7 @@ class Cerebras(Provider):
         """Chat with the model."""
         # Ask returns a generator for stream=True, dict/str for stream=False
         response_gen_or_dict = self.ask(prompt, stream, raw=False, optimizer=optimizer, conversationally=conversationally)
-
+
         if stream:
             # Wrap the generator from ask() to get message text
             def stream_wrapper():
@@ -276,14 +326,14 @@ class Cerebras(Provider):
 
 if __name__ == "__main__":
     from rich import print
-
+
     # Example usage
     cerebras = Cerebras(
-
-        model='
+        api_key='csk-**********************', # Replace with your actual API key
+        model='qwen-3-235b-a22b-instruct-2507',
         system_prompt="You are a helpful AI assistant."
     )
-
+
     # Test with streaming
    response = cerebras.chat("Hello!", stream=True)
     for chunk in response:
```
webscout/Provider/copilot.py
CHANGED

```diff
@@ -1,17 +1,21 @@
-import os
-import json
-import base64
 import asyncio
+import base64
+import json
+import os
+from typing import Any, Dict, Generator, Union
 from urllib.parse import quote
-from typing import Optional, Dict, Any, List, Union, Generator
 
-
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
+from curl_cffi.requests import CurlWsFlag, Session
 
-from webscout.AIutel import Optimizers
-from webscout.AIutel import Conversation
-from webscout.AIutel import AwesomePrompts, sanitize_stream
-from webscout.AIbase import Provider, AsyncProvider
 from webscout import exceptions
+from webscout.AIbase import Provider
+from webscout.AIutel import AwesomePrompts, Conversation, Optimizers
 from webscout.litagent import LitAgent
 
 try:
@@ -41,12 +45,17 @@ class Copilot(Provider):
     """
     A class to interact with the Microsoft Copilot API.
     """
-
+
     label = "Microsoft Copilot"
     url = "https://copilot.microsoft.com"
     websocket_url = "wss://copilot.microsoft.com/c/api/chat?api-version=2"
     conversation_url = f"{url}/c/api/conversations"
-    AVAILABLE_MODELS = ["Copilot", "Think Deeper"]
+    AVAILABLE_MODELS = ["Copilot", "Think Deeper", "Smart"]
+    MODEL_ALIASES = {
+        "gpt-4o": "Copilot",
+        "o4-mini": "Think Deeper",
+        "gpt-5": "Smart",
+    }
     _access_token: str = None
     _cookies: dict = None
 
@@ -64,9 +73,12 @@ class Copilot(Provider):
         model: str = "Copilot"
     ):
         """Initializes the Copilot API client."""
-
+        # Map alias to real model name if needed
+        real_model = self.MODEL_ALIASES.get(model, model)
+        if real_model not in self.AVAILABLE_MODELS:
             raise ValueError(f"Invalid model: {model}. Choose from: {self.AVAILABLE_MODELS}")
-
+        self.model = real_model
+
         # Use LitAgent for user-agent
         self.headers = {
             'User-Agent': LitAgent().random(),
@@ -79,7 +91,7 @@ class Copilot(Provider):
             'Sec-Fetch-Mode': 'cors',
             'Sec-Fetch-Site': 'same-origin',
         }
-
+
         self.is_conversation = is_conversation
         self.max_tokens_to_sample = max_tokens
         self.timeout = timeout
@@ -253,6 +265,12 @@ class Copilot(Provider):
         # WebSocket connection
         wss = session.ws_connect(websocket_url)
         wss.send(json.dumps({"event": "setOptions", "supportedCards": ["weather", "local", "image", "sports", "video", "ads", "finance"], "ads": {"supportedTypes": ["multimedia", "product", "tourActivity", "propertyPromotion", "text"]}}))
+        if self.model == "Smart":
+            mode_value = "smart"
+        elif "Think" in self.model:
+            mode_value = "reasoning"
+        else:
+            mode_value = "chat"
         wss.send(json.dumps({
             "event": "send",
             "conversationId": conversation_id,
@@ -260,7 +278,8 @@ class Copilot(Provider):
                 "type": "text",
                 "text": conversation_prompt,
             }],
-            "mode":
+            "mode": mode_value,
+            "model": self.model
         }).encode(), CurlWsFlag.TEXT)
 
         # Event-driven response loop
@@ -307,8 +326,8 @@ class Copilot(Provider):
         **kwargs
     ) -> Union[str, Generator]:
         def for_stream():
-            for response in self.ask(prompt, True, optimizer=optimizer,
-                                     conversationally=conversationally,
+            for response in self.ask(prompt, True, optimizer=optimizer,
+                                     conversationally=conversationally,
                                      images=images, api_key=api_key, **kwargs):
                 if isinstance(response, dict):
                     if "text" in response:
@@ -320,13 +339,13 @@ class Copilot(Provider):
                         yield "\nSuggested follow-up questions:\n"
                         for suggestion in response["suggestions"]:
                             yield f"- {suggestion}\n"
-
+
         def for_non_stream():
-            response = self.ask(prompt, False, optimizer=optimizer,
+            response = self.ask(prompt, False, optimizer=optimizer,
                                 conversationally=conversationally,
                                 images=images, api_key=api_key, **kwargs)
             return self.get_message(response)
-
+
         return for_stream() if stream else for_non_stream()
 
     def get_message(self, response: dict) -> str:
@@ -379,7 +398,7 @@ def readHAR(url: str):
     for file in os.listdir(path):
         if file.endswith(".har"):
             har_files.append(os.path.join(path, file))
-
+
     for path in har_files:
         with open(path, 'rb') as file:
             try:
@@ -416,7 +435,7 @@ async def get_nodriver(proxy=None, user_data_dir=None):
 
 if __name__ == "__main__":
     from rich import print
-    ai = Copilot(timeout=900, model="
+    ai = Copilot(timeout=900, model="gpt-5")
     response = ai.chat(input("> "), stream=True)
     for chunk in response:
-        print(chunk, end="", flush=True)
+        print(chunk, end="", flush=True)
```
webscout/Provider/toolbaz.py
CHANGED
webscout/conversation.py
CHANGED

```diff
@@ -165,29 +165,19 @@ class Conversation:
         ))
 
     def _compress_history(self) -> None:
-        """
+        """Delete old history when it exceeds threshold."""
         if len(self.messages) > self.compression_threshold:
-            #
-
-
-
-                self.messages[-keep_recent:]
-            )
-
-    def _summarize_messages(self, messages: List[Message]) -> Message:
-        """Create a summary message from older messages."""
-        return Message(
-            role="system",
-            content="[History Summary] Previous conversation summarized for context",
-            metadata={"summarized_count": len(messages)}
-        )
+            # Remove oldest messages, keep only the most recent ones
+            self.messages = self.messages[-self.compression_threshold:]
+
+    # _summarize_messages removed
 
     def gen_complete_prompt(self, prompt: str, intro: Optional[str] = None) -> str:
         """Generate complete prompt with enhanced context management."""
         if not self.status:
             return prompt
 
-        intro = intro or self.intro
+        intro = intro or self.intro or ""
 
         # Add tool information if available
         tools_description = self.get_tools_description()
@@ -260,6 +250,7 @@ Your goal is to assist the user effectively. Analyze each query and choose one o
 
     def _trim_chat_history(self, chat_history: str, intro: str) -> str:
         """Trim chat history with improved token management."""
+        intro = intro or ""
         total_length = len(intro) + len(chat_history)
 
         if total_length > self.history_offset:
@@ -273,20 +264,31 @@ Your goal is to assist the user effectively. Analyze each query and choose one o
         return chat_history
 
     def add_message(self, role: str, content: str, metadata: Optional[Dict[str, Any]] = None) -> None:
-        """Add a message with enhanced validation and metadata support."""
+        """Add a message with enhanced validation and metadata support. Deletes oldest messages if total word count exceeds max_tokens_to_sample."""
         try:
             role = role.lower() # Normalize role to lowercase
             if not self.validate_message(role, content):
                 raise MessageValidationError("Invalid message role or content")
 
+            # Calculate total word count in history
+            def total_word_count(messages):
+                return sum(len(msg.content.split()) for msg in messages)
+
+            # Remove oldest messages until total word count is below limit
+            temp_messages = self.messages.copy()
+            while temp_messages and (total_word_count(temp_messages) + len(content.split()) > self.max_tokens_to_sample):
+                temp_messages.pop(0)
+
+            self.messages = temp_messages
+
             message = Message(role=role, content=content, metadata=metadata or {})
             self.messages.append(message)
-
+
             if self.file and self.update_file:
                 self._append_to_file(message)
-
+
             self._compress_history()
-
+
         except Exception as e:
             raise ConversationError(f"Failed to add message: {str(e)}") from e
 
```
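
History management is now purely size-based: the old summarization stub is gone, `_compress_history` simply keeps the most recent `compression_threshold` messages, and `add_message` pre-trims the history by word count against `max_tokens_to_sample` before appending. The trimming loop, extracted as a standalone sketch — `total_word_count` mirrors the diff, while `Message` and `trim_history` are hypothetical stand-ins, and word count is only a rough token proxy:

```python
from dataclasses import dataclass

@dataclass
class Message:
    # Minimal stand-in for webscout's Message; only .content matters here.
    role: str
    content: str

def total_word_count(messages) -> int:
    # Whitespace-delimited words across all messages — a rough token proxy.
    return sum(len(msg.content.split()) for msg in messages)

def trim_history(messages: list, new_content: str, max_tokens_to_sample: int) -> list:
    # Hypothetical wrapper around the loop the diff inlines into add_message:
    # drop the oldest messages until the incoming message fits under the cap.
    trimmed = messages.copy()
    while trimmed and total_word_count(trimmed) + len(new_content.split()) > max_tokens_to_sample:
        trimmed.pop(0)
    return trimmed

# e.g. with a cap of 5 words, the oldest message is dropped first:
history = [Message("user", "one two three"), Message("assistant", "four five")]
print(trim_history(history, "six seven", 5))  # keeps only the most recent message
```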
webscout/sanitize.py
CHANGED

```diff
@@ -143,7 +143,7 @@ def _process_chunk(
     if to_json:
         try:
             # Only strip before JSON parsing if both boundaries are incorrect
-            if sanitized_chunk[0] not in '{[' and sanitized_chunk[-1] not in '}]':
+            if len(sanitized_chunk) >= 2 and sanitized_chunk[0] not in '{[' and sanitized_chunk[-1] not in '}]':
                 sanitized_chunk = sanitized_chunk.strip()
             return json.loads(sanitized_chunk)
         except (json.JSONDecodeError, Exception) as e:
@@ -646,13 +646,14 @@ async def _sanitize_stream_async(
             f"Stream must yield strings or bytes, not {type(first_item).__name__}"
         )
 
-
-
-
-
-
-
-
+    try:
+        async for line in line_iterator:
+            if not line:
+                continue
+            buffer += line
+            while True:
+                if not found_start and start_marker:
+                    idx = buffer.find(start_marker)
                     if idx != -1:
                         found_start = True
                         buffer = buffer[idx + len(start_marker) :]
@@ -735,6 +736,9 @@ async def _sanitize_stream_async(
                     break
                 else:
                     break
+    except Exception as e:
+        import sys
+        print(f"Async stream processing error: {str(e)}", file=sys.stderr)
 
 
 def sanitize_stream(
@@ -937,7 +941,7 @@ def sanitize_stream(
         payload, intro_value, to_json, skip_markers, strip_chars,
         start_marker, end_marker, content_extractor, yield_raw_on_error,
         encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
-        skip_regexes, extract_regexes,
+        skip_regexes, extract_regexes, raw,
     )
 
     # Handle async iterables
@@ -966,6 +970,7 @@ def sanitize_stream(
 
 # --- Decorator version of sanitize_stream ---
 import functools
+import asyncio
 from typing import overload
 
 def _sanitize_stream_decorator(
@@ -1057,7 +1062,6 @@ sanitize_stream_decorator = _sanitize_stream_decorator
 lit_streamer = _sanitize_stream_decorator
 
 # Allow @sanitize_stream and @lit_streamer as decorators
-import asyncio
 sanitize_stream.__decorator__ = _sanitize_stream_decorator
 LITSTREAM.__decorator__ = _sanitize_stream_decorator
 lit_streamer.__decorator__ = _sanitize_stream_decorator
```
webscout/scout/README.md
CHANGED

````diff
@@ -1,27 +1,24 @@
-
+**🚀 The Most Advanced HTML Parser & Web Crawler for AI/LLM Data Collection**
 
-
-
-[](https://www.python.org/)
-[](https://opensource.org/licenses/MIT)
-[](https://github.com/OE-LUCIFER/Webscout)
-[](https://github.com/OE-LUCIFER/Webscout/wiki)
-[](https://github.com/OE-LUCIFER/Webscout/pulls)
+**🌟 Built for the Future • Powered by Intelligence • Trusted by Developers**
 
-</div>
 
 ## 📋 Overview
 
-Scout is
+Scout is an ultra-powerful, enterprise-grade HTML parsing and web crawling library designed for the AI era. Built with LLM data collection in mind, Scout provides unparalleled capabilities for extracting, analyzing, and processing web content at scale. With its BeautifulSoup-compatible API enhanced with modern features, Scout is the go-to solution for serious web scraping projects.
 
 <details open>
-<summary><b
-
--
--
--
--
--
+<summary><b>🌟 Why Scout is the Ultimate Choice</b></summary>
+
+- **🧠 LLM-Optimized Crawling**: Purpose-built for collecting high-quality training data for Large Language Models
+- **🌐 Subdomain Intelligence**: Automatically discovers and crawls subdomains (e.g., blog.example.com, docs.example.com)
+- **⚡ Lightning-Fast Performance**: Multi-threaded concurrent crawling with intelligent rate limiting
+- **🎯 Surgical Precision**: Advanced content extraction that preserves structure while removing noise
+- **🔍 Deep Analysis**: Built-in NLP capabilities for entity extraction, text analysis, and semantic understanding
+- **🛡️ Enterprise-Ready**: Robust error handling, retry mechanisms, and respect for robots.txt
+- **📊 Rich Data Extraction**: Captures metadata, structured data, semantic content, and more
+- **🔄 Format Flexibility**: Export to JSON, Markdown, CSV, or custom formats
+- **🎨 BeautifulSoup++ API**: Familiar interface with 10x more features
 
 </details>
 
@@ -46,7 +43,7 @@ pip install webscout
 Or install the latest version from GitHub:
 
 ```bash
-pip install git+https://github.com/
+pip install git+https://github.com/OEvortex/Webscout.git
 ```
 
 ## 🚀 Quick Start
@@ -361,7 +358,7 @@ cached_data = scout.cache('parsed_data')
 - `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
 - `_is_valid_url(url)`: Check if a URL is valid (internal method)
 
-For detailed API documentation, please refer to the [documentation](https://github.com/
+For detailed API documentation, please refer to the [documentation](https://github.com/OEvortex/Webscout/wiki).
 
 ## 🔧 Dependencies
 
@@ -396,9 +393,9 @@ This project is licensed under the MIT License - see the LICENSE file for detail
 <div align="center">
   <p>Made with ❤️ by the Webscout team</p>
   <p>
-    <a href="https://github.com/
-    <a href="https://github.com/
-    <a href="https://github.com/
-    <a href="https://github.com/
+    <a href="https://github.com/OEvortex/Webscout">GitHub</a> •
+    <a href="https://github.com/OEvortex/Webscout/wiki">Documentation</a> •
+    <a href="https://github.com/OEvortex/Webscout/issues">Report Bug</a> •
+    <a href="https://github.com/OEvortex/Webscout/issues">Request Feature</a>
   </p>
 </div>
````