webscout 8.3.5__py3-none-any.whl → 8.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic. Click here for more details.

Files changed (63) hide show
  1. webscout/Bard.py +12 -6
  2. webscout/DWEBS.py +66 -57
  3. webscout/Provider/{UNFINISHED → AISEARCH}/PERPLEXED_search.py +34 -74
  4. webscout/Provider/AISEARCH/__init__.py +1 -1
  5. webscout/Provider/Deepinfra.py +6 -0
  6. webscout/Provider/Flowith.py +6 -1
  7. webscout/Provider/GithubChat.py +1 -0
  8. webscout/Provider/GptOss.py +207 -0
  9. webscout/Provider/Kimi.py +445 -0
  10. webscout/Provider/Netwrck.py +3 -6
  11. webscout/Provider/OPENAI/README.md +2 -1
  12. webscout/Provider/OPENAI/TogetherAI.py +50 -55
  13. webscout/Provider/OPENAI/__init__.py +4 -2
  14. webscout/Provider/OPENAI/copilot.py +20 -4
  15. webscout/Provider/OPENAI/deepinfra.py +6 -0
  16. webscout/Provider/OPENAI/e2b.py +60 -8
  17. webscout/Provider/OPENAI/flowith.py +4 -3
  18. webscout/Provider/OPENAI/generate_api_key.py +48 -0
  19. webscout/Provider/OPENAI/gptoss.py +288 -0
  20. webscout/Provider/OPENAI/kimi.py +469 -0
  21. webscout/Provider/OPENAI/netwrck.py +8 -12
  22. webscout/Provider/OPENAI/refact.py +274 -0
  23. webscout/Provider/OPENAI/textpollinations.py +3 -6
  24. webscout/Provider/OPENAI/toolbaz.py +1 -0
  25. webscout/Provider/TTI/bing.py +14 -2
  26. webscout/Provider/TTI/together.py +10 -9
  27. webscout/Provider/TTS/README.md +0 -1
  28. webscout/Provider/TTS/__init__.py +0 -1
  29. webscout/Provider/TTS/base.py +479 -159
  30. webscout/Provider/TTS/deepgram.py +409 -156
  31. webscout/Provider/TTS/elevenlabs.py +425 -111
  32. webscout/Provider/TTS/freetts.py +317 -140
  33. webscout/Provider/TTS/gesserit.py +192 -128
  34. webscout/Provider/TTS/murfai.py +248 -113
  35. webscout/Provider/TTS/openai_fm.py +347 -129
  36. webscout/Provider/TTS/speechma.py +620 -586
  37. webscout/Provider/TextPollinationsAI.py +3 -6
  38. webscout/Provider/TogetherAI.py +50 -55
  39. webscout/Provider/UNFINISHED/VercelAIGateway.py +339 -0
  40. webscout/Provider/__init__.py +2 -90
  41. webscout/Provider/cerebras.py +83 -33
  42. webscout/Provider/copilot.py +42 -23
  43. webscout/Provider/toolbaz.py +1 -0
  44. webscout/conversation.py +22 -20
  45. webscout/sanitize.py +14 -10
  46. webscout/scout/README.md +20 -23
  47. webscout/scout/core/crawler.py +125 -38
  48. webscout/scout/core/scout.py +26 -5
  49. webscout/version.py +1 -1
  50. webscout/webscout_search.py +13 -6
  51. webscout/webscout_search_async.py +10 -8
  52. webscout/yep_search.py +13 -5
  53. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/METADATA +2 -1
  54. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/RECORD +59 -56
  55. webscout/Provider/Glider.py +0 -225
  56. webscout/Provider/OPENAI/c4ai.py +0 -394
  57. webscout/Provider/OPENAI/glider.py +0 -330
  58. webscout/Provider/TTS/sthir.py +0 -94
  59. /webscout/Provider/{samurai.py → UNFINISHED/samurai.py} +0 -0
  60. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/WHEEL +0 -0
  61. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/entry_points.txt +0 -0
  62. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/licenses/LICENSE.md +0 -0
  63. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/top_level.txt +0 -0
@@ -1,34 +1,48 @@
1
1
 
2
2
  import re
3
+
4
+ # Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
5
+ # See: https://github.com/python-trio/trio/issues/3015
6
+ try:
7
+ import trio # noqa: F401
8
+ except ImportError:
9
+ pass # trio is optional, ignore if not available
10
+ import json
11
+ from typing import Any, Dict, Generator, List, Optional, Union
12
+
3
13
  import curl_cffi
4
14
  from curl_cffi.requests import Session
5
- import json
6
- import os
7
- from typing import Any, Dict, Optional, Generator, List, Union
8
- from webscout.AIutel import Optimizers, Conversation, AwesomePrompts, sanitize_stream # Import sanitize_stream
9
- from webscout.AIbase import Provider
15
+
10
16
  from webscout import exceptions
17
+ from webscout.AIbase import Provider
18
+ from webscout.AIutel import ( # Import sanitize_stream
19
+ AwesomePrompts,
20
+ Conversation,
21
+ Optimizers,
22
+ sanitize_stream,
23
+ )
11
24
  from webscout.litagent import LitAgent as UserAgent
12
25
 
26
+
13
27
  class Cerebras(Provider):
14
28
  """
15
29
  A class to interact with the Cerebras API using a cookie for authentication.
16
30
  """
17
-
31
+
18
32
  AVAILABLE_MODELS = [
19
- "llama3.1-8b",
20
- "llama-3.3-70b",
21
- "deepseek-r1-distill-llama-70b",
22
- "llama-4-scout-17b-16e-instruct",
33
+ "qwen-3-coder-480b",
34
+ "qwen-3-235b-a22b-instruct-2507",
35
+ "qwen-3-235b-a22b-thinking-2507",
23
36
  "qwen-3-32b",
24
-
25
-
37
+ "llama-3.3-70b",
38
+ "llama-4-maverick-17b-128e-instruct"
26
39
  ]
27
40
 
28
41
  def __init__(
29
42
  self,
43
+ cookie_path: str = None,
30
44
  is_conversation: bool = True,
31
- max_tokens: int = 2049,
45
+ max_tokens: int = 40000,
32
46
  timeout: int = 30,
33
47
  intro: str = None,
34
48
  filepath: str = None,
@@ -36,9 +50,11 @@ class Cerebras(Provider):
36
50
  proxies: dict = {},
37
51
  history_offset: int = 10250,
38
52
  act: str = None,
39
- cookie_path: str = "cookie.json",
40
- model: str = "llama3.1-8b",
53
+ api_key: str = None,
54
+ model: str = "qwen-3-coder-480b",
41
55
  system_prompt: str = "You are a helpful assistant.",
56
+ temperature: float = 0.7,
57
+ top_p: float = 0.8,
42
58
  ):
43
59
  # Validate model choice
44
60
  if model not in self.AVAILABLE_MODELS:
@@ -52,15 +68,26 @@ class Cerebras(Provider):
52
68
  self.system_prompt = system_prompt
53
69
  self.is_conversation = is_conversation
54
70
  self.max_tokens_to_sample = max_tokens
71
+ self.temperature = temperature
72
+ self.top_p = top_p
55
73
  self.last_response = {}
56
74
 
57
75
  self.session = Session() # Initialize curl_cffi session
58
76
 
59
- # Get API key first
60
- try:
61
- self.api_key = self.get_demo_api_key(cookie_path)
62
- except Exception as e:
63
- raise exceptions.APIConnectionError(f"Failed to initialize Cerebras client: {e}")
77
+ # Handle API key - either provided directly or retrieved from cookies
78
+ if api_key:
79
+ self.api_key = api_key.strip()
80
+ # Basic validation for API key format
81
+ if not self.api_key or len(self.api_key) < 10:
82
+ raise ValueError("Invalid API key format. API key must be at least 10 characters long.")
83
+ elif cookie_path:
84
+ # Get API key from cookies
85
+ try:
86
+ self.api_key = self.get_demo_api_key(cookie_path)
87
+ except Exception as e:
88
+ raise exceptions.APIConnectionError(f"Failed to initialize Cerebras client: {e}")
89
+ else:
90
+ raise ValueError("Either api_key must be provided or cookie_path must be specified")
64
91
 
65
92
  # Initialize optimizers
66
93
  self.__available_optimizers = (
@@ -72,16 +99,16 @@ class Cerebras(Provider):
72
99
  # Initialize conversation settings
73
100
  Conversation.intro = (
74
101
  AwesomePrompts().get_act(
75
- act, raise_not_found=True, default=None, case_insensitive=True
102
+ act, raise_not_found=True, default="You are a helpful assistant.", case_insensitive=True
76
103
  )
77
104
  if act
78
- else None
105
+ else "You are a helpful assistant."
79
106
  )
80
107
  self.conversation = Conversation(
81
108
  is_conversation, self.max_tokens_to_sample, filepath, update_file
82
109
  )
83
110
  self.conversation.history_offset = history_offset
84
-
111
+
85
112
  # Apply proxies to the session
86
113
  self.session.proxies = proxies
87
114
 
@@ -105,8 +132,10 @@ class Cerebras(Provider):
105
132
  return chunk.get("choices", [{}])[0].get("delta", {}).get("content")
106
133
  return None
107
134
 
108
- def get_demo_api_key(self, cookie_path: str) -> str: # Keep this using requests or switch to curl_cffi
135
+ def get_demo_api_key(self, cookie_path: str = None) -> str: # Keep this using requests or switch to curl_cffi
109
136
  """Retrieves the demo API key using the provided cookie."""
137
+ if not cookie_path:
138
+ raise ValueError("cookie_path must be provided when using cookie-based authentication")
110
139
  try:
111
140
  with open(cookie_path, "r") as file:
112
141
  cookies = {item["name"]: item["value"] for item in json.load(file)}
@@ -159,7 +188,10 @@ class Cerebras(Provider):
159
188
  payload = {
160
189
  "model": self.model,
161
190
  "messages": messages,
162
- "stream": stream
191
+ "stream": stream,
192
+ "max_tokens": self.max_tokens_to_sample,
193
+ "temperature": self.temperature,
194
+ "top_p": self.top_p
163
195
  }
164
196
 
165
197
  try:
@@ -197,8 +229,26 @@ class Cerebras(Provider):
197
229
 
198
230
  except curl_cffi.CurlError as e:
199
231
  raise exceptions.APIConnectionError(f"Request failed (CurlError): {e}") from e
200
- except Exception as e: # Catch other potential errors
201
- raise exceptions.APIConnectionError(f"Request failed: {e}")
232
+ except Exception as e:
233
+ # Check if it's an HTTP error with status code
234
+ if hasattr(e, 'response') and hasattr(e.response, 'status_code'):
235
+ status_code = e.response.status_code
236
+ if status_code == 401:
237
+ raise exceptions.APIConnectionError(
238
+ "Authentication failed (401): Invalid API key. Please check your API key and try again."
239
+ ) from e
240
+ elif status_code == 403:
241
+ raise exceptions.APIConnectionError(
242
+ "Access forbidden (403): Your API key may not have permission to access this resource."
243
+ ) from e
244
+ elif status_code == 429:
245
+ raise exceptions.APIConnectionError(
246
+ "Rate limit exceeded (429): Too many requests. Please wait and try again."
247
+ ) from e
248
+ else:
249
+ raise exceptions.APIConnectionError(f"HTTP {status_code} error: {e}") from e
250
+ else:
251
+ raise exceptions.APIConnectionError(f"Request failed: {e}") from e
202
252
 
203
253
  def ask(
204
254
  self,
@@ -225,7 +275,7 @@ class Cerebras(Provider):
225
275
 
226
276
  try:
227
277
  response = self._make_request(messages, stream)
228
-
278
+
229
279
  if stream:
230
280
  # Wrap the generator to yield dicts or raw strings
231
281
  def stream_wrapper():
@@ -256,7 +306,7 @@ class Cerebras(Provider):
256
306
  """Chat with the model."""
257
307
  # Ask returns a generator for stream=True, dict/str for stream=False
258
308
  response_gen_or_dict = self.ask(prompt, stream, raw=False, optimizer=optimizer, conversationally=conversationally)
259
-
309
+
260
310
  if stream:
261
311
  # Wrap the generator from ask() to get message text
262
312
  def stream_wrapper():
@@ -276,14 +326,14 @@ class Cerebras(Provider):
276
326
 
277
327
  if __name__ == "__main__":
278
328
  from rich import print
279
-
329
+
280
330
  # Example usage
281
331
  cerebras = Cerebras(
282
- cookie_path=r'cookies.json',
283
- model='llama3.1-8b',
332
+ api_key='csk-**********************', # Replace with your actual API key
333
+ model='qwen-3-235b-a22b-instruct-2507',
284
334
  system_prompt="You are a helpful AI assistant."
285
335
  )
286
-
336
+
287
337
  # Test with streaming
288
338
  response = cerebras.chat("Hello!", stream=True)
289
339
  for chunk in response:
@@ -1,17 +1,21 @@
1
- import os
2
- import json
3
- import base64
4
1
  import asyncio
2
+ import base64
3
+ import json
4
+ import os
5
+ from typing import Any, Dict, Generator, Union
5
6
  from urllib.parse import quote
6
- from typing import Optional, Dict, Any, List, Union, Generator
7
7
 
8
- from curl_cffi.requests import Session, CurlWsFlag
8
+ # Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
9
+ # See: https://github.com/python-trio/trio/issues/3015
10
+ try:
11
+ import trio # noqa: F401
12
+ except ImportError:
13
+ pass # trio is optional, ignore if not available
14
+ from curl_cffi.requests import CurlWsFlag, Session
9
15
 
10
- from webscout.AIutel import Optimizers
11
- from webscout.AIutel import Conversation
12
- from webscout.AIutel import AwesomePrompts, sanitize_stream
13
- from webscout.AIbase import Provider, AsyncProvider
14
16
  from webscout import exceptions
17
+ from webscout.AIbase import Provider
18
+ from webscout.AIutel import AwesomePrompts, Conversation, Optimizers
15
19
  from webscout.litagent import LitAgent
16
20
 
17
21
  try:
@@ -41,12 +45,17 @@ class Copilot(Provider):
41
45
  """
42
46
  A class to interact with the Microsoft Copilot API.
43
47
  """
44
-
48
+
45
49
  label = "Microsoft Copilot"
46
50
  url = "https://copilot.microsoft.com"
47
51
  websocket_url = "wss://copilot.microsoft.com/c/api/chat?api-version=2"
48
52
  conversation_url = f"{url}/c/api/conversations"
49
- AVAILABLE_MODELS = ["Copilot", "Think Deeper"]
53
+ AVAILABLE_MODELS = ["Copilot", "Think Deeper", "Smart"]
54
+ MODEL_ALIASES = {
55
+ "gpt-4o": "Copilot",
56
+ "o4-mini": "Think Deeper",
57
+ "gpt-5": "Smart",
58
+ }
50
59
  _access_token: str = None
51
60
  _cookies: dict = None
52
61
 
@@ -64,9 +73,12 @@ class Copilot(Provider):
64
73
  model: str = "Copilot"
65
74
  ):
66
75
  """Initializes the Copilot API client."""
67
- if model not in self.AVAILABLE_MODELS:
76
+ # Map alias to real model name if needed
77
+ real_model = self.MODEL_ALIASES.get(model, model)
78
+ if real_model not in self.AVAILABLE_MODELS:
68
79
  raise ValueError(f"Invalid model: {model}. Choose from: {self.AVAILABLE_MODELS}")
69
-
80
+ self.model = real_model
81
+
70
82
  # Use LitAgent for user-agent
71
83
  self.headers = {
72
84
  'User-Agent': LitAgent().random(),
@@ -79,7 +91,7 @@ class Copilot(Provider):
79
91
  'Sec-Fetch-Mode': 'cors',
80
92
  'Sec-Fetch-Site': 'same-origin',
81
93
  }
82
-
94
+
83
95
  self.is_conversation = is_conversation
84
96
  self.max_tokens_to_sample = max_tokens
85
97
  self.timeout = timeout
@@ -253,6 +265,12 @@ class Copilot(Provider):
253
265
  # WebSocket connection
254
266
  wss = session.ws_connect(websocket_url)
255
267
  wss.send(json.dumps({"event": "setOptions", "supportedCards": ["weather", "local", "image", "sports", "video", "ads", "finance"], "ads": {"supportedTypes": ["multimedia", "product", "tourActivity", "propertyPromotion", "text"]}}))
268
+ if self.model == "Smart":
269
+ mode_value = "smart"
270
+ elif "Think" in self.model:
271
+ mode_value = "reasoning"
272
+ else:
273
+ mode_value = "chat"
256
274
  wss.send(json.dumps({
257
275
  "event": "send",
258
276
  "conversationId": conversation_id,
@@ -260,7 +278,8 @@ class Copilot(Provider):
260
278
  "type": "text",
261
279
  "text": conversation_prompt,
262
280
  }],
263
- "mode": "reasoning" if "Think" in self.model else "chat"
281
+ "mode": mode_value,
282
+ "model": self.model
264
283
  }).encode(), CurlWsFlag.TEXT)
265
284
 
266
285
  # Event-driven response loop
@@ -307,8 +326,8 @@ class Copilot(Provider):
307
326
  **kwargs
308
327
  ) -> Union[str, Generator]:
309
328
  def for_stream():
310
- for response in self.ask(prompt, True, optimizer=optimizer,
311
- conversationally=conversationally,
329
+ for response in self.ask(prompt, True, optimizer=optimizer,
330
+ conversationally=conversationally,
312
331
  images=images, api_key=api_key, **kwargs):
313
332
  if isinstance(response, dict):
314
333
  if "text" in response:
@@ -320,13 +339,13 @@ class Copilot(Provider):
320
339
  yield "\nSuggested follow-up questions:\n"
321
340
  for suggestion in response["suggestions"]:
322
341
  yield f"- {suggestion}\n"
323
-
342
+
324
343
  def for_non_stream():
325
- response = self.ask(prompt, False, optimizer=optimizer,
344
+ response = self.ask(prompt, False, optimizer=optimizer,
326
345
  conversationally=conversationally,
327
346
  images=images, api_key=api_key, **kwargs)
328
347
  return self.get_message(response)
329
-
348
+
330
349
  return for_stream() if stream else for_non_stream()
331
350
 
332
351
  def get_message(self, response: dict) -> str:
@@ -379,7 +398,7 @@ def readHAR(url: str):
379
398
  for file in os.listdir(path):
380
399
  if file.endswith(".har"):
381
400
  har_files.append(os.path.join(path, file))
382
-
401
+
383
402
  for path in har_files:
384
403
  with open(path, 'rb') as file:
385
404
  try:
@@ -416,7 +435,7 @@ async def get_nodriver(proxy=None, user_data_dir=None):
416
435
 
417
436
  if __name__ == "__main__":
418
437
  from rich import print
419
- ai = Copilot(timeout=900, model="Think Deeper")
438
+ ai = Copilot(timeout=900, model="gpt-5")
420
439
  response = ai.chat(input("> "), stream=True)
421
440
  for chunk in response:
422
- print(chunk, end="", flush=True)
441
+ print(chunk, end="", flush=True)
@@ -34,6 +34,7 @@ class Toolbaz(Provider):
34
34
  "Llama-4-Maverick",
35
35
  "Llama-4-Scout",
36
36
  "Llama-3.3-70B",
37
+ "gpt-oss-120b",
37
38
  "Qwen2.5-72B",
38
39
  "grok-2-1212",
39
40
  "grok-3-beta",
webscout/conversation.py CHANGED
@@ -165,29 +165,19 @@ class Conversation:
165
165
  ))
166
166
 
167
167
  def _compress_history(self) -> None:
168
- """Compress history when it exceeds threshold."""
168
+ """Delete old history when it exceeds threshold."""
169
169
  if len(self.messages) > self.compression_threshold:
170
- # Keep recent messages and summarize older ones
171
- keep_recent = 100 # Adjust based on needs
172
- self.messages = (
173
- [self._summarize_messages(self.messages[:-keep_recent])] +
174
- self.messages[-keep_recent:]
175
- )
176
-
177
- def _summarize_messages(self, messages: List[Message]) -> Message:
178
- """Create a summary message from older messages."""
179
- return Message(
180
- role="system",
181
- content="[History Summary] Previous conversation summarized for context",
182
- metadata={"summarized_count": len(messages)}
183
- )
170
+ # Remove oldest messages, keep only the most recent ones
171
+ self.messages = self.messages[-self.compression_threshold:]
172
+
173
+ # _summarize_messages removed
184
174
 
185
175
  def gen_complete_prompt(self, prompt: str, intro: Optional[str] = None) -> str:
186
176
  """Generate complete prompt with enhanced context management."""
187
177
  if not self.status:
188
178
  return prompt
189
179
 
190
- intro = intro or self.intro
180
+ intro = intro or self.intro or ""
191
181
 
192
182
  # Add tool information if available
193
183
  tools_description = self.get_tools_description()
@@ -260,6 +250,7 @@ Your goal is to assist the user effectively. Analyze each query and choose one o
260
250
 
261
251
  def _trim_chat_history(self, chat_history: str, intro: str) -> str:
262
252
  """Trim chat history with improved token management."""
253
+ intro = intro or ""
263
254
  total_length = len(intro) + len(chat_history)
264
255
 
265
256
  if total_length > self.history_offset:
@@ -273,20 +264,31 @@ Your goal is to assist the user effectively. Analyze each query and choose one o
273
264
  return chat_history
274
265
 
275
266
  def add_message(self, role: str, content: str, metadata: Optional[Dict[str, Any]] = None) -> None:
276
- """Add a message with enhanced validation and metadata support."""
267
+ """Add a message with enhanced validation and metadata support. Deletes oldest messages if total word count exceeds max_tokens_to_sample."""
277
268
  try:
278
269
  role = role.lower() # Normalize role to lowercase
279
270
  if not self.validate_message(role, content):
280
271
  raise MessageValidationError("Invalid message role or content")
281
272
 
273
+ # Calculate total word count in history
274
+ def total_word_count(messages):
275
+ return sum(len(msg.content.split()) for msg in messages)
276
+
277
+ # Remove oldest messages until total word count is below limit
278
+ temp_messages = self.messages.copy()
279
+ while temp_messages and (total_word_count(temp_messages) + len(content.split()) > self.max_tokens_to_sample):
280
+ temp_messages.pop(0)
281
+
282
+ self.messages = temp_messages
283
+
282
284
  message = Message(role=role, content=content, metadata=metadata or {})
283
285
  self.messages.append(message)
284
-
286
+
285
287
  if self.file and self.update_file:
286
288
  self._append_to_file(message)
287
-
289
+
288
290
  self._compress_history()
289
-
291
+
290
292
  except Exception as e:
291
293
  raise ConversationError(f"Failed to add message: {str(e)}") from e
292
294
 
webscout/sanitize.py CHANGED
@@ -143,7 +143,7 @@ def _process_chunk(
143
143
  if to_json:
144
144
  try:
145
145
  # Only strip before JSON parsing if both boundaries are incorrect
146
- if sanitized_chunk[0] not in '{[' and sanitized_chunk[-1] not in '}]':
146
+ if len(sanitized_chunk) >= 2 and sanitized_chunk[0] not in '{[' and sanitized_chunk[-1] not in '}]':
147
147
  sanitized_chunk = sanitized_chunk.strip()
148
148
  return json.loads(sanitized_chunk)
149
149
  except (json.JSONDecodeError, Exception) as e:
@@ -646,13 +646,14 @@ async def _sanitize_stream_async(
646
646
  f"Stream must yield strings or bytes, not {type(first_item).__name__}"
647
647
  )
648
648
 
649
- async for line in line_iterator:
650
- if not line:
651
- continue
652
- buffer += line
653
- while True:
654
- if not found_start and start_marker:
655
- idx = buffer.find(start_marker)
649
+ try:
650
+ async for line in line_iterator:
651
+ if not line:
652
+ continue
653
+ buffer += line
654
+ while True:
655
+ if not found_start and start_marker:
656
+ idx = buffer.find(start_marker)
656
657
  if idx != -1:
657
658
  found_start = True
658
659
  buffer = buffer[idx + len(start_marker) :]
@@ -735,6 +736,9 @@ async def _sanitize_stream_async(
735
736
  break
736
737
  else:
737
738
  break
739
+ except Exception as e:
740
+ import sys
741
+ print(f"Async stream processing error: {str(e)}", file=sys.stderr)
738
742
 
739
743
 
740
744
  def sanitize_stream(
@@ -937,7 +941,7 @@ def sanitize_stream(
937
941
  payload, intro_value, to_json, skip_markers, strip_chars,
938
942
  start_marker, end_marker, content_extractor, yield_raw_on_error,
939
943
  encoding, encoding_errors, buffer_size, line_delimiter, error_handler,
940
- skip_regexes, extract_regexes,
944
+ skip_regexes, extract_regexes, raw,
941
945
  )
942
946
 
943
947
  # Handle async iterables
@@ -966,6 +970,7 @@ def sanitize_stream(
966
970
 
967
971
  # --- Decorator version of sanitize_stream ---
968
972
  import functools
973
+ import asyncio
969
974
  from typing import overload
970
975
 
971
976
  def _sanitize_stream_decorator(
@@ -1057,7 +1062,6 @@ sanitize_stream_decorator = _sanitize_stream_decorator
1057
1062
  lit_streamer = _sanitize_stream_decorator
1058
1063
 
1059
1064
  # Allow @sanitize_stream and @lit_streamer as decorators
1060
- import asyncio
1061
1065
  sanitize_stream.__decorator__ = _sanitize_stream_decorator
1062
1066
  LITSTREAM.__decorator__ = _sanitize_stream_decorator
1063
1067
  lit_streamer.__decorator__ = _sanitize_stream_decorator
webscout/scout/README.md CHANGED
@@ -1,27 +1,24 @@
1
- # 🕵️ Scout: Next-Gen Web Parsing Library
1
+ **🚀 The Most Advanced HTML Parser & Web Crawler for AI/LLM Data Collection**
2
2
 
3
- <div align="center">
4
-
5
- [![Python](https://img.shields.io/badge/Python-3.8%2B-blue)](https://www.python.org/)
6
- [![License](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
7
- [![Maintenance](https://img.shields.io/badge/Maintained-Yes-brightgreen.svg)](https://github.com/OE-LUCIFER/Webscout)
8
- [![Documentation](https://img.shields.io/badge/Docs-Wiki-orange)](https://github.com/OE-LUCIFER/Webscout/wiki)
9
- [![PRs Welcome](https://img.shields.io/badge/PRs-Welcome-brightgreen.svg)](https://github.com/OE-LUCIFER/Webscout/pulls)
3
+ **🌟 Built for the Future • Powered by Intelligence • Trusted by Developers**
10
4
 
11
- </div>
12
5
 
13
6
  ## 📋 Overview
14
7
 
15
- Scout is a powerful, flexible, and performant HTML parsing library that makes web scraping a breeze! It provides intelligent HTML/XML parsing with advanced features like web crawling, text analysis, semantic extraction, and Markdown conversion. Scout goes beyond traditional parsing libraries with its intuitive API and comprehensive feature set.
8
+ Scout is an ultra-powerful, enterprise-grade HTML parsing and web crawling library designed for the AI era. Built with LLM data collection in mind, Scout provides unparalleled capabilities for extracting, analyzing, and processing web content at scale. With its BeautifulSoup-compatible API enhanced with modern features, Scout is the go-to solution for serious web scraping projects.
16
9
 
17
10
  <details open>
18
- <summary><b>Why Choose Scout?</b></summary>
19
-
20
- - **Powerful Parsing**: Multiple parser backends with intelligent markup handling
21
- - **Advanced Analysis**: Built-in text and web content analysis tools
22
- - **Concurrent Crawling**: Efficient multi-threaded web crawling
23
- - **Flexible API**: Intuitive interface similar to BeautifulSoup but with enhanced capabilities
24
- - **Format Conversion**: Convert HTML to JSON, Markdown, and more
11
+ <summary><b>🌟 Why Scout is the Ultimate Choice</b></summary>
12
+
13
+ - **🧠 LLM-Optimized Crawling**: Purpose-built for collecting high-quality training data for Large Language Models
14
+ - **🌐 Subdomain Intelligence**: Automatically discovers and crawls subdomains (e.g., blog.example.com, docs.example.com)
15
+ - **⚡ Lightning-Fast Performance**: Multi-threaded concurrent crawling with intelligent rate limiting
16
+ - **🎯 Surgical Precision**: Advanced content extraction that preserves structure while removing noise
17
+ - **🔍 Deep Analysis**: Built-in NLP capabilities for entity extraction, text analysis, and semantic understanding
18
+ - **🛡️ Enterprise-Ready**: Robust error handling, retry mechanisms, and respect for robots.txt
19
+ - **📊 Rich Data Extraction**: Captures metadata, structured data, semantic content, and more
20
+ - **🔄 Format Flexibility**: Export to JSON, Markdown, CSV, or custom formats
21
+ - **🎨 BeautifulSoup++ API**: Familiar interface with 10x more features
25
22
 
26
23
  </details>
27
24
 
@@ -46,7 +43,7 @@ pip install webscout
46
43
  Or install the latest version from GitHub:
47
44
 
48
45
  ```bash
49
- pip install git+https://github.com/OE-LUCIFER/Webscout.git
46
+ pip install git+https://github.com/OEvortex/Webscout.git
50
47
  ```
51
48
 
52
49
  ## 🚀 Quick Start
@@ -361,7 +358,7 @@ cached_data = scout.cache('parsed_data')
361
358
  - `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
362
359
  - `_is_valid_url(url)`: Check if a URL is valid (internal method)
363
360
 
364
- For detailed API documentation, please refer to the [documentation](https://github.com/OE-LUCIFER/Webscout/wiki).
361
+ For detailed API documentation, please refer to the [documentation](https://github.com/OEvortex/Webscout/wiki).
365
362
 
366
363
  ## 🔧 Dependencies
367
364
 
@@ -396,9 +393,9 @@ This project is licensed under the MIT License - see the LICENSE file for detail
396
393
  <div align="center">
397
394
  <p>Made with ❤️ by the Webscout team</p>
398
395
  <p>
399
- <a href="https://github.com/OE-LUCIFER/Webscout">GitHub</a> •
400
- <a href="https://github.com/OE-LUCIFER/Webscout/wiki">Documentation</a> •
401
- <a href="https://github.com/OE-LUCIFER/Webscout/issues">Report Bug</a> •
402
- <a href="https://github.com/OE-LUCIFER/Webscout/issues">Request Feature</a>
396
+ <a href="https://github.com/OEvortex/Webscout">GitHub</a> •
397
+ <a href="https://github.com/OEvortex/Webscout/wiki">Documentation</a> •
398
+ <a href="https://github.com/OEvortex/Webscout/issues">Report Bug</a> •
399
+ <a href="https://github.com/OEvortex/Webscout/issues">Request Feature</a>
403
400
  </p>
404
401
  </div>