webscout-8.3.5-py3-none-any.whl → webscout-8.3.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of webscout might be problematic.

Files changed (63)
  1. webscout/Bard.py +12 -6
  2. webscout/DWEBS.py +66 -57
  3. webscout/Provider/{UNFINISHED → AISEARCH}/PERPLEXED_search.py +34 -74
  4. webscout/Provider/AISEARCH/__init__.py +1 -1
  5. webscout/Provider/Deepinfra.py +6 -0
  6. webscout/Provider/Flowith.py +6 -1
  7. webscout/Provider/GithubChat.py +1 -0
  8. webscout/Provider/GptOss.py +207 -0
  9. webscout/Provider/Kimi.py +445 -0
  10. webscout/Provider/Netwrck.py +3 -6
  11. webscout/Provider/OPENAI/README.md +2 -1
  12. webscout/Provider/OPENAI/TogetherAI.py +50 -55
  13. webscout/Provider/OPENAI/__init__.py +4 -2
  14. webscout/Provider/OPENAI/copilot.py +20 -4
  15. webscout/Provider/OPENAI/deepinfra.py +6 -0
  16. webscout/Provider/OPENAI/e2b.py +60 -8
  17. webscout/Provider/OPENAI/flowith.py +4 -3
  18. webscout/Provider/OPENAI/generate_api_key.py +48 -0
  19. webscout/Provider/OPENAI/gptoss.py +288 -0
  20. webscout/Provider/OPENAI/kimi.py +469 -0
  21. webscout/Provider/OPENAI/netwrck.py +8 -12
  22. webscout/Provider/OPENAI/refact.py +274 -0
  23. webscout/Provider/OPENAI/textpollinations.py +3 -6
  24. webscout/Provider/OPENAI/toolbaz.py +1 -0
  25. webscout/Provider/TTI/bing.py +14 -2
  26. webscout/Provider/TTI/together.py +10 -9
  27. webscout/Provider/TTS/README.md +0 -1
  28. webscout/Provider/TTS/__init__.py +0 -1
  29. webscout/Provider/TTS/base.py +479 -159
  30. webscout/Provider/TTS/deepgram.py +409 -156
  31. webscout/Provider/TTS/elevenlabs.py +425 -111
  32. webscout/Provider/TTS/freetts.py +317 -140
  33. webscout/Provider/TTS/gesserit.py +192 -128
  34. webscout/Provider/TTS/murfai.py +248 -113
  35. webscout/Provider/TTS/openai_fm.py +347 -129
  36. webscout/Provider/TTS/speechma.py +620 -586
  37. webscout/Provider/TextPollinationsAI.py +3 -6
  38. webscout/Provider/TogetherAI.py +50 -55
  39. webscout/Provider/UNFINISHED/VercelAIGateway.py +339 -0
  40. webscout/Provider/__init__.py +2 -90
  41. webscout/Provider/cerebras.py +83 -33
  42. webscout/Provider/copilot.py +42 -23
  43. webscout/Provider/toolbaz.py +1 -0
  44. webscout/conversation.py +22 -20
  45. webscout/sanitize.py +14 -10
  46. webscout/scout/README.md +20 -23
  47. webscout/scout/core/crawler.py +125 -38
  48. webscout/scout/core/scout.py +26 -5
  49. webscout/version.py +1 -1
  50. webscout/webscout_search.py +13 -6
  51. webscout/webscout_search_async.py +10 -8
  52. webscout/yep_search.py +13 -5
  53. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/METADATA +2 -1
  54. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/RECORD +59 -56
  55. webscout/Provider/Glider.py +0 -225
  56. webscout/Provider/OPENAI/c4ai.py +0 -394
  57. webscout/Provider/OPENAI/glider.py +0 -330
  58. webscout/Provider/TTS/sthir.py +0 -94
  59. /webscout/Provider/{samurai.py → UNFINISHED/samurai.py} +0 -0
  60. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/WHEEL +0 -0
  61. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/entry_points.txt +0 -0
  62. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/licenses/LICENSE.md +0 -0
  63. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/top_level.txt +0 -0
webscout/scout/core/crawler.py CHANGED
@@ -1,5 +1,5 @@
 """
-Scout Crawler Module
+Scout Crawler Module - Ultra Advanced Web Crawling System
 """
 
 import concurrent.futures
@@ -7,18 +7,82 @@ import urllib.parse
 import time
 import hashlib
 import re
+import json
+import sqlite3
+import threading
+import queue
+import logging
+import mimetypes
+import pickle
+import asyncio
+import aiohttp
+import random
 from urllib import robotparser
-from datetime import datetime
-from typing import Dict, List, Optional, Union
-from webscout.litagent import LitAgent
-from curl_cffi.requests import Session
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Union, Set, Tuple, Callable, Any
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+
+try:
+    from webscout.litagent import LitAgent
+except ImportError:
+    LitAgent = None
+
+try:
+    from curl_cffi.requests import Session
+except ImportError:
+    import requests
+    Session = requests.Session
 
 from .scout import Scout
+from .text_analyzer import ScoutTextAnalyzer
+
 
+@dataclass
+class CrawlConfig:
+    """Configuration for the crawler."""
+    max_pages: int = 1000
+    max_depth: int = 10
+    delay: float = 0.5
+    obey_robots: bool = True
+    crawl_subdomains: bool = True
+    max_workers: int = 10
+    timeout: int = 30
+    retry_attempts: int = 3
+    include_external_links: bool = False
+    extract_metadata: bool = True
+    extract_structured_data: bool = True
+    extract_semantic_content: bool = True
+
+
+@dataclass
+class PageData:
+    """Comprehensive page data for LLM training."""
+    url: str
+    title: str
+    text: str
+    clean_text: str
+    markdown_text: str
+    links: List[str]
+    internal_links: List[str]
+    external_links: List[str]
+    metadata: Dict[str, Any]
+    structured_data: Dict[str, Any]
+    semantic_content: Dict[str, Any]
+    headers: Dict[str, str]
+    status_code: int
+    content_type: str
+    language: str
+    timestamp: str
+    depth: int
+    word_count: int
+
 
 class ScoutCrawler:
     """
-    Advanced web crawling utility for Scout library.
+    Ultra-advanced web crawling utility optimized for LLM data collection.
     """
     def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None, session: Optional[Session] = None, delay: float = 0.5, obey_robots: bool = True, allowed_domains: Optional[List[str]] = None):
         """
@@ -33,13 +97,7 @@ class ScoutCrawler:
         self.max_pages = max_pages
         self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
             "script",
-            "style",
-            "header",
-            "footer",
-            "nav",
-            "aside",
-            "form",
-            "button",
+            "style"
         ]
         self.visited_urls = set()
         self.crawled_pages = []
@@ -50,7 +108,10 @@ class ScoutCrawler:
         self.session.headers.setdefault("User-Agent", self.agent.chrome())
         self.delay = delay
         self.obey_robots = obey_robots
-        self.allowed_domains = allowed_domains or [urllib.parse.urlparse(base_url).netloc]
+        # Allow crawling of subdomains by default
+        base_domain = urllib.parse.urlparse(base_url).netloc.split('.')
+        self.base_domain = '.'.join(base_domain[-2:]) if len(base_domain) > 1 else base_domain[0]
+        self.allowed_domains = allowed_domains or [self.base_domain]
         self.last_request_time = 0
         self.url_hashes = set()
         if obey_robots:
@@ -84,7 +145,8 @@ class ScoutCrawler:
         parsed_url = urllib.parse.urlparse(url)
         if parsed_url.scheme not in ["http", "https"]:
             return False
-        if parsed_url.netloc not in self.allowed_domains:
+        # Allow crawling subdomains
+        if not parsed_url.netloc.endswith(self.base_domain):
             return False
         if self.obey_robots and self.robots:
             return self.robots.can_fetch("*", url)
@@ -127,6 +189,9 @@ class ScoutCrawler:
         """
         if url in self.visited_urls or self._is_duplicate(url):
             return {}
+        # Log URL to crawl
+        print(f"Attempting to crawl URL: {url} (depth: {depth})")
+
        # Throttle requests
         now = time.time()
         if self.last_request_time:
@@ -142,18 +207,38 @@ class ScoutCrawler:
         scout = Scout(response.content, features="lxml")
         title_result = scout.find("title")
         title = title_result[0].get_text() if title_result else ""
+
+        # Remove only script and style tags before extracting text
         for tag_name in self.tags_to_remove:
             for tag in scout._soup.find_all(tag_name):
-                tag.extract()
+                tag.decompose()
+
         visible_text = self._extract_main_text(scout._soup)
+
+        # Extract links from header, footer, nav, etc.
+        essential_links = []
+        for essential_tag in ['header', 'nav', 'footer']:
+            elements = scout.find_all(essential_tag)
+            for element in elements:
+                links = element.find_all('a', href=True)
+                essential_links.extend(
+                    urllib.parse.urljoin(url, link.get('href'))
+                    for link in links
+                    if link.get('href') and self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
+                )
+
+        all_links = [
+            urllib.parse.urljoin(url, link.get('href'))
+            for link in scout.find_all('a', href=True)
+            if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
+        ]
+
+        combined_links = list(set(all_links + essential_links))
+
         page_info = {
             'url': url,
             'title': title,
-            'links': [
-                urllib.parse.urljoin(url, link.get('href'))
-                for link in scout.find_all('a', href=True)
-                if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
-            ],
+            'links': combined_links,
             'text': visible_text,
             'depth': depth,
             'timestamp': datetime.utcnow().isoformat(),
@@ -178,7 +263,7 @@ class ScoutCrawler:
         submitted_links: set[str] = set()
 
         while futures:
-            if len(self.visited_urls) >= self.max_pages:
+            if self.max_pages is not None and len(self.visited_urls) >= self.max_pages:
                 break
             done, not_done = concurrent.futures.wait(
                 futures, return_when=concurrent.futures.FIRST_COMPLETED
@@ -190,21 +275,23 @@ class ScoutCrawler:
 
                 if page_info:
                     yield page_info
+
+                    if self.max_pages is not None and len(self.visited_urls) >= self.max_pages:
+                        return
 
-                    if len(self.visited_urls) >= self.max_pages:
-                        return
-
-                    for link in page_info.get("links", []):
-                        if (
-                            len(self.visited_urls) < self.max_pages
-                            and link not in self.visited_urls
-                            and link not in submitted_links
-                        ):
-                            submitted_links.add(link)
-                            futures.add(
-                                executor.submit(
-                                    self._crawl_page,
-                                    link,
-                                    page_info.get("depth", 0) + 1,
+                    for link in page_info.get("links", []):
+                        if (
+                            (self.max_pages is None or len(self.visited_urls) < self.max_pages)
+                            and link not in self.visited_urls
+                            and link not in submitted_links
+                        ):
+                            submitted_links.add(link)
+                            futures.add(
+                                executor.submit(
+                                    self._crawl_page,
+                                    link,
+                                    page_info.get("depth", 0) + 1,
+                                )
                                 )
-                                )
+                else:
+                    print(f"No page info retrieved from crawling")
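Taken together, the crawler changes above widen the default crawl scope from the exact netloc to the base domain (so a subdomain such as docs.example.com is accepted when the base URL is example.com), keep only script/style in the default tag-removal list, and tolerate max_pages=None. A minimal usage sketch under those changes; the crawl() generator name and the module import path are assumptions, not confirmed by this diff:

    # Sketch only: assumes ScoutCrawler is importable from this module path and
    # that the page-yielding generator is named crawl(); neither is confirmed here.
    from webscout.scout.core.crawler import ScoutCrawler

    crawler = ScoutCrawler(
        "https://example.com",  # base_domain becomes "example.com", so subdomains are in-scope
        max_pages=25,
        delay=0.5,
        obey_robots=True,
    )

    for page in crawler.crawl():
        # Each yielded dict carries 'url', 'title', 'links', 'text', 'depth' and 'timestamp'.
        print(page["depth"], page["url"], len(page["links"]))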
webscout/scout/core/scout.py CHANGED
@@ -24,7 +24,8 @@ class Scout:
     Enhanced with advanced features and intelligent parsing.
     """
 
-    def __init__(self, markup="", features='html.parser', from_encoding=None, **kwargs):
+    def __init__(self, markup="", features='html.parser', from_encoding=None,
+                 exclude_encodings=None, element_classes=None, **kwargs):
         """
         Initialize Scout with HTML content.
 
@@ -32,8 +33,17 @@ class Scout:
             markup (str): HTML content to parse
             features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
             from_encoding (str): Source encoding (if known)
+            exclude_encodings (list): Encodings to avoid
+            element_classes (dict): Custom classes for different element types
             **kwargs: Additional parsing options
         """
+        # Store original markup and settings
+        self.original_encoding = from_encoding
+        self.exclude_encodings = exclude_encodings or []
+        self.element_classes = element_classes or {}
+        self.builder_features = features
+        self.contains_replacement_characters = False
+
         # Intelligent markup handling
         self.markup = self._preprocess_markup(markup, from_encoding)
         self.features = features
@@ -50,13 +60,24 @@ class Scout:
 
         # Parse that HTML! 🎯
         self._soup = self.parser.parse(self.markup)
-
+
+        # Set up the root element properly
+        if hasattr(self._soup, 'name'):
+            self.name = self._soup.name
+        else:
+            self.name = '[document]'
+
         # BeautifulSoup-like attributes
-        self.name = self._soup.name if hasattr(self._soup, 'name') else None
         self.attrs = self._soup.attrs if hasattr(self._soup, 'attrs') else {}
-
-        # Advanced parsing options
+        self.contents = self._soup.contents if hasattr(self._soup, 'contents') else []
+        self.parent = None
+        self.next_sibling = None
+        self.previous_sibling = None
+
+        # Advanced parsing options and caching
         self._cache = {}
+        self._tag_name_cache = {}
+        self._css_selector_cache = {}
 
         # Text and web analyzers
         self.text_analyzer = ScoutTextAnalyzer()
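The Scout constructor now accepts exclude_encodings and element_classes and exposes BeautifulSoup-style attributes (name, attrs, contents, parent, siblings) plus extra lookup caches. A small sketch of the extended signature; the import path mirrors the file location in this diff and may differ from the package's public re-export:

    # Sketch only: import path assumed from webscout/scout/core/scout.py.
    from webscout.scout.core.scout import Scout

    html = "<html><head><title>Demo</title></head><body><p>Hello</p></body></html>"
    doc = Scout(
        html,
        features="html.parser",
        exclude_encodings=["latin-1"],  # new in 8.3.6: encodings to avoid
        element_classes={},             # new in 8.3.6: custom classes per element type
    )

    print(doc.name)    # '[document]' when the parsed root has no tag name
    print(doc.attrs)   # BeautifulSoup-like attribute dict
    print(doc.find("title")[0].get_text())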
webscout/version.py CHANGED
@@ -1,2 +1,2 @@
-__version__ = "8.3.5"
+__version__ = "8.3.6"
 __prog__ = "webscout"
webscout/webscout_search.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 # import logging
 import json
-from urllib.parse import quote
+import os
 import warnings
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timezone
@@ -13,10 +13,17 @@ from random import choice, shuffle
 from threading import Event
 from time import sleep, time
 from types import TracebackType
-from typing import Any, cast
-import os
-from typing import Literal, Iterator
+from typing import Any, Literal
+from urllib.parse import quote
+
 from webscout.litagent import LitAgent
+
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
 import curl_cffi.requests  # type: ignore
 
 try:
@@ -28,7 +35,7 @@ try:
 except ImportError:
     LXML_AVAILABLE = False
 
-from .exceptions import ConversationLimitException, WebscoutE, RatelimitE, TimeoutE
+from .exceptions import RatelimitE, TimeoutE, WebscoutE
 from .utils import (
     _calculate_distance,
     _expand_proxy_tb_alias,
@@ -1173,4 +1180,4 @@ class WEBS:
                 "visibility_m": hour.get("visibility"),
             })
 
-        return formatted_data
+        return formatted_data
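The trio-before-curl_cffi guard introduced above is repeated in webscout_search_async.py and yep_search.py below. Applications that combine webscout with eventlet-style monkey patching can mirror the same ordering at the top of their own entry point; a sketch, assuming WEBS is re-exported at the package top level as in the project README and that .text() follows the usual keyword/max_results search signature:

    # Same guard webscout 8.3.6 applies internally: import trio (if installed)
    # before curl_cffi so eventlet socket monkey-patching does not conflict.
    # See: https://github.com/python-trio/trio/issues/3015
    try:
        import trio  # noqa: F401
    except ImportError:
        pass  # trio is optional

    import curl_cffi.requests  # safe to import after the guard

    from webscout import WEBS  # assumed top-level re-export, as shown in the README

    with WEBS() as webs:
        # Hypothetical query; the .text() signature is assumed, not taken from this diff.
        results = webs.text("python web scraping", max_results=5)
        print(results)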
webscout/webscout_search_async.py CHANGED
@@ -3,14 +3,19 @@ from __future__ import annotations
 import asyncio
 import os
 import warnings
-from datetime import datetime, timezone
 from functools import cached_property
 from itertools import cycle
 from random import choice, shuffle
 from time import time
 from types import TracebackType
-from typing import Any, Dict, List, Optional, Type, Union, cast, AsyncIterator
-
+from typing import Any, Dict, List, Optional, Type, Union
+
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
 import curl_cffi.requests
 from lxml.etree import _Element
 from lxml.html import HTMLParser as LHTMLParser
@@ -18,18 +23,15 @@ from lxml.html import document_fromstring
 
 from webscout.litagent.agent import LitAgent
 
-from .exceptions import ConversationLimitException, RatelimitE, TimeoutE, WebscoutE
+from .exceptions import RatelimitE, TimeoutE, WebscoutE
 from .utils import (
     _expand_proxy_tb_alias,
     _extract_vqd,
     _normalize,
     _normalize_url,
-    json_loads,
 )
 
 
-
-
 class AsyncWEBS:
     """Asynchronous webscout class to get search results."""
 
@@ -644,4 +646,4 @@ class AsyncWEBS:
         TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
         """
         # These methods are not implemented in the async version yet
-        raise NotImplementedError("aweather method is not implemented yet")
+        raise NotImplementedError("aweather method is not implemented yet")
webscout/yep_search.py CHANGED
@@ -1,9 +1,17 @@
-from curl_cffi.requests import Session
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional
 from urllib.parse import urlencode
+
+from curl_cffi.requests import Session
+
 from webscout.litagent import LitAgent
-from typing import List, Dict, Optional, Tuple
-from concurrent.futures import ThreadPoolExecutor
-import json
+
 
 class YepSearch:
     """Yep.com search class to get search results."""
@@ -335,4 +343,4 @@ if __name__ == "__main__":
     print("---" * 30)
     print(image_results)
     print("---" * 30)
-    print(suggestions)
+    print(suggestions)
{webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webscout
-Version: 8.3.5
+Version: 8.3.6
 Summary: Search for anything using Google, DuckDuckGo, phind.com, Contains AI models, can transcribe yt videos, temporary email and phone number generation, has TTS support, webai (terminal gpt and open interpreter) and offline LLMs and more
 Author-email: OEvortex <helpingai5@gmail.com>
 License: HelpingAI
@@ -73,6 +73,7 @@ Requires-Dist: tiktoken; extra == "api"
 Requires-Dist: motor; extra == "api"
 Requires-Dist: jinja2; extra == "api"
 Requires-Dist: supabase; extra == "api"
+Requires-Dist: websockets>=11.0; extra == "api"
 Dynamic: license-file
 
 <div align="center">