webscout-8.3.5-py3-none-any.whl → webscout-8.3.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of webscout might be problematic.

Files changed (63)
  1. webscout/Bard.py +12 -6
  2. webscout/DWEBS.py +66 -57
  3. webscout/Provider/{UNFINISHED → AISEARCH}/PERPLEXED_search.py +34 -74
  4. webscout/Provider/AISEARCH/__init__.py +1 -1
  5. webscout/Provider/Deepinfra.py +6 -0
  6. webscout/Provider/Flowith.py +6 -1
  7. webscout/Provider/GithubChat.py +1 -0
  8. webscout/Provider/GptOss.py +207 -0
  9. webscout/Provider/Kimi.py +445 -0
  10. webscout/Provider/Netwrck.py +3 -6
  11. webscout/Provider/OPENAI/README.md +2 -1
  12. webscout/Provider/OPENAI/TogetherAI.py +50 -55
  13. webscout/Provider/OPENAI/__init__.py +4 -2
  14. webscout/Provider/OPENAI/copilot.py +20 -4
  15. webscout/Provider/OPENAI/deepinfra.py +6 -0
  16. webscout/Provider/OPENAI/e2b.py +60 -8
  17. webscout/Provider/OPENAI/flowith.py +4 -3
  18. webscout/Provider/OPENAI/generate_api_key.py +48 -0
  19. webscout/Provider/OPENAI/gptoss.py +288 -0
  20. webscout/Provider/OPENAI/kimi.py +469 -0
  21. webscout/Provider/OPENAI/netwrck.py +8 -12
  22. webscout/Provider/OPENAI/refact.py +274 -0
  23. webscout/Provider/OPENAI/textpollinations.py +3 -6
  24. webscout/Provider/OPENAI/toolbaz.py +1 -0
  25. webscout/Provider/TTI/bing.py +14 -2
  26. webscout/Provider/TTI/together.py +10 -9
  27. webscout/Provider/TTS/README.md +0 -1
  28. webscout/Provider/TTS/__init__.py +0 -1
  29. webscout/Provider/TTS/base.py +479 -159
  30. webscout/Provider/TTS/deepgram.py +409 -156
  31. webscout/Provider/TTS/elevenlabs.py +425 -111
  32. webscout/Provider/TTS/freetts.py +317 -140
  33. webscout/Provider/TTS/gesserit.py +192 -128
  34. webscout/Provider/TTS/murfai.py +248 -113
  35. webscout/Provider/TTS/openai_fm.py +347 -129
  36. webscout/Provider/TTS/speechma.py +620 -586
  37. webscout/Provider/TextPollinationsAI.py +3 -6
  38. webscout/Provider/TogetherAI.py +50 -55
  39. webscout/Provider/UNFINISHED/VercelAIGateway.py +339 -0
  40. webscout/Provider/__init__.py +2 -90
  41. webscout/Provider/cerebras.py +83 -33
  42. webscout/Provider/copilot.py +42 -23
  43. webscout/Provider/toolbaz.py +1 -0
  44. webscout/conversation.py +22 -20
  45. webscout/sanitize.py +14 -10
  46. webscout/scout/README.md +20 -23
  47. webscout/scout/core/crawler.py +125 -38
  48. webscout/scout/core/scout.py +26 -5
  49. webscout/version.py +1 -1
  50. webscout/webscout_search.py +13 -6
  51. webscout/webscout_search_async.py +10 -8
  52. webscout/yep_search.py +13 -5
  53. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/METADATA +2 -1
  54. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/RECORD +59 -56
  55. webscout/Provider/Glider.py +0 -225
  56. webscout/Provider/OPENAI/c4ai.py +0 -394
  57. webscout/Provider/OPENAI/glider.py +0 -330
  58. webscout/Provider/TTS/sthir.py +0 -94
  59. /webscout/Provider/{samurai.py → UNFINISHED/samurai.py} +0 -0
  60. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/WHEEL +0 -0
  61. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/entry_points.txt +0 -0
  62. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/licenses/LICENSE.md +0 -0
  63. {webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/top_level.txt +0 -0
webscout/scout/core/crawler.py CHANGED
@@ -1,5 +1,5 @@
 """
-Scout Crawler Module
+Scout Crawler Module - Ultra Advanced Web Crawling System
 """
 
 import concurrent.futures
@@ -7,18 +7,82 @@ import urllib.parse
 import time
 import hashlib
 import re
+import json
+import sqlite3
+import threading
+import queue
+import logging
+import mimetypes
+import pickle
+import asyncio
+import aiohttp
+import random
 from urllib import robotparser
-from datetime import datetime
-from typing import Dict, List, Optional, Union
-from webscout.litagent import LitAgent
-from curl_cffi.requests import Session
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Union, Set, Tuple, Callable, Any
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+
+try:
+    from webscout.litagent import LitAgent
+except ImportError:
+    LitAgent = None
+
+try:
+    from curl_cffi.requests import Session
+except ImportError:
+    import requests
+    Session = requests.Session
 
 from .scout import Scout
+from .text_analyzer import ScoutTextAnalyzer
+
 
+@dataclass
+class CrawlConfig:
+    """Configuration for the crawler."""
+    max_pages: int = 1000
+    max_depth: int = 10
+    delay: float = 0.5
+    obey_robots: bool = True
+    crawl_subdomains: bool = True
+    max_workers: int = 10
+    timeout: int = 30
+    retry_attempts: int = 3
+    include_external_links: bool = False
+    extract_metadata: bool = True
+    extract_structured_data: bool = True
+    extract_semantic_content: bool = True
+
+
+@dataclass
+class PageData:
+    """Comprehensive page data for LLM training."""
+    url: str
+    title: str
+    text: str
+    clean_text: str
+    markdown_text: str
+    links: List[str]
+    internal_links: List[str]
+    external_links: List[str]
+    metadata: Dict[str, Any]
+    structured_data: Dict[str, Any]
+    semantic_content: Dict[str, Any]
+    headers: Dict[str, str]
+    status_code: int
+    content_type: str
+    language: str
+    timestamp: str
+    depth: int
+    word_count: int
+
 
 class ScoutCrawler:
     """
-    Advanced web crawling utility for Scout library.
+    Ultra-advanced web crawling utility optimized for LLM data collection.
     """
     def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None, session: Optional[Session] = None, delay: float = 0.5, obey_robots: bool = True, allowed_domains: Optional[List[str]] = None):
         """
@@ -33,13 +97,7 @@ class ScoutCrawler:
         self.max_pages = max_pages
         self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
             "script",
-            "style",
-            "header",
-            "footer",
-            "nav",
-            "aside",
-            "form",
-            "button",
+            "style"
         ]
         self.visited_urls = set()
         self.crawled_pages = []
@@ -50,7 +108,10 @@ class ScoutCrawler:
         self.session.headers.setdefault("User-Agent", self.agent.chrome())
         self.delay = delay
         self.obey_robots = obey_robots
-        self.allowed_domains = allowed_domains or [urllib.parse.urlparse(base_url).netloc]
+        # Allow crawling of subdomains by default
+        base_domain = urllib.parse.urlparse(base_url).netloc.split('.')
+        self.base_domain = '.'.join(base_domain[-2:]) if len(base_domain) > 1 else base_domain[0]
+        self.allowed_domains = allowed_domains or [self.base_domain]
         self.last_request_time = 0
         self.url_hashes = set()
         if obey_robots:
@@ -84,7 +145,8 @@ class ScoutCrawler:
         parsed_url = urllib.parse.urlparse(url)
         if parsed_url.scheme not in ["http", "https"]:
             return False
-        if parsed_url.netloc not in self.allowed_domains:
+        # Allow crawling subdomains
+        if not parsed_url.netloc.endswith(self.base_domain):
             return False
         if self.obey_robots and self.robots:
             return self.robots.can_fetch("*", url)
@@ -127,6 +189,9 @@ class ScoutCrawler:
         """
         if url in self.visited_urls or self._is_duplicate(url):
             return {}
+        # Log URL to crawl
+        print(f"Attempting to crawl URL: {url} (depth: {depth})")
+
        # Throttle requests
         now = time.time()
         if self.last_request_time:
@@ -142,18 +207,38 @@ class ScoutCrawler:
         scout = Scout(response.content, features="lxml")
         title_result = scout.find("title")
         title = title_result[0].get_text() if title_result else ""
+
+        # Remove only script and style tags before extracting text
         for tag_name in self.tags_to_remove:
             for tag in scout._soup.find_all(tag_name):
-                tag.extract()
+                tag.decompose()
+
         visible_text = self._extract_main_text(scout._soup)
+
+        # Extract links from header, footer, nav, etc.
+        essential_links = []
+        for essential_tag in ['header', 'nav', 'footer']:
+            elements = scout.find_all(essential_tag)
+            for element in elements:
+                links = element.find_all('a', href=True)
+                essential_links.extend(
+                    urllib.parse.urljoin(url, link.get('href'))
+                    for link in links
+                    if link.get('href') and self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
+                )
+
+        all_links = [
+            urllib.parse.urljoin(url, link.get('href'))
+            for link in scout.find_all('a', href=True)
+            if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
+        ]
+
+        combined_links = list(set(all_links + essential_links))
+
         page_info = {
             'url': url,
             'title': title,
-            'links': [
-                urllib.parse.urljoin(url, link.get('href'))
-                for link in scout.find_all('a', href=True)
-                if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
-            ],
+            'links': combined_links,
             'text': visible_text,
             'depth': depth,
             'timestamp': datetime.utcnow().isoformat(),
@@ -178,7 +263,7 @@ class ScoutCrawler:
         submitted_links: set[str] = set()
 
         while futures:
-            if len(self.visited_urls) >= self.max_pages:
+            if self.max_pages is not None and len(self.visited_urls) >= self.max_pages:
                 break
             done, not_done = concurrent.futures.wait(
                 futures, return_when=concurrent.futures.FIRST_COMPLETED
@@ -190,21 +275,23 @@ class ScoutCrawler:
 
                 if page_info:
                     yield page_info
+
+                    if self.max_pages is not None and len(self.visited_urls) >= self.max_pages:
+                        return
 
-                    if len(self.visited_urls) >= self.max_pages:
-                        return
-
-                    for link in page_info.get("links", []):
-                        if (
-                            len(self.visited_urls) < self.max_pages
-                            and link not in self.visited_urls
-                            and link not in submitted_links
-                        ):
-                            submitted_links.add(link)
-                            futures.add(
-                                executor.submit(
-                                    self._crawl_page,
-                                    link,
-                                    page_info.get("depth", 0) + 1,
+                    for link in page_info.get("links", []):
+                        if (
+                            (self.max_pages is None or len(self.visited_urls) < self.max_pages)
+                            and link not in self.visited_urls
+                            and link not in submitted_links
+                        ):
+                            submitted_links.add(link)
+                            futures.add(
+                                executor.submit(
+                                    self._crawl_page,
+                                    link,
+                                    page_info.get("depth", 0) + 1,
+                                )
                                 )
-                                )
+                else:
+                    print(f"No page info retrieved from crawling")
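Taken together, the crawler changes above widen the default crawl scope from the exact netloc to the base domain (so a subdomain such as docs.example.com is accepted when the base URL is example.com), keep only script/style in the default tag-removal list, and tolerate max_pages=None. A minimal usage sketch under those changes; the crawl() generator name and the module import path are assumptions, not confirmed by this diff:

    # Sketch only: assumes ScoutCrawler is importable from this module path and
    # that the page-yielding generator is named crawl(); neither is confirmed here.
    from webscout.scout.core.crawler import ScoutCrawler

    crawler = ScoutCrawler(
        "https://example.com",  # base_domain becomes "example.com", so subdomains are in-scope
        max_pages=25,
        delay=0.5,
        obey_robots=True,
    )

    for page in crawler.crawl():
        # Each yielded dict carries 'url', 'title', 'links', 'text', 'depth' and 'timestamp'.
        print(page["depth"], page["url"], len(page["links"]))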
webscout/scout/core/scout.py CHANGED
@@ -24,7 +24,8 @@ class Scout:
     Enhanced with advanced features and intelligent parsing.
     """
 
-    def __init__(self, markup="", features='html.parser', from_encoding=None, **kwargs):
+    def __init__(self, markup="", features='html.parser', from_encoding=None,
+                 exclude_encodings=None, element_classes=None, **kwargs):
         """
         Initialize Scout with HTML content.
 
@@ -32,8 +33,17 @@ class Scout:
             markup (str): HTML content to parse
             features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
             from_encoding (str): Source encoding (if known)
+            exclude_encodings (list): Encodings to avoid
+            element_classes (dict): Custom classes for different element types
             **kwargs: Additional parsing options
         """
+        # Store original markup and settings
+        self.original_encoding = from_encoding
+        self.exclude_encodings = exclude_encodings or []
+        self.element_classes = element_classes or {}
+        self.builder_features = features
+        self.contains_replacement_characters = False
+
         # Intelligent markup handling
         self.markup = self._preprocess_markup(markup, from_encoding)
         self.features = features
@@ -50,13 +60,24 @@ class Scout:
 
         # Parse that HTML! 🎯
         self._soup = self.parser.parse(self.markup)
-
+
+        # Set up the root element properly
+        if hasattr(self._soup, 'name'):
+            self.name = self._soup.name
+        else:
+            self.name = '[document]'
+
         # BeautifulSoup-like attributes
-        self.name = self._soup.name if hasattr(self._soup, 'name') else None
         self.attrs = self._soup.attrs if hasattr(self._soup, 'attrs') else {}
-
-        # Advanced parsing options
+        self.contents = self._soup.contents if hasattr(self._soup, 'contents') else []
+        self.parent = None
+        self.next_sibling = None
+        self.previous_sibling = None
+
+        # Advanced parsing options and caching
         self._cache = {}
+        self._tag_name_cache = {}
+        self._css_selector_cache = {}
 
         # Text and web analyzers
         self.text_analyzer = ScoutTextAnalyzer()
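The Scout constructor now accepts exclude_encodings and element_classes and exposes BeautifulSoup-style attributes (name, attrs, contents, parent, siblings) plus extra lookup caches. A small sketch of the extended signature; the import path mirrors the file location in this diff and may differ from the package's public re-export:

    # Sketch only: import path assumed from webscout/scout/core/scout.py.
    from webscout.scout.core.scout import Scout

    html = "<html><head><title>Demo</title></head><body><p>Hello</p></body></html>"
    doc = Scout(
        html,
        features="html.parser",
        exclude_encodings=["latin-1"],  # new in 8.3.6: encodings to avoid
        element_classes={},             # new in 8.3.6: custom classes per element type
    )

    print(doc.name)    # '[document]' when the parsed root has no tag name
    print(doc.attrs)   # BeautifulSoup-like attribute dict
    print(doc.find("title")[0].get_text())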
webscout/version.py CHANGED
@@ -1,2 +1,2 @@
-__version__ = "8.3.5"
+__version__ = "8.3.6"
 __prog__ = "webscout"
webscout/webscout_search.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 # import logging
 import json
-from urllib.parse import quote
+import os
 import warnings
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timezone
@@ -13,10 +13,17 @@ from random import choice, shuffle
 from threading import Event
 from time import sleep, time
 from types import TracebackType
-from typing import Any, cast
-import os
-from typing import Literal, Iterator
+from typing import Any, Literal
+from urllib.parse import quote
+
 from webscout.litagent import LitAgent
+
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
 import curl_cffi.requests  # type: ignore
 
 try:
@@ -28,7 +35,7 @@ try:
 except ImportError:
     LXML_AVAILABLE = False
 
-from .exceptions import ConversationLimitException, WebscoutE, RatelimitE, TimeoutE
+from .exceptions import RatelimitE, TimeoutE, WebscoutE
 from .utils import (
     _calculate_distance,
     _expand_proxy_tb_alias,
@@ -1173,4 +1180,4 @@ class WEBS:
                 "visibility_m": hour.get("visibility"),
             })
 
-        return formatted_data
+        return formatted_data
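The trio-before-curl_cffi guard introduced above is repeated in webscout_search_async.py and yep_search.py below. Applications that combine webscout with eventlet-style monkey patching can mirror the same ordering at the top of their own entry point; a sketch, assuming WEBS is re-exported at the package top level as in the project README and that .text() follows the usual keyword/max_results search signature:

    # Same guard webscout 8.3.6 applies internally: import trio (if installed)
    # before curl_cffi so eventlet socket monkey-patching does not conflict.
    # See: https://github.com/python-trio/trio/issues/3015
    try:
        import trio  # noqa: F401
    except ImportError:
        pass  # trio is optional

    import curl_cffi.requests  # safe to import after the guard

    from webscout import WEBS  # assumed top-level re-export, as shown in the README

    with WEBS() as webs:
        # Hypothetical query; the .text() signature is assumed, not taken from this diff.
        results = webs.text("python web scraping", max_results=5)
        print(results)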
webscout/webscout_search_async.py CHANGED
@@ -3,14 +3,19 @@ from __future__ import annotations
 import asyncio
 import os
 import warnings
-from datetime import datetime, timezone
 from functools import cached_property
 from itertools import cycle
 from random import choice, shuffle
 from time import time
 from types import TracebackType
-from typing import Any, Dict, List, Optional, Type, Union, cast, AsyncIterator
-
+from typing import Any, Dict, List, Optional, Type, Union
+
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
 import curl_cffi.requests
 from lxml.etree import _Element
 from lxml.html import HTMLParser as LHTMLParser
@@ -18,18 +23,15 @@ from lxml.html import document_fromstring
 
 from webscout.litagent.agent import LitAgent
 
-from .exceptions import ConversationLimitException, RatelimitE, TimeoutE, WebscoutE
+from .exceptions import RatelimitE, TimeoutE, WebscoutE
 from .utils import (
     _expand_proxy_tb_alias,
     _extract_vqd,
     _normalize,
     _normalize_url,
-    json_loads,
 )
 
 
-
-
 class AsyncWEBS:
     """Asynchronous webscout class to get search results."""
 
@@ -644,4 +646,4 @@ class AsyncWEBS:
         TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
         """
         # These methods are not implemented in the async version yet
-        raise NotImplementedError("aweather method is not implemented yet")
+        raise NotImplementedError("aweather method is not implemented yet")
webscout/yep_search.py CHANGED
@@ -1,9 +1,17 @@
-from curl_cffi.requests import Session
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional
 from urllib.parse import urlencode
+
+from curl_cffi.requests import Session
+
 from webscout.litagent import LitAgent
-from typing import List, Dict, Optional, Tuple
-from concurrent.futures import ThreadPoolExecutor
-import json
+
 
 class YepSearch:
     """Yep.com search class to get search results."""
@@ -335,4 +343,4 @@ if __name__ == "__main__":
     print("---" * 30)
     print(image_results)
     print("---" * 30)
-    print(suggestions)
+    print(suggestions)
{webscout-8.3.5.dist-info → webscout-8.3.6.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: webscout
-Version: 8.3.5
+Version: 8.3.6
 Summary: Search for anything using Google, DuckDuckGo, phind.com, Contains AI models, can transcribe yt videos, temporary email and phone number generation, has TTS support, webai (terminal gpt and open interpreter) and offline LLMs and more
 Author-email: OEvortex <helpingai5@gmail.com>
 License: HelpingAI
@@ -73,6 +73,7 @@ Requires-Dist: tiktoken; extra == "api"
 Requires-Dist: motor; extra == "api"
 Requires-Dist: jinja2; extra == "api"
 Requires-Dist: supabase; extra == "api"
+Requires-Dist: websockets>=11.0; extra == "api"
 Dynamic: license-file
 
 <div align="center">