webscout 8.3.4-py3-none-any.whl → 8.3.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of webscout might be problematic; see the registry listing for more details.
- webscout/AIutel.py +52 -1016
- webscout/Bard.py +12 -6
- webscout/DWEBS.py +66 -57
- webscout/Provider/AISEARCH/PERPLEXED_search.py +214 -0
- webscout/Provider/AISEARCH/__init__.py +11 -10
- webscout/Provider/AISEARCH/felo_search.py +7 -3
- webscout/Provider/AISEARCH/scira_search.py +2 -0
- webscout/Provider/AISEARCH/stellar_search.py +53 -8
- webscout/Provider/Deepinfra.py +13 -1
- webscout/Provider/Flowith.py +6 -1
- webscout/Provider/GithubChat.py +1 -0
- webscout/Provider/GptOss.py +207 -0
- webscout/Provider/Kimi.py +445 -0
- webscout/Provider/Netwrck.py +3 -6
- webscout/Provider/OPENAI/README.md +2 -1
- webscout/Provider/OPENAI/TogetherAI.py +12 -8
- webscout/Provider/OPENAI/TwoAI.py +94 -1
- webscout/Provider/OPENAI/__init__.py +4 -4
- webscout/Provider/OPENAI/copilot.py +20 -4
- webscout/Provider/OPENAI/deepinfra.py +12 -0
- webscout/Provider/OPENAI/e2b.py +60 -8
- webscout/Provider/OPENAI/flowith.py +4 -3
- webscout/Provider/OPENAI/generate_api_key.py +48 -0
- webscout/Provider/OPENAI/gptoss.py +288 -0
- webscout/Provider/OPENAI/kimi.py +469 -0
- webscout/Provider/OPENAI/netwrck.py +8 -12
- webscout/Provider/OPENAI/refact.py +274 -0
- webscout/Provider/OPENAI/scirachat.py +4 -0
- webscout/Provider/OPENAI/textpollinations.py +11 -10
- webscout/Provider/OPENAI/toolbaz.py +1 -0
- webscout/Provider/OPENAI/venice.py +1 -0
- webscout/Provider/Perplexitylabs.py +163 -147
- webscout/Provider/Qodo.py +30 -6
- webscout/Provider/TTI/__init__.py +1 -0
- webscout/Provider/TTI/bing.py +14 -2
- webscout/Provider/TTI/together.py +11 -9
- webscout/Provider/TTI/venice.py +368 -0
- webscout/Provider/TTS/README.md +0 -1
- webscout/Provider/TTS/__init__.py +0 -1
- webscout/Provider/TTS/base.py +479 -159
- webscout/Provider/TTS/deepgram.py +409 -156
- webscout/Provider/TTS/elevenlabs.py +425 -111
- webscout/Provider/TTS/freetts.py +317 -140
- webscout/Provider/TTS/gesserit.py +192 -128
- webscout/Provider/TTS/murfai.py +248 -113
- webscout/Provider/TTS/openai_fm.py +347 -129
- webscout/Provider/TTS/speechma.py +620 -586
- webscout/Provider/TextPollinationsAI.py +11 -10
- webscout/Provider/TogetherAI.py +12 -4
- webscout/Provider/TwoAI.py +96 -2
- webscout/Provider/TypliAI.py +33 -27
- webscout/Provider/UNFINISHED/VercelAIGateway.py +339 -0
- webscout/Provider/UNFINISHED/fetch_together_models.py +6 -11
- webscout/Provider/Venice.py +1 -0
- webscout/Provider/WiseCat.py +18 -20
- webscout/Provider/__init__.py +2 -96
- webscout/Provider/cerebras.py +83 -33
- webscout/Provider/copilot.py +42 -23
- webscout/Provider/scira_chat.py +4 -0
- webscout/Provider/toolbaz.py +6 -10
- webscout/Provider/typefully.py +1 -11
- webscout/__init__.py +3 -15
- webscout/auth/__init__.py +19 -4
- webscout/auth/api_key_manager.py +189 -189
- webscout/auth/auth_system.py +25 -40
- webscout/auth/config.py +105 -6
- webscout/auth/database.py +377 -22
- webscout/auth/models.py +185 -130
- webscout/auth/request_processing.py +175 -11
- webscout/auth/routes.py +99 -2
- webscout/auth/server.py +9 -2
- webscout/auth/simple_logger.py +236 -0
- webscout/conversation.py +22 -20
- webscout/sanitize.py +1078 -0
- webscout/scout/README.md +20 -23
- webscout/scout/core/crawler.py +125 -38
- webscout/scout/core/scout.py +26 -5
- webscout/version.py +1 -1
- webscout/webscout_search.py +13 -6
- webscout/webscout_search_async.py +10 -8
- webscout/yep_search.py +13 -5
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/METADATA +10 -149
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/RECORD +88 -87
- webscout/Provider/Glider.py +0 -225
- webscout/Provider/OPENAI/README_AUTOPROXY.md +0 -238
- webscout/Provider/OPENAI/c4ai.py +0 -394
- webscout/Provider/OPENAI/glider.py +0 -330
- webscout/Provider/OPENAI/typegpt.py +0 -368
- webscout/Provider/OPENAI/uncovrAI.py +0 -477
- webscout/Provider/TTS/sthir.py +0 -94
- webscout/Provider/WritingMate.py +0 -273
- webscout/Provider/typegpt.py +0 -284
- webscout/Provider/uncovr.py +0 -333
- /webscout/Provider/{samurai.py → UNFINISHED/samurai.py} +0 -0
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/WHEEL +0 -0
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/entry_points.txt +0 -0
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/top_level.txt +0 -0
webscout/scout/README.md
CHANGED

@@ -1,27 +1,24 @@
+**🚀 The Most Advanced HTML Parser & Web Crawler for AI/LLM Data Collection**
 
-[](https://www.python.org/)
-[](https://opensource.org/licenses/MIT)
-[](https://github.com/OE-LUCIFER/Webscout)
-[](https://github.com/OE-LUCIFER/Webscout/wiki)
-[](https://github.com/OE-LUCIFER/Webscout/pulls)
+**🌟 Built for the Future • Powered by Intelligence • Trusted by Developers**
 
-</div>
 
 ## 📋 Overview
 
-Scout is
+Scout is an ultra-powerful, enterprise-grade HTML parsing and web crawling library designed for the AI era. Built with LLM data collection in mind, Scout provides unparalleled capabilities for extracting, analyzing, and processing web content at scale. With its BeautifulSoup-compatible API enhanced with modern features, Scout is the go-to solution for serious web scraping projects.
 
 <details open>
-<summary><b
+<summary><b>🌟 Why Scout is the Ultimate Choice</b></summary>
+
+- **🧠 LLM-Optimized Crawling**: Purpose-built for collecting high-quality training data for Large Language Models
+- **🌐 Subdomain Intelligence**: Automatically discovers and crawls subdomains (e.g., blog.example.com, docs.example.com)
+- **⚡ Lightning-Fast Performance**: Multi-threaded concurrent crawling with intelligent rate limiting
+- **🎯 Surgical Precision**: Advanced content extraction that preserves structure while removing noise
+- **🔍 Deep Analysis**: Built-in NLP capabilities for entity extraction, text analysis, and semantic understanding
+- **🛡️ Enterprise-Ready**: Robust error handling, retry mechanisms, and respect for robots.txt
+- **📊 Rich Data Extraction**: Captures metadata, structured data, semantic content, and more
+- **🔄 Format Flexibility**: Export to JSON, Markdown, CSV, or custom formats
+- **🎨 BeautifulSoup++ API**: Familiar interface with 10x more features
 
 </details>
 
@@ -46,7 +43,7 @@ pip install webscout
 Or install the latest version from GitHub:
 
 ```bash
-pip install git+https://github.com/
+pip install git+https://github.com/OEvortex/Webscout.git
 ```
 
 ## 🚀 Quick Start

@@ -361,7 +358,7 @@ cached_data = scout.cache('parsed_data')
 - `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
 - `_is_valid_url(url)`: Check if a URL is valid (internal method)
 
-For detailed API documentation, please refer to the [documentation](https://github.com/
+For detailed API documentation, please refer to the [documentation](https://github.com/OEvortex/Webscout/wiki).
 
 ## 🔧 Dependencies

@@ -396,9 +393,9 @@ This project is licensed under the MIT License - see the LICENSE file for detail
 <div align="center">
 <p>Made with ❤️ by the Webscout team</p>
 <p>
-<a href="https://github.com/
-<a href="https://github.com/
-<a href="https://github.com/
-<a href="https://github.com/
+<a href="https://github.com/OEvortex/Webscout">GitHub</a> •
+<a href="https://github.com/OEvortex/Webscout/wiki">Documentation</a> •
+<a href="https://github.com/OEvortex/Webscout/issues">Report Bug</a> •
+<a href="https://github.com/OEvortex/Webscout/issues">Request Feature</a>
 </p>
 </div>
webscout/scout/core/crawler.py
CHANGED

@@ -1,5 +1,5 @@
 """
-Scout Crawler Module
+Scout Crawler Module - Ultra Advanced Web Crawling System
 """
 
 import concurrent.futures

@@ -7,18 +7,82 @@ import urllib.parse
 import time
 import hashlib
 import re
+import json
+import sqlite3
+import threading
+import queue
+import logging
+import mimetypes
+import pickle
+import asyncio
+import aiohttp
+import random
 from urllib import robotparser
-from datetime import datetime
-from typing import Dict, List, Optional, Union
-from
-from
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Union, Set, Tuple, Callable, Any
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+
+try:
+    from webscout.litagent import LitAgent
+except ImportError:
+    LitAgent = None
+
+try:
+    from curl_cffi.requests import Session
+except ImportError:
+    import requests
+    Session = requests.Session
 
 from .scout import Scout
+from .text_analyzer import ScoutTextAnalyzer
+
 
+@dataclass
+class CrawlConfig:
+    """Configuration for the crawler."""
+    max_pages: int = 1000
+    max_depth: int = 10
+    delay: float = 0.5
+    obey_robots: bool = True
+    crawl_subdomains: bool = True
+    max_workers: int = 10
+    timeout: int = 30
+    retry_attempts: int = 3
+    include_external_links: bool = False
+    extract_metadata: bool = True
+    extract_structured_data: bool = True
+    extract_semantic_content: bool = True
+
+
+@dataclass
+class PageData:
+    """Comprehensive page data for LLM training."""
+    url: str
+    title: str
+    text: str
+    clean_text: str
+    markdown_text: str
+    links: List[str]
+    internal_links: List[str]
+    external_links: List[str]
+    metadata: Dict[str, Any]
+    structured_data: Dict[str, Any]
+    semantic_content: Dict[str, Any]
+    headers: Dict[str, str]
+    status_code: int
+    content_type: str
+    language: str
+    timestamp: str
+    depth: int
+    word_count: int
 
 class ScoutCrawler:
     """
+    Ultra-advanced web crawling utility optimized for LLM data collection.
     """
     def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None, session: Optional[Session] = None, delay: float = 0.5, obey_robots: bool = True, allowed_domains: Optional[List[str]] = None):
         """

@@ -33,13 +97,7 @@ class ScoutCrawler:
         self.max_pages = max_pages
         self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
             "script",
-            "style"
-            "header",
-            "footer",
-            "nav",
-            "aside",
-            "form",
-            "button",
+            "style"
         ]
         self.visited_urls = set()
         self.crawled_pages = []

@@ -50,7 +108,10 @@ class ScoutCrawler:
         self.session.headers.setdefault("User-Agent", self.agent.chrome())
         self.delay = delay
         self.obey_robots = obey_robots
+        # Allow crawling of subdomains by default
+        base_domain = urllib.parse.urlparse(base_url).netloc.split('.')
+        self.base_domain = '.'.join(base_domain[-2:]) if len(base_domain) > 1 else base_domain[0]
+        self.allowed_domains = allowed_domains or [self.base_domain]
         self.last_request_time = 0
         self.url_hashes = set()
         if obey_robots:

@@ -84,7 +145,8 @@ class ScoutCrawler:
         parsed_url = urllib.parse.urlparse(url)
         if parsed_url.scheme not in ["http", "https"]:
             return False
+        # Allow crawling subdomains
+        if not parsed_url.netloc.endswith(self.base_domain):
             return False
         if self.obey_robots and self.robots:
             return self.robots.can_fetch("*", url)

@@ -127,6 +189,9 @@ class ScoutCrawler:
         """
         if url in self.visited_urls or self._is_duplicate(url):
             return {}
+        # Log URL to crawl
+        print(f"Attempting to crawl URL: {url} (depth: {depth})")
+
         # Throttle requests
         now = time.time()
         if self.last_request_time:

@@ -142,18 +207,38 @@ class ScoutCrawler:
             scout = Scout(response.content, features="lxml")
             title_result = scout.find("title")
             title = title_result[0].get_text() if title_result else ""
+
+            # Remove only script and style tags before extracting text
             for tag_name in self.tags_to_remove:
                 for tag in scout._soup.find_all(tag_name):
-                    tag.
+                    tag.decompose()
+
             visible_text = self._extract_main_text(scout._soup)
+
+            # Extract links from header, footer, nav, etc.
+            essential_links = []
+            for essential_tag in ['header', 'nav', 'footer']:
+                elements = scout.find_all(essential_tag)
+                for element in elements:
+                    links = element.find_all('a', href=True)
+                    essential_links.extend(
+                        urllib.parse.urljoin(url, link.get('href'))
+                        for link in links
+                        if link.get('href') and self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
+                    )
+
+            all_links = [
+                urllib.parse.urljoin(url, link.get('href'))
+                for link in scout.find_all('a', href=True)
+                if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
+            ]
+
+            combined_links = list(set(all_links + essential_links))
+
             page_info = {
                 'url': url,
                 'title': title,
-                'links':
-                    urllib.parse.urljoin(url, link.get('href'))
-                    for link in scout.find_all('a', href=True)
-                    if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
-                ],
+                'links': combined_links,
                 'text': visible_text,
                 'depth': depth,
                 'timestamp': datetime.utcnow().isoformat(),

@@ -178,7 +263,7 @@ class ScoutCrawler:
         submitted_links: set[str] = set()
 
         while futures:
-            if len(self.visited_urls) >= self.max_pages:
+            if self.max_pages is not None and len(self.visited_urls) >= self.max_pages:
                 break
             done, not_done = concurrent.futures.wait(
                 futures, return_when=concurrent.futures.FIRST_COMPLETED

@@ -190,21 +275,23 @@ class ScoutCrawler:
 
                 if page_info:
                     yield page_info
+
+                    if self.max_pages is not None and len(self.visited_urls) >= self.max_pages:
+                        return
 
-                    link,
-                    page_info.get("depth", 0) + 1,
+                    for link in page_info.get("links", []):
+                        if (
+                            (self.max_pages is None or len(self.visited_urls) < self.max_pages)
+                            and link not in self.visited_urls
+                            and link not in submitted_links
+                        ):
+                            submitted_links.add(link)
+                            futures.add(
+                                executor.submit(
+                                    self._crawl_page,
+                                    link,
+                                    page_info.get("depth", 0) + 1,
+                                )
                             )
+                else:
+                    print(f"No page info retrieved from crawling")
webscout/scout/core/scout.py
CHANGED

@@ -24,7 +24,8 @@ class Scout:
     Enhanced with advanced features and intelligent parsing.
     """
 
-    def __init__(self, markup="", features='html.parser', from_encoding=None,
+    def __init__(self, markup="", features='html.parser', from_encoding=None,
+                 exclude_encodings=None, element_classes=None, **kwargs):
         """
         Initialize Scout with HTML content.
 

@@ -32,8 +33,17 @@
             markup (str): HTML content to parse
             features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
             from_encoding (str): Source encoding (if known)
+            exclude_encodings (list): Encodings to avoid
+            element_classes (dict): Custom classes for different element types
             **kwargs: Additional parsing options
         """
+        # Store original markup and settings
+        self.original_encoding = from_encoding
+        self.exclude_encodings = exclude_encodings or []
+        self.element_classes = element_classes or {}
+        self.builder_features = features
+        self.contains_replacement_characters = False
+
         # Intelligent markup handling
         self.markup = self._preprocess_markup(markup, from_encoding)
         self.features = features

@@ -50,13 +60,24 @@
 
         # Parse that HTML! 🎯
         self._soup = self.parser.parse(self.markup)
+
+        # Set up the root element properly
+        if hasattr(self._soup, 'name'):
+            self.name = self._soup.name
+        else:
+            self.name = '[document]'
+
         # BeautifulSoup-like attributes
-        self.name = self._soup.name if hasattr(self._soup, 'name') else None
         self.attrs = self._soup.attrs if hasattr(self._soup, 'attrs') else {}
+        self.contents = self._soup.contents if hasattr(self._soup, 'contents') else []
+        self.parent = None
+        self.next_sibling = None
+        self.previous_sibling = None
+
+        # Advanced parsing options and caching
         self._cache = {}
+        self._tag_name_cache = {}
+        self._css_selector_cache = {}
 
         # Text and web analyzers
         self.text_analyzer = ScoutTextAnalyzer()
webscout/version.py
CHANGED

@@ -1,2 +1,2 @@
-__version__ = "8.3.4"
+__version__ = "8.3.6"
 __prog__ = "webscout"
webscout/webscout_search.py
CHANGED

@@ -2,7 +2,7 @@ from __future__ import annotations
 
 # import logging
 import json
+import os
 import warnings
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timezone

@@ -13,10 +13,17 @@ from random import choice, shuffle
 from threading import Event
 from time import sleep, time
 from types import TracebackType
-from typing import Any,
-import
+from typing import Any, Literal
+from urllib.parse import quote
+
 from webscout.litagent import LitAgent
+
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
 import curl_cffi.requests  # type: ignore
 
 try:

@@ -28,7 +35,7 @@ try:
 except ImportError:
     LXML_AVAILABLE = False
 
-from .exceptions import
+from .exceptions import RatelimitE, TimeoutE, WebscoutE
 from .utils import (
     _calculate_distance,
     _expand_proxy_tb_alias,

@@ -1173,4 +1180,4 @@ class WEBS:
                     "visibility_m": hour.get("visibility"),
                 })
 
-        return formatted_data
+        return formatted_data
webscout/webscout_search_async.py
CHANGED

@@ -3,14 +3,19 @@ from __future__ import annotations
 import asyncio
 import os
 import warnings
-from datetime import datetime, timezone
 from functools import cached_property
 from itertools import cycle
 from random import choice, shuffle
 from time import time
 from types import TracebackType
-from typing import Any, Dict, List, Optional, Type, Union
+from typing import Any, Dict, List, Optional, Type, Union
+
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
 import curl_cffi.requests
 from lxml.etree import _Element
 from lxml.html import HTMLParser as LHTMLParser

@@ -18,18 +23,15 @@ from lxml.html import document_fromstring
 
 from webscout.litagent.agent import LitAgent
 
-from .exceptions import
+from .exceptions import RatelimitE, TimeoutE, WebscoutE
 from .utils import (
     _expand_proxy_tb_alias,
     _extract_vqd,
     _normalize,
     _normalize_url,
-    json_loads,
 )
 
 
 class AsyncWEBS:
     """Asynchronous webscout class to get search results."""
 

@@ -644,4 +646,4 @@
             TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
         """
         # These methods are not implemented in the async version yet
-        raise NotImplementedError("aweather method is not implemented yet")
+        raise NotImplementedError("aweather method is not implemented yet")
webscout/yep_search.py
CHANGED

@@ -1,9 +1,17 @@
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional
 from urllib.parse import urlencode
+
+from curl_cffi.requests import Session
+
 from webscout.litagent import LitAgent
-
-from concurrent.futures import ThreadPoolExecutor
-import json
+
 
 class YepSearch:
     """Yep.com search class to get search results."""

@@ -335,4 +343,4 @@ if __name__ == "__main__":
     print("---" * 30)
     print(image_results)
     print("---" * 30)
-    print(suggestions)
+    print(suggestions)