webscout 8.2.8 → 8.3 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic.
- webscout/AIauto.py +34 -16
- webscout/AIbase.py +96 -37
- webscout/AIutel.py +491 -87
- webscout/Bard.py +441 -323
- webscout/Extra/GitToolkit/__init__.py +10 -10
- webscout/Extra/YTToolkit/ytapi/video.py +232 -232
- webscout/Litlogger/README.md +10 -0
- webscout/Litlogger/__init__.py +7 -59
- webscout/Litlogger/formats.py +4 -0
- webscout/Litlogger/handlers.py +103 -0
- webscout/Litlogger/levels.py +13 -0
- webscout/Litlogger/logger.py +92 -0
- webscout/Provider/AISEARCH/Perplexity.py +332 -358
- webscout/Provider/AISEARCH/felo_search.py +9 -35
- webscout/Provider/AISEARCH/genspark_search.py +30 -56
- webscout/Provider/AISEARCH/hika_search.py +4 -16
- webscout/Provider/AISEARCH/iask_search.py +410 -436
- webscout/Provider/AISEARCH/monica_search.py +4 -30
- webscout/Provider/AISEARCH/scira_search.py +6 -32
- webscout/Provider/AISEARCH/webpilotai_search.py +38 -64
- webscout/Provider/Blackboxai.py +155 -35
- webscout/Provider/ChatSandbox.py +2 -1
- webscout/Provider/Deepinfra.py +339 -339
- webscout/Provider/ExaChat.py +358 -358
- webscout/Provider/Gemini.py +169 -169
- webscout/Provider/GithubChat.py +1 -2
- webscout/Provider/Glider.py +3 -3
- webscout/Provider/HeckAI.py +172 -82
- webscout/Provider/LambdaChat.py +1 -0
- webscout/Provider/MCPCore.py +7 -3
- webscout/Provider/OPENAI/BLACKBOXAI.py +421 -139
- webscout/Provider/OPENAI/Cloudflare.py +38 -21
- webscout/Provider/OPENAI/FalconH1.py +457 -0
- webscout/Provider/OPENAI/FreeGemini.py +35 -18
- webscout/Provider/OPENAI/NEMOTRON.py +34 -34
- webscout/Provider/OPENAI/PI.py +427 -0
- webscout/Provider/OPENAI/Qwen3.py +304 -0
- webscout/Provider/OPENAI/README.md +952 -1253
- webscout/Provider/OPENAI/TwoAI.py +374 -0
- webscout/Provider/OPENAI/__init__.py +7 -1
- webscout/Provider/OPENAI/ai4chat.py +73 -63
- webscout/Provider/OPENAI/api.py +869 -644
- webscout/Provider/OPENAI/base.py +2 -0
- webscout/Provider/OPENAI/c4ai.py +34 -13
- webscout/Provider/OPENAI/chatgpt.py +575 -556
- webscout/Provider/OPENAI/chatgptclone.py +512 -487
- webscout/Provider/OPENAI/chatsandbox.py +11 -6
- webscout/Provider/OPENAI/copilot.py +258 -0
- webscout/Provider/OPENAI/deepinfra.py +327 -318
- webscout/Provider/OPENAI/e2b.py +140 -104
- webscout/Provider/OPENAI/exaai.py +420 -411
- webscout/Provider/OPENAI/exachat.py +448 -443
- webscout/Provider/OPENAI/flowith.py +7 -3
- webscout/Provider/OPENAI/freeaichat.py +12 -8
- webscout/Provider/OPENAI/glider.py +15 -8
- webscout/Provider/OPENAI/groq.py +5 -2
- webscout/Provider/OPENAI/heckai.py +311 -307
- webscout/Provider/OPENAI/llmchatco.py +9 -7
- webscout/Provider/OPENAI/mcpcore.py +18 -9
- webscout/Provider/OPENAI/multichat.py +7 -5
- webscout/Provider/OPENAI/netwrck.py +16 -11
- webscout/Provider/OPENAI/oivscode.py +290 -0
- webscout/Provider/OPENAI/opkfc.py +507 -496
- webscout/Provider/OPENAI/pydantic_imports.py +172 -0
- webscout/Provider/OPENAI/scirachat.py +29 -17
- webscout/Provider/OPENAI/sonus.py +308 -303
- webscout/Provider/OPENAI/standardinput.py +442 -433
- webscout/Provider/OPENAI/textpollinations.py +18 -11
- webscout/Provider/OPENAI/toolbaz.py +419 -413
- webscout/Provider/OPENAI/typefully.py +17 -10
- webscout/Provider/OPENAI/typegpt.py +21 -11
- webscout/Provider/OPENAI/uncovrAI.py +477 -462
- webscout/Provider/OPENAI/utils.py +90 -79
- webscout/Provider/OPENAI/venice.py +435 -425
- webscout/Provider/OPENAI/wisecat.py +387 -381
- webscout/Provider/OPENAI/writecream.py +166 -163
- webscout/Provider/OPENAI/x0gpt.py +26 -37
- webscout/Provider/OPENAI/yep.py +384 -356
- webscout/Provider/PI.py +2 -1
- webscout/Provider/TTI/README.md +55 -101
- webscout/Provider/TTI/__init__.py +4 -9
- webscout/Provider/TTI/aiarta.py +365 -0
- webscout/Provider/TTI/artbit.py +0 -0
- webscout/Provider/TTI/base.py +64 -0
- webscout/Provider/TTI/fastflux.py +200 -0
- webscout/Provider/TTI/magicstudio.py +201 -0
- webscout/Provider/TTI/piclumen.py +203 -0
- webscout/Provider/TTI/pixelmuse.py +225 -0
- webscout/Provider/TTI/pollinations.py +221 -0
- webscout/Provider/TTI/utils.py +11 -0
- webscout/Provider/TTS/__init__.py +2 -1
- webscout/Provider/TTS/base.py +159 -159
- webscout/Provider/TTS/openai_fm.py +129 -0
- webscout/Provider/TextPollinationsAI.py +308 -308
- webscout/Provider/TwoAI.py +239 -44
- webscout/Provider/UNFINISHED/Youchat.py +330 -330
- webscout/Provider/UNFINISHED/puterjs.py +635 -0
- webscout/Provider/UNFINISHED/test_lmarena.py +119 -119
- webscout/Provider/Writecream.py +246 -246
- webscout/Provider/__init__.py +2 -2
- webscout/Provider/ai4chat.py +33 -8
- webscout/Provider/granite.py +41 -6
- webscout/Provider/koala.py +169 -169
- webscout/Provider/oivscode.py +309 -0
- webscout/Provider/samurai.py +3 -2
- webscout/Provider/scnet.py +1 -0
- webscout/Provider/typegpt.py +3 -3
- webscout/Provider/uncovr.py +368 -368
- webscout/client.py +70 -0
- webscout/litprinter/__init__.py +58 -58
- webscout/optimizers.py +419 -419
- webscout/scout/README.md +3 -1
- webscout/scout/core/crawler.py +134 -64
- webscout/scout/core/scout.py +148 -109
- webscout/scout/element.py +106 -88
- webscout/swiftcli/Readme.md +323 -323
- webscout/swiftcli/plugins/manager.py +9 -2
- webscout/version.py +1 -1
- webscout/zeroart/__init__.py +134 -134
- webscout/zeroart/effects.py +100 -100
- webscout/zeroart/fonts.py +1238 -1238
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/METADATA +160 -35
- webscout-8.3.dist-info/RECORD +290 -0
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/WHEEL +1 -1
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/entry_points.txt +1 -0
- webscout/Litlogger/Readme.md +0 -175
- webscout/Litlogger/core/__init__.py +0 -6
- webscout/Litlogger/core/level.py +0 -23
- webscout/Litlogger/core/logger.py +0 -165
- webscout/Litlogger/handlers/__init__.py +0 -12
- webscout/Litlogger/handlers/console.py +0 -33
- webscout/Litlogger/handlers/file.py +0 -143
- webscout/Litlogger/handlers/network.py +0 -173
- webscout/Litlogger/styles/__init__.py +0 -7
- webscout/Litlogger/styles/colors.py +0 -249
- webscout/Litlogger/styles/formats.py +0 -458
- webscout/Litlogger/styles/text.py +0 -87
- webscout/Litlogger/utils/__init__.py +0 -6
- webscout/Litlogger/utils/detectors.py +0 -153
- webscout/Litlogger/utils/formatters.py +0 -200
- webscout/Provider/ChatGPTGratis.py +0 -194
- webscout/Provider/TTI/AiForce/README.md +0 -159
- webscout/Provider/TTI/AiForce/__init__.py +0 -22
- webscout/Provider/TTI/AiForce/async_aiforce.py +0 -224
- webscout/Provider/TTI/AiForce/sync_aiforce.py +0 -245
- webscout/Provider/TTI/FreeAIPlayground/README.md +0 -99
- webscout/Provider/TTI/FreeAIPlayground/__init__.py +0 -9
- webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +0 -181
- webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +0 -180
- webscout/Provider/TTI/ImgSys/README.md +0 -174
- webscout/Provider/TTI/ImgSys/__init__.py +0 -23
- webscout/Provider/TTI/ImgSys/async_imgsys.py +0 -202
- webscout/Provider/TTI/ImgSys/sync_imgsys.py +0 -195
- webscout/Provider/TTI/MagicStudio/README.md +0 -101
- webscout/Provider/TTI/MagicStudio/__init__.py +0 -2
- webscout/Provider/TTI/MagicStudio/async_magicstudio.py +0 -111
- webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +0 -109
- webscout/Provider/TTI/Nexra/README.md +0 -155
- webscout/Provider/TTI/Nexra/__init__.py +0 -22
- webscout/Provider/TTI/Nexra/async_nexra.py +0 -286
- webscout/Provider/TTI/Nexra/sync_nexra.py +0 -258
- webscout/Provider/TTI/PollinationsAI/README.md +0 -146
- webscout/Provider/TTI/PollinationsAI/__init__.py +0 -23
- webscout/Provider/TTI/PollinationsAI/async_pollinations.py +0 -311
- webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +0 -265
- webscout/Provider/TTI/aiarta/README.md +0 -134
- webscout/Provider/TTI/aiarta/__init__.py +0 -2
- webscout/Provider/TTI/aiarta/async_aiarta.py +0 -482
- webscout/Provider/TTI/aiarta/sync_aiarta.py +0 -440
- webscout/Provider/TTI/artbit/README.md +0 -100
- webscout/Provider/TTI/artbit/__init__.py +0 -22
- webscout/Provider/TTI/artbit/async_artbit.py +0 -155
- webscout/Provider/TTI/artbit/sync_artbit.py +0 -148
- webscout/Provider/TTI/fastflux/README.md +0 -129
- webscout/Provider/TTI/fastflux/__init__.py +0 -22
- webscout/Provider/TTI/fastflux/async_fastflux.py +0 -261
- webscout/Provider/TTI/fastflux/sync_fastflux.py +0 -252
- webscout/Provider/TTI/huggingface/README.md +0 -114
- webscout/Provider/TTI/huggingface/__init__.py +0 -22
- webscout/Provider/TTI/huggingface/async_huggingface.py +0 -199
- webscout/Provider/TTI/huggingface/sync_huggingface.py +0 -195
- webscout/Provider/TTI/piclumen/README.md +0 -161
- webscout/Provider/TTI/piclumen/__init__.py +0 -23
- webscout/Provider/TTI/piclumen/async_piclumen.py +0 -268
- webscout/Provider/TTI/piclumen/sync_piclumen.py +0 -233
- webscout/Provider/TTI/pixelmuse/README.md +0 -79
- webscout/Provider/TTI/pixelmuse/__init__.py +0 -4
- webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +0 -249
- webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +0 -182
- webscout/Provider/TTI/talkai/README.md +0 -139
- webscout/Provider/TTI/talkai/__init__.py +0 -4
- webscout/Provider/TTI/talkai/async_talkai.py +0 -229
- webscout/Provider/TTI/talkai/sync_talkai.py +0 -207
- webscout/Provider/UNFINISHED/oivscode.py +0 -351
- webscout-8.2.8.dist-info/RECORD +0 -334
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/top_level.txt +0 -0
webscout/scout/README.md CHANGED

@@ -148,6 +148,7 @@ Scout provides powerful tools for navigating and manipulating HTML/XML documents
 - **Tree Traversal**: Navigate parent-child relationships and sibling elements
 - **Content Extraction**: Extract text, attributes, and structured data
 - **Document Manipulation**: Modify, replace, or remove elements
+- **Dynamic Building**: Easily append or insert new nodes
 
 ```python
 # CSS selector support
@@ -159,6 +160,7 @@ results = scout.find_all('a', attrs={'class': 'external', 'rel': 'nofollow'})
 # Tree traversal
 parent = element.find_parent('div')
 siblings = element.find_next_siblings('p')
+prev_sibling = element.find_previous_sibling('p')
 ```
 
 ### 🧠 Intelligent Analysis
@@ -363,7 +365,7 @@ For detailed API documentation, please refer to the [documentation](https://gith
 
 ## 🔧 Dependencies
 
-- `
+- `curl_cffi`: HTTP library used for web requests
 - `lxml`: XML and HTML processing library (optional, recommended)
 - `html5lib`: Standards-compliant HTML parser (optional)
 - `markdownify`: HTML to Markdown conversion
webscout/scout/core/crawler.py CHANGED

@@ -4,19 +4,26 @@ Scout Crawler Module
 
 import concurrent.futures
 import urllib.parse
-
-import
+import time
+import hashlib
+import re
+from urllib import robotparser
+from datetime import datetime
+from typing import Dict, List, Optional, Union
+from webscout.litagent import LitAgent
+from curl_cffi.requests import Session
 
 from .scout import Scout
 
+
 class ScoutCrawler:
     """
     Advanced web crawling utility for Scout library.
     """
-    def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None):
+    def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None, session: Optional[Session] = None, delay: float = 0.5, obey_robots: bool = True, allowed_domains: Optional[List[str]] = None):
         """
         Initialize the web crawler.
-
+
         Args:
             base_url (str): Starting URL to crawl
             max_pages (int, optional): Maximum number of pages to crawl
@@ -24,117 +31,180 @@ class ScoutCrawler:
         """
         self.base_url = base_url
         self.max_pages = max_pages
-        self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
+        self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
+            "script",
+            "style",
+            "header",
+            "footer",
+            "nav",
+            "aside",
+            "form",
+            "button",
+        ]
         self.visited_urls = set()
         self.crawled_pages = []
-
+        self.session = session or Session()
+        self.agent = LitAgent()
+        # Use all headers and generate fingerprint
+        self.session.headers = self.agent.generate_fingerprint()
+        self.session.headers.setdefault("User-Agent", self.agent.chrome())
+        self.delay = delay
+        self.obey_robots = obey_robots
+        self.allowed_domains = allowed_domains or [urllib.parse.urlparse(base_url).netloc]
+        self.last_request_time = 0
+        self.url_hashes = set()
+        if obey_robots:
+            self.robots = robotparser.RobotFileParser()
+            robots_url = urllib.parse.urljoin(base_url, '/robots.txt')
+            try:
+                self.robots.set_url(robots_url)
+                self.robots.read()
+            except Exception:
+                self.robots = None
+        else:
+            self.robots = None
+
+    def _normalize_url(self, url: str) -> str:
+        url = url.split('#')[0]
+        url = re.sub(r'\?.*$', '', url)  # Remove query params
+        return url.rstrip('/')
+
     def _is_valid_url(self, url: str) -> bool:
         """
         Check if a URL is valid and within the same domain.
-
+
         Args:
             url (str): URL to validate
-
+
         Returns:
             bool: Whether the URL is valid
         """
         try:
             parsed_base = urllib.parse.urlparse(self.base_url)
             parsed_url = urllib.parse.urlparse(url)
-
-
-
-
-
-
+            if parsed_url.scheme not in ["http", "https"]:
+                return False
+            if parsed_url.netloc not in self.allowed_domains:
+                return False
+            if self.obey_robots and self.robots:
+                return self.robots.can_fetch("*", url)
+            return True
         except Exception:
             return False
-
+
+    def _is_duplicate(self, url: str) -> bool:
+        norm = self._normalize_url(url)
+        url_hash = hashlib.md5(norm.encode()).hexdigest()
+        if url_hash in self.url_hashes:
+            return True
+        self.url_hashes.add(url_hash)
+        return False
+
+    def _extract_main_text(self, soup):
+        # Try to extract main content (simple heuristic)
+        main = soup.find('main')
+        if main:
+            return main.get_text(separator=" ", strip=True)
+        article = soup.find('article')
+        if article:
+            return article.get_text(separator=" ", strip=True)
+        # fallback to body
+        body = soup.find('body')
+        if body:
+            return body.get_text(separator=" ", strip=True)
+        return soup.get_text(separator=" ", strip=True)
+
     def _crawl_page(self, url: str, depth: int = 0) -> Dict[str, Union[str, List[str]]]:
         """
         Crawl a single page and extract information.
-
+
         Args:
             url (str): URL to crawl
             depth (int, optional): Current crawl depth
-
+
         Returns:
             Dict[str, Union[str, List[str]]]: Crawled page information
         """
-        if url in self.visited_urls:
+        if url in self.visited_urls or self._is_duplicate(url):
             return {}
-
+        # Throttle requests
+        now = time.time()
+        if self.last_request_time:
+            elapsed = now - self.last_request_time
+            if elapsed < self.delay:
+                time.sleep(self.delay - elapsed)
+        self.last_request_time = time.time()
         try:
-            response =
+            response = self.session.get(url, timeout=10)
             response.raise_for_status()
-
-
-
-            title_result = scout.find(
-            title = title_result[0].get_text() if title_result else
-
-
-
-
-                    tag.extract()
-
+            if not response.headers.get('Content-Type', '').startswith('text/html'):
+                return {}
+            scout = Scout(response.content, features="lxml")
+            title_result = scout.find("title")
+            title = title_result[0].get_text() if title_result else ""
+            for tag_name in self.tags_to_remove:
+                for tag in scout._soup.find_all(tag_name):
+                    tag.extract()
+            visible_text = self._extract_main_text(scout._soup)
             page_info = {
                 'url': url,
                 'title': title,
                 'links': [
-                    urllib.parse.urljoin(url, link.get('href'))
-                    for link in scout.find_all('a', href=True)
+                    urllib.parse.urljoin(url, link.get('href'))
+                    for link in scout.find_all('a', href=True)
                     if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
                 ],
                 'text': visible_text,
-                'depth': depth
+                'depth': depth,
+                'timestamp': datetime.utcnow().isoformat(),
+                'headers': dict(response.headers),
             }
-
             self.visited_urls.add(url)
             self.crawled_pages.append(page_info)
-
             return page_info
         except Exception as e:
             print(f"Error crawling {url}: {e}")
             return {}
-
-    def crawl(self)
+
+    def crawl(self):
         """
-        Start web crawling from base URL.
-
-
-
+        Start web crawling from base URL and yield each crawled page in real time.
+
+        Yields:
+            Dict[str, Union[str, List[str]]]: Crawled page information
         """
         with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
             futures = {executor.submit(self._crawl_page, self.base_url, 0)}
-
+            submitted_links: set[str] = set()
+
             while futures:
-
+                if len(self.visited_urls) >= self.max_pages:
+                    break
+                done, not_done = concurrent.futures.wait(
                     futures, return_when=concurrent.futures.FIRST_COMPLETED
                 )
-
+                futures = not_done
+
                 for future in done:
                     page_info = future.result()
-
+
+                    if page_info:
+                        yield page_info
+
                     if len(self.visited_urls) >= self.max_pages:
-
-
-
-                    for link in page_info.get('links', []):
+                        return
+
+                    for link in page_info.get("links", []):
                         if (
-                            len(self.visited_urls) < self.max_pages
-                            link not in self.visited_urls
+                            len(self.visited_urls) < self.max_pages
+                            and link not in self.visited_urls
+                            and link not in submitted_links
                         ):
-
-
-
-
-
-
-                            page_info.get('depth', 0) + 1
-                            )
+                            submitted_links.add(link)
+                            futures.add(
+                                executor.submit(
+                                    self._crawl_page,
+                                    link,
+                                    page_info.get("depth", 0) + 1,
                                 )
-
-                break
-
-        return self.crawled_pages
+                            )