PyPI - janito - Versions diffs - 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl - Mend

janito 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

janito/__init__.py +1 -1
janito/cli/agent/__init__.py +7 -0
janito/cli/agent/conversation.py +149 -0
janito/cli/agent/initialization.py +172 -0
janito/cli/agent/query.py +108 -0
janito/cli/agent.py +7 -282
janito/cli/app.py +105 -9
janito/cli/commands/__init__.py +12 -0
janito/cli/commands/config.py +242 -0
janito/cli/commands/history.py +119 -0
janito/cli/commands/profile.py +72 -0
janito/cli/commands/validation.py +24 -0
janito/cli/commands/workspace.py +31 -0
janito/cli/commands.py +9 -326
janito/config.py +37 -0
janito/data/instructions_template.txt +9 -5
janito/tools/__init__.py +8 -2
janito/tools/bash/bash.py +3 -1
janito/tools/bash/unix_persistent_bash.py +183 -181
janito/tools/bash/win_persistent_bash.py +4 -2
janito/tools/fetch_webpage/__init__.py +22 -33
janito/tools/fetch_webpage/core.py +182 -155
janito/tools/rich_console.py +46 -9
janito/tools/search_text.py +225 -238
janito/tools/str_replace_editor/handlers/str_replace.py +3 -1
janito/tools/str_replace_editor/handlers/view.py +14 -8
janito/tools/think.py +37 -0
janito/tools/usage_tracker.py +1 -0
janito-0.14.0.dist-info/METADATA +396 -0
janito-0.14.0.dist-info/RECORD +53 -0
janito/test_file.py +0 -4
janito/tools/fetch_webpage/chunking.py +0 -76
janito/tools/fetch_webpage/extractors.py +0 -276
janito/tools/fetch_webpage/news.py +0 -137
janito/tools/fetch_webpage/utils.py +0 -108
janito-0.12.0.dist-info/METADATA +0 -203
janito-0.12.0.dist-info/RECORD +0 -47
{janito-0.12.0.dist-info → janito-0.14.0.dist-info}/WHEEL +0 -0
{janito-0.12.0.dist-info → janito-0.14.0.dist-info}/entry_points.txt +0 -0
{janito-0.12.0.dist-info → janito-0.14.0.dist-info}/licenses/LICENSE +0 -0

janito/tools/fetch_webpage/core.py CHANGED Viewed

@@ -1,155 +1,182 @@
-"""
-Core functionality for fetching web pages and extracting content.
-"""
-import requests
-from typing import Tuple, List, Optional
-from urllib.parse import urlparse
-from janito.tools.rich_console import print_info, print_success, print_error, print_warning
-from janito.tools.usage_tracker import track_usage
-from janito.tools.fetch_webpage.extractors import extract_clean_text
-# Import moved to fetch_and_extract function to avoid circular imports
-from janito.tools.fetch_webpage.utils import SITE_SPECIFIC_STRATEGIES
-@track_usage('web_requests')
-def fetch_webpage(url: str, headers: dict = None, timeout: int = 30, max_size: int = 5000000,
-                 target_strings: List[str] = None) -> Tuple[str, bool]:
-    """
-    Fetch the content of a web page from a given URL.
-    Args:
-        url: The URL of the web page to fetch
-        headers: Optional HTTP headers to include in the request (default: None)
-        timeout: Request timeout in seconds (default: 30)
-        max_size: Maximum size in bytes to download (default: 5MB)
-        target_strings: Optional list of strings to target specific content sections
-    Returns:
-        A tuple containing (message, is_error)
-    """
-    print_info(f"Fetching content from URL: {url}", "Web Fetch")
-    try:
-        # Set default headers if none provided
-        if headers is None:
-            headers = {
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-            }
-        # Make the HTTP request with streaming enabled
-        response = requests.get(url, headers=headers, timeout=timeout, stream=True)
-        # Raise an exception for HTTP errors
-        response.raise_for_status()
-        # Check content length before downloading fully
-        content_length = response.headers.get('Content-Length')
-        if content_length and int(content_length) > max_size:
-            warning_msg = f"Web Fetch: Content size ({int(content_length)/1000000:.1f}MB) exceeds max size ({max_size/1000000:.1f}MB). Aborting download."
-            print_warning(warning_msg)
-            return warning_msg, True
-        # Download content with size limit
-        content_bytes = b''
-        for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
-            content_bytes += chunk
-            if len(content_bytes) > max_size:
-                warning_msg = f"Web Fetch: Download exceeded max size ({max_size/1000000:.1f}MB). Truncating."
-                print_warning(warning_msg)
-                break
-        # Get the content
-        content = content_bytes.decode('utf-8', errors='replace')
-        # If target strings are provided, extract only the relevant sections
-        if target_strings and len(target_strings) > 0:
-            print_info(f"Targeting specific content using {len(target_strings)} search strings", "Web Fetch")
-            from janito.tools.fetch_webpage.extractors import extract_targeted_content
-            targeted_content = extract_targeted_content(content, target_strings)
-            if targeted_content:
-                print_success(f"Successfully targeted specific content based on search strings", "Web Fetch")
-                # Create a summary with first 300 chars of targeted content
-                content_preview = targeted_content[:300] + "..." if len(targeted_content) > 300 else targeted_content
-                summary = f"Successfully fetched targeted content from {url}\n\nContent preview:\n{content_preview}"
-                print_success(f"Successfully fetched targeted content from {url} ({len(targeted_content)} bytes)", "Web Fetch")
-                return targeted_content, False
-            else:
-                print_warning(f"Web Fetch: Could not find content matching the target strings. Returning full content.")
-        # Create a summary message with first 300 chars of content
-        content_preview = content[:300] + "..." if len(content) > 300 else content
-        print_success(f"({len(content)} bytes)", "Web Fetch")
-        # Return the full content
-        return content, False
-    except requests.exceptions.RequestException as e:
-        error_msg = f"Error fetching web page: {str(e)}"
-        print_error(error_msg, "Web Fetch Error")
-        return error_msg, True
-@track_usage('web_content')
-def fetch_and_extract(url: str, extract_method: str = 'trafilatura',
-                     max_length: int = 10000,
-                     target_strings: List[str] = None) -> Tuple[str, bool]:
-    """
-    Fetch a webpage and extract its main content in a format suitable for LLM processing.
-    Args:
-        url: The URL to fetch
-        extract_method: Content extraction method ('trafilatura', 'newspaper', 'beautifulsoup', 'all')
-        max_length: Maximum length of text to return
-        target_strings: Optional list of strings to target specific content sections
-    Returns:
-        A tuple containing (extracted_content, is_error)
-    """
-    # Check if this is a news aggregator site that needs special handling
-    domain = urlparse(url).netloc
-    for site_domain in SITE_SPECIFIC_STRATEGIES.keys():
-        if site_domain in domain:
-            print_info(f"Detected news aggregator site: {domain}. Using specialized extraction.", "Content Extraction")
-            # Import here to avoid circular imports
-            from janito.tools.fetch_webpage.news import fetch_and_extract_news_aggregator
-            return fetch_and_extract_news_aggregator(url)
-    # If target strings are provided, pass them directly to fetch_webpage for efficiency
-    if target_strings and len(target_strings) > 0:
-        html_content, is_error = fetch_webpage(url, target_strings=target_strings)
-    else:
-        html_content, is_error = fetch_webpage(url)
-    if is_error:
-        return html_content, True
-    extracted_text = extract_clean_text(html_content, method=extract_method, url=url)
-    if not extracted_text or len(extracted_text) < 100:
-        return f"Could not extract meaningful content from {url}", True
-    # If target strings were provided but not already handled by fetch_webpage
-    if target_strings and len(target_strings) > 0 and not any(target in extracted_text for target in target_strings if len(target) > 3):
-        from janito.tools.fetch_webpage.extractors import extract_targeted_content
-        targeted_content = extract_targeted_content(html_content, target_strings)
-        if targeted_content:
-            print_success(f"Successfully extracted targeted content based on {len(target_strings)} search strings",
-                         "Targeted Extraction")
-            extracted_text = targeted_content
-    # Truncate if needed
-    if len(extracted_text) > max_length:
-        print_info(f"Truncating content from {len(extracted_text)} to {max_length} characters", "Content Extraction")
-        extracted_text = extracted_text[:max_length] + "..."
-    # Check if the content is still too large for an LLM (rough estimate)
-    estimated_tokens = len(extracted_text.split())
-    if estimated_tokens > 10000:  # Conservative estimate for token limits
-        print_warning(f"Content Extraction: Extracted content still very large (~{estimated_tokens} words). Consider using chunk_large_content()")
-    print_success(f"Successfully extracted {len(extracted_text)} characters of content", "Content Extraction")
-    return extracted_text, False
+"""
+Core functionality for fetching web pages and extracting content.
+"""
+import requests
+from typing import Tuple, List, Optional
+from urllib.parse import urlparse, unquote
+from janito.tools.rich_console import print_info, print_success, print_error, print_warning
+from janito.tools.usage_tracker import track_usage
+from bs4 import BeautifulSoup
+@track_usage('web_requests')
+def fetch_webpage(url: str, headers: dict = None, timeout: int = 30, max_size: int = 5000000) -> Tuple[str, bool]:
+    """
+    Fetch the content of a web page from a given URL.
+    Args:
+        url: The URL of the web page to fetch
+        headers: Optional HTTP headers to include in the request (default: None)
+        timeout: Request timeout in seconds (default: 30)
+        max_size: Maximum size in bytes to download (default: 5MB)
+    Returns:
+        A tuple containing (message, is_error)
+    """
+    print_info(f"Fetching content from URL: {url}", "Web Fetch")
+    try:
+        # Set default headers if none provided
+        if headers is None:
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                'Accept-Language': 'en-US,en;q=0.9',
+                'Accept-Encoding': 'gzip, deflate, br',
+                'Referer': 'https://www.google.com/',
+                'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                'Sec-Ch-Ua-Mobile': '?0',
+                'Sec-Ch-Ua-Platform': '"Windows"',
+                'Sec-Fetch-Dest': 'document',
+                'Sec-Fetch-Mode': 'navigate',
+                'Sec-Fetch-Site': 'cross-site',
+                'Sec-Fetch-User': '?1',
+                'Upgrade-Insecure-Requests': '1'
+            }
+        # Make the HTTP request with streaming enabled
+        response = requests.get(url, headers=headers, timeout=timeout, stream=True)
+        # Raise an exception for HTTP errors
+        response.raise_for_status()
+        # Check content length before downloading fully
+        content_length = response.headers.get('Content-Length')
+        if content_length and int(content_length) > max_size:
+            warning_msg = f"Web Fetch: Content size ({int(content_length)/1000000:.1f}MB) exceeds max size ({max_size/1000000:.1f}MB). Aborting download."
+            print_warning(warning_msg)
+            return warning_msg, True
+        # Download content with size limit
+        content_bytes = b''
+        for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
+            content_bytes += chunk
+            if len(content_bytes) > max_size:
+                warning_msg = f"Web Fetch: Download exceeded max size ({max_size/1000000:.1f}MB). Truncating."
+                print_warning(warning_msg)
+                break
+        # Get the content
+        content = content_bytes.decode('utf-8', errors='replace')
+        print_success(f"Successfully fetched content ({len(content)} bytes)", "Web Fetch")
+        # Return the full content
+        return content, False
+    except requests.exceptions.RequestException as e:
+        error_msg = f"Error fetching web page: {str(e)}"
+        print_error(error_msg, "Web Fetch Error")
+        return error_msg, True
+@track_usage('web_content')
+def fetch_and_extract(url: str, max_length: int = 10000, keywords: List[str] = None) -> Tuple[str, bool]:
+    """
+    Fetch a webpage and extract its main content using BeautifulSoup.
+    Args:
+        url: The URL to fetch
+        max_length: Maximum length of text to return
+        keywords: Optional list of URL-encoded keywords to prioritize content containing these terms
+    Returns:
+        A tuple containing (extracted_content, is_error)
+    """
+    html_content, is_error = fetch_webpage(url)
+    if is_error:
+        return html_content, True
+    try:
+        # Use BeautifulSoup to parse and extract content
+        soup = BeautifulSoup(html_content, 'html.parser')
+        # Remove script, style, and other non-content elements
+        for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
+            element.decompose()
+        # URL-decode keywords if provided
+        decoded_keywords = []
+        if keywords:
+            decoded_keywords = [unquote(keyword).lower() for keyword in keywords]
+            print_info(f"Prioritizing content with keywords: {', '.join(decoded_keywords)}", "Content Extraction")
+        # Extract text from main content elements
+        paragraphs = []
+        keyword_paragraphs = []
+        for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'article', 'section', 'div']):
+            text = tag.get_text(strip=True)
+            if text and len(text) > 20:  # Skip very short pieces that might be UI elements
+                # Check if the paragraph contains any of the keywords
+                if decoded_keywords and any(keyword in text.lower() for keyword in decoded_keywords):
+                    keyword_paragraphs.append(text)
+                else:
+                    paragraphs.append(text)
+        # Join paragraphs, prioritizing those with keywords
+        if keyword_paragraphs:
+            print_info(f"Found {len(keyword_paragraphs)} paragraphs containing keywords", "Content Extraction")
+            extracted_text = "\n\n".join(keyword_paragraphs + paragraphs)
+        else:
+            extracted_text = "\n\n".join(paragraphs)
+        # If no paragraphs found, fall back to all text
+        if not extracted_text or len(extracted_text) < 100:
+            extracted_text = soup.get_text(separator='\n\n')
+        # Clean up extra whitespace
+        extracted_text = ' '.join(extracted_text.split())
+        extracted_text = extracted_text.replace('. ', '.\n\n')
+        # Truncate if needed
+        if len(extracted_text) > max_length:
+            print_info(f"Truncating content from {len(extracted_text)} to {max_length} characters", "Content Extraction")
+            extracted_text = extracted_text[:max_length] + "..."
+        print_success(f"Successfully extracted {len(extracted_text)} characters of content", "Content Extraction")
+        return extracted_text, False
+    except Exception as e:
+        error_msg = f"Error extracting content: {str(e)}"
+        print_error(error_msg, "Content Extraction Error")
+        return error_msg, True
+def chunk_content(content: str, chunk_size: int = 2000, overlap: int = 200) -> List[str]:
+    """
+    Split content into overlapping chunks of a specified size.
+    Args:
+        content: The text content to chunk
+        chunk_size: Maximum size of each chunk
+        overlap: Number of characters to overlap between chunks
+    Returns:
+        List of text chunks
+    """
+    if not content:
+        return []
+    chunks = []
+    # Simple chunking with overlap
+    for i in range(0, len(content), chunk_size - overlap):
+        chunk_end = min(i + chunk_size, len(content))
+        chunks.append(content[i:chunk_end])
+        if chunk_end == len(content):
+            break
+    print_success(f"Content successfully chunked into {len(chunks)} parts", "Content Chunking")
+    return chunks

janito/tools/rich_console.py CHANGED Viewed

@@ -4,6 +4,7 @@ Utility module for rich console printing in tools.
 from rich.console import Console
 from rich.text import Text
 from typing import Optional
+from janito.config import get_config
 # Create a shared console instance
 console = Console()
@@ -16,6 +17,9 @@ def print_info(message: str, title: Optional[str] = None):
         message: The message to print
         title: Optional title for the panel
     """
+    # Skip printing if trust mode is enabled
+    if get_config().trust_mode:
+        return
     # Map titles to specific icons
     icon_map = {
         # File operations
@@ -82,20 +86,22 @@ def print_info(message: str, title: Optional[str] = None):
             elif "Undoing last edit" in title:
                 icon = "↩️"  # Undo icon
+    # Add indentation to all tool messages
+    indent = "    "
     text = Text(message)
     if title:
         # Special case for Bash Run commands
         if title == "Bash Run":
             console.print("\n" + "-"*50)
-            console.print(f"{icon} {title}", style="bold white on blue")
+            console.print(f"{indent}{icon} {title}", style="bold white on blue")
             console.print("-"*50)
-            console.print(f"$ {text}", style="white on dark_blue")
+            console.print(f"{indent}$ {text}", style="white on dark_blue")
             # Make sure we're not returning anything
             return
         else:
-            console.print(f"{icon} {message}", style="blue", end="")
+            console.print(f"{indent}{icon} {message}", style="blue", end="")
     else:
-        console.print(f"{icon} {text}", style="blue", end="")
+        console.print(f"{indent}{icon} {text}", style="blue", end="")
 def print_success(message: str, title: Optional[str] = None):
     """
@@ -105,6 +111,9 @@ def print_success(message: str, title: Optional[str] = None):
         message: The message to print
         title: Optional title for the panel
     """
+    # Skip printing if trust mode is enabled
+    if get_config().trust_mode:
+        return
     text = Text(message)
     if title:
         console.print(f" ✅ {message}", style="green")
@@ -114,26 +123,54 @@ def print_success(message: str, title: Optional[str] = None):
 def print_error(message: str, title: Optional[str] = None):
     """
     Print an error message with rich formatting.
+    In trust mode, error messages are suppressed.
     Args:
         message: The message to print
         title: Optional title for the panel
     """
+    # Skip printing if trust mode is enabled
+    if get_config().trust_mode:
+        return
     text = Text(message)
-    if title:
-        # Special case for File View - print without header
+    # Check if message starts with question mark emoji (❓)
+    # If it does, use warning styling (yellow) instead of error styling (red)
+    starts_with_question_mark = message.startswith("❓")
+    if starts_with_question_mark:
+        # Use warning styling for question mark emoji errors
+        # For question mark emoji errors, don't include the title (like "Error")
+        # Just print the message with the emoji
         if title == "File View":
-            console.print(f"\n ❌ {message}", style="red")
+            console.print(f"\n {message}", style="yellow")
         else:
-            console.print(f"❌ {title} {text}")
+            console.print(f"{message}", style="yellow")
     else:
-        console.print(f"\n❌ {text}", style="red")
+        # Regular error styling
+        if title:
+            # Special case for File View - print without header
+            if title == "File View":
+                console.print(f"\n ❌ {message}", style="red")
+            # Special case for Search Error
+            elif title == "Search Error":
+                console.print(f"❌ {message}", style="red")
+            else:
+                console.print(f"❌ {title} {text}", style="red")
+        else:
+            console.print(f"\n❌ {text}", style="red")
 def print_warning(message: str):
     """
     Print a warning message with rich formatting.
+    In trust mode, warning messages are suppressed.
     Args:
         message: The message to print
     """
+    # Skip printing if trust mode is enabled
+    if get_config().trust_mode:
+        return
     console.print(f"⚠️  {message}", style="yellow")

janito 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

janito 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl