PyPI - code-puppy - Versions diffs - 0.0.356__py3-none-any.whl → 0.0.357__py3-none-any.whl - Mend

code-puppy 0.0.356__py3-none-any.whl → 0.0.357__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

code_puppy/agents/agent_qa_kitten.py +10 -5
code_puppy/agents/agent_terminal_qa.py +323 -0
code_puppy/api/app.py +79 -2
code_puppy/api/routers/commands.py +21 -2
code_puppy/api/routers/sessions.py +49 -8
code_puppy/config.py +5 -2
code_puppy/tools/__init__.py +37 -0
code_puppy/tools/agent_tools.py +26 -1
code_puppy/tools/browser/__init__.py +41 -0
code_puppy/tools/browser/browser_control.py +6 -6
code_puppy/tools/browser/browser_interactions.py +21 -20
code_puppy/tools/browser/browser_locators.py +9 -9
code_puppy/tools/browser/browser_navigation.py +7 -7
code_puppy/tools/browser/browser_screenshot.py +60 -135
code_puppy/tools/browser/browser_screenshot_vqa.py +195 -0
code_puppy/tools/browser/browser_scripts.py +15 -13
code_puppy/tools/browser/camoufox_manager.py +226 -64
code_puppy/tools/browser/chromium_terminal_manager.py +259 -0
code_puppy/tools/browser/terminal_command_tools.py +521 -0
code_puppy/tools/browser/terminal_screenshot_tools.py +520 -0
code_puppy/tools/browser/terminal_tools.py +525 -0
code_puppy/tools/browser/vqa_agent.py +138 -34
code_puppy/tools/command_runner.py +0 -1
{code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/METADATA +1 -1
{code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/RECORD +30 -24
{code_puppy-0.0.356.data → code_puppy-0.0.357.data}/data/code_puppy/models.json +0 -0
{code_puppy-0.0.356.data → code_puppy-0.0.357.data}/data/code_puppy/models_dev_api.json +0 -0
{code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/WHEEL +0 -0
{code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/entry_points.txt +0 -0
{code_puppy-0.0.356.dist-info → code_puppy-0.0.357.dist-info}/licenses/LICENSE +0 -0

code_puppy/tools/browser/browser_screenshot.py CHANGED Viewed

@@ -1,19 +1,21 @@
-"""Screenshot and visual analysis tool with VQA capabilities."""
+"""Screenshot tool for browser automation.
-import asyncio
+Captures screenshots and returns them as base64 data that multimodal
+models can directly see and analyze - no separate VQA agent needed.
+"""
+import base64
 from datetime import datetime
 from pathlib import Path
 from tempfile import gettempdir, mkdtemp
 from typing import Any, Dict, Optional
-from pydantic import BaseModel
 from pydantic_ai import RunContext
 from code_puppy.messaging import emit_error, emit_info, emit_success
 from code_puppy.tools.common import generate_group_id
-from .camoufox_manager import get_camoufox_manager
-from .vqa_agent import run_vqa_analysis
+from .camoufox_manager import get_session_browser_manager
 _TEMP_SCREENSHOT_ROOT = Path(
     mkdtemp(prefix="code_puppy_screenshots_", dir=gettempdir())
@@ -21,21 +23,11 @@ _TEMP_SCREENSHOT_ROOT = Path(
 def _build_screenshot_path(timestamp: str) -> Path:
-    """Return the target path for a screenshot using a shared temp directory."""
+    """Return the target path for a screenshot."""
     filename = f"screenshot_{timestamp}.png"
     return _TEMP_SCREENSHOT_ROOT / filename
-class ScreenshotResult(BaseModel):
-    """Result from screenshot operation."""
-    success: bool
-    screenshot_path: Optional[str] = None
-    screenshot_data: Optional[bytes] = None
-    timestamp: Optional[str] = None
-    error: Optional[str] = None
 async def _capture_screenshot(
     page,
     full_page: bool = False,
@@ -45,41 +37,38 @@ async def _capture_screenshot(
 ) -> Dict[str, Any]:
     """Internal screenshot capture function."""
     try:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
         # Take screenshot
         if element_selector:
-            # Screenshot specific element
             element = await page.locator(element_selector).first
             if not await element.is_visible():
                 return {
                     "success": False,
                     "error": f"Element '{element_selector}' is not visible",
                 }
-            screenshot_data = await element.screenshot()
+            screenshot_bytes = await element.screenshot()
         else:
-            # Screenshot page or full page
-            screenshot_data = await page.screenshot(full_page=full_page)
+            screenshot_bytes = await page.screenshot(full_page=full_page)
-        result = {
+        result: Dict[str, Any] = {
             "success": True,
-            "screenshot_data": screenshot_data,
+            "screenshot_bytes": screenshot_bytes,
+            "base64_data": base64.b64encode(screenshot_bytes).decode("utf-8"),
             "timestamp": timestamp,
         }
         if save_screenshot:
             screenshot_path = _build_screenshot_path(timestamp)
             screenshot_path.parent.mkdir(parents=True, exist_ok=True)
             with open(screenshot_path, "wb") as f:
-                f.write(screenshot_data)
+                f.write(screenshot_bytes)
             result["screenshot_path"] = str(screenshot_path)
-            message = f"Screenshot saved: {screenshot_path}"
             if group_id:
-                emit_success(message, message_group=group_id)
-            else:
-                emit_success(message)
+                emit_success(
+                    f"Screenshot saved: {screenshot_path}", message_group=group_id
+                )
         return result
@@ -87,46 +76,43 @@ async def _capture_screenshot(
         return {"success": False, "error": str(e)}
-async def take_screenshot_and_analyze(
-    question: str,
+async def take_screenshot(
     full_page: bool = False,
     element_selector: Optional[str] = None,
     save_screenshot: bool = True,
 ) -> Dict[str, Any]:
-    """
-    Take a screenshot and analyze it using visual understanding.
+    """Take a screenshot of the browser page.
+    Returns the screenshot as base64-encoded PNG data that multimodal
+    models can directly see and analyze.
     Args:
-        question: The specific question to ask about the screenshot
-        full_page: Whether to capture the full page or just viewport
-        element_selector: Optional selector to screenshot just a specific element
-        save_screenshot: Whether to save the screenshot to disk
+        full_page: Whether to capture full page or just viewport.
+        element_selector: Optional selector to screenshot specific element.
+        save_screenshot: Whether to save the screenshot to disk.
     Returns:
-        Dict containing analysis results and screenshot info
+        Dict containing:
+            - success (bool): True if screenshot was captured.
+            - base64_image (str): Base64-encoded PNG image data.
+            - media_type (str): Always "image/png".
+            - screenshot_path (str): Path to saved file (if saved).
+            - error (str): Error message if unsuccessful.
     """
     target = element_selector or ("full_page" if full_page else "viewport")
-    group_id = generate_group_id(
-        "browser_screenshot_analyze", f"{question[:50]}_{target}"
-    )
-    emit_info(
-        f"BROWSER SCREENSHOT ANALYZE 📷 question='{question[:100]}{'...' if len(question) > 100 else ''}' target={target}",
-        message_group=group_id,
-    )
+    group_id = generate_group_id("browser_screenshot", target)
+    emit_info(f"BROWSER SCREENSHOT 📷 target={target}", message_group=group_id)
     try:
-        # Get the current browser page
-        browser_manager = get_camoufox_manager()
+        browser_manager = get_session_browser_manager()
         page = await browser_manager.get_current_page()
         if not page:
-            return {
-                "success": False,
-                "error": "No active browser page available. Please navigate to a webpage first.",
-                "question": question,
-            }
+            error_msg = "No active browser page. Navigate to a webpage first."
+            emit_error(error_msg, message_group=group_id)
+            return {"success": False, "error": error_msg}
-        # Take screenshot
-        screenshot_result = await _capture_screenshot(
+        result = await _capture_screenshot(
             page,
             full_page=full_page,
             element_selector=element_selector,
@@ -134,108 +120,47 @@ async def take_screenshot_and_analyze(
             group_id=group_id,
         )
-        if not screenshot_result["success"]:
-            error_message = screenshot_result.get("error", "Screenshot failed")
-            emit_error(
-                f"Screenshot capture failed: {error_message}",
-                message_group=group_id,
-            )
-            return {
-                "success": False,
-                "error": error_message,
-                "question": question,
-            }
-        screenshot_bytes = screenshot_result.get("screenshot_data")
-        if not screenshot_bytes:
-            emit_error(
-                "Screenshot captured but pixel data missing; cannot run visual analysis.",
-                message_group=group_id,
-            )
-            return {
-                "success": False,
-                "error": "Screenshot captured but no image bytes available for analysis.",
-                "question": question,
-            }
-        try:
-            vqa_result = await asyncio.to_thread(
-                run_vqa_analysis,
-                question,
-                screenshot_bytes,
-            )
-        except Exception as exc:
-            emit_error(
-                f"Visual question answering failed: {exc}",
-                message_group=group_id,
-            )
-            return {
-                "success": False,
-                "error": f"Visual analysis failed: {exc}",
-                "question": question,
-                "screenshot_info": {
-                    "path": screenshot_result.get("screenshot_path"),
-                    "timestamp": screenshot_result.get("timestamp"),
-                    "full_page": full_page,
-                    "element_selector": element_selector,
-                },
-            }
-        emit_success(
-            f"Visual analysis answer: {vqa_result.answer}",
-            message_group=group_id,
-        )
-        emit_info(
-            f"Observations: {vqa_result.observations}",
-            message_group=group_id,
-        )
+        if not result["success"]:
+            emit_error(result.get("error", "Screenshot failed"), message_group=group_id)
+            return result
         return {
             "success": True,
-            "question": question,
-            "answer": vqa_result.answer,
-            "confidence": vqa_result.confidence,
-            "observations": vqa_result.observations,
-            "screenshot_info": {
-                "path": screenshot_result.get("screenshot_path"),
-                "size": len(screenshot_bytes),
-                "timestamp": screenshot_result.get("timestamp"),
-                "full_page": full_page,
-                "element_selector": element_selector,
-            },
+            "base64_image": result["base64_data"],
+            "media_type": "image/png",
+            "screenshot_path": result.get("screenshot_path"),
+            "message": "Screenshot captured. The base64_image contains the browser view.",
         }
     except Exception as e:
-        emit_error(f"Screenshot analysis failed: {str(e)}", message_group=group_id)
-        return {"success": False, "error": str(e), "question": question}
+        error_msg = f"Screenshot failed: {str(e)}"
+        emit_error(error_msg, message_group=group_id)
+        return {"success": False, "error": error_msg}
 def register_take_screenshot_and_analyze(agent):
-    """Register the screenshot analysis tool."""
+    """Register the screenshot tool."""
     @agent.tool
     async def browser_screenshot_analyze(
         context: RunContext,
-        question: str,
         full_page: bool = False,
         element_selector: Optional[str] = None,
-        save_screenshot: bool = True,
     ) -> Dict[str, Any]:
         """
-        Take a screenshot and analyze it to answer a specific question.
+        Take a screenshot of the browser page.
+        Returns the screenshot as base64 image data that you can see directly.
+        Use this to see what's displayed in the browser.
         Args:
-            question: The specific question to ask about the screenshot
-            full_page: Whether to capture the full page or just viewport
-            element_selector: Optional CSS/XPath selector to screenshot specific element
-            save_screenshot: Whether to save the screenshot to disk
+            full_page: Capture full page (True) or just viewport (False).
+            element_selector: Optional CSS selector to screenshot specific element.
         Returns:
-            Dict with analysis results including answer, confidence, and observations
+            Dict with base64_image (PNG data you can see), screenshot_path, etc.
         """
-        return await take_screenshot_and_analyze(
-            question=question,
+        return await take_screenshot(
             full_page=full_page,
             element_selector=element_selector,
-            save_screenshot=save_screenshot,
         )

code_puppy/tools/browser/browser_screenshot_vqa.py ADDED Viewed

@@ -0,0 +1,195 @@
+"""VQA-based Screenshot tool for browser automation (qa-kitten).
+This module provides screenshot analysis using a dedicated VQA agent.
+Unlike browser_screenshot.py which returns raw base64 bytes for multimodal
+models to see directly, this version offloads the visual analysis to a
+separate VQA agent, helping manage context in the calling agent.
+Use this for qa-kitten where context management is important.
+Use browser_screenshot.py for terminal-qa where direct image viewing is needed.
+"""
+from typing import Any, Dict, Optional
+from pydantic_ai import RunContext
+from rich.console import Console
+from code_puppy.messaging import emit_error, emit_info, emit_success
+from code_puppy.tools.common import generate_group_id
+from .browser_screenshot import _capture_screenshot
+from .camoufox_manager import get_session_browser_manager
+from .vqa_agent import run_vqa_analysis_stream
+async def take_screenshot_and_analyze(
+    question: str,
+    full_page: bool = False,
+    element_selector: Optional[str] = None,
+    save_screenshot: bool = True,
+) -> Dict[str, Any]:
+    """Take a screenshot and analyze it using the VQA agent.
+    This function captures a screenshot and passes it to a dedicated
+    VQA (Visual Question Answering) agent for analysis. The VQA agent
+    runs separately, keeping the image analysis out of the calling
+    agent's context window.
+    Args:
+        question: The question to ask about the screenshot.
+            Examples:
+            - "What buttons are visible on this page?"
+            - "Is there an error message displayed?"
+            - "What is the main heading text?"
+            - "Describe the layout of this form."
+        full_page: Whether to capture full page or just viewport.
+            Defaults to False (viewport only).
+        element_selector: Optional CSS selector to screenshot a specific
+            element instead of the whole page.
+        save_screenshot: Whether to save the screenshot to disk.
+    Returns:
+        Dict containing:
+            - success (bool): True if analysis succeeded.
+            - answer (str): The VQA agent's streamed answer to your question.
+            - screenshot_info (dict): Path, timestamp, and other metadata.
+            - error (str): Error message if unsuccessful.
+    """
+    target = element_selector or ("full_page" if full_page else "viewport")
+    group_id = generate_group_id(
+        "browser_screenshot_analyze", f"{question[:50]}_{target}"
+    )
+    emit_info(
+        f"BROWSER SCREENSHOT ANALYZE 📷 question='{question[:100]}{'...' if len(question) > 100 else ''}' target={target}",
+        message_group=group_id,
+    )
+    try:
+        # Get the browser page
+        browser_manager = get_session_browser_manager()
+        page = await browser_manager.get_current_page()
+        if not page:
+            error_msg = "No active browser page. Navigate to a webpage first."
+            emit_error(error_msg, message_group=group_id)
+            return {"success": False, "error": error_msg, "question": question}
+        # Capture the screenshot
+        screenshot_result = await _capture_screenshot(
+            page,
+            full_page=full_page,
+            element_selector=element_selector,
+            save_screenshot=save_screenshot,
+            group_id=group_id,
+        )
+        if not screenshot_result["success"]:
+            error_msg = screenshot_result.get("error", "Screenshot failed")
+            emit_error(
+                f"Screenshot capture failed: {error_msg}", message_group=group_id
+            )
+            return {"success": False, "error": error_msg, "question": question}
+        screenshot_bytes = screenshot_result.get("screenshot_bytes")
+        if not screenshot_bytes:
+            emit_error(
+                "Screenshot captured but pixel data missing; cannot run visual analysis.",
+                message_group=group_id,
+            )
+            return {
+                "success": False,
+                "error": "Screenshot captured but no image bytes available for analysis.",
+                "question": question,
+            }
+        # Run VQA analysis with streaming output
+        try:
+            console = Console()
+            console.print()  # Newline before streaming starts
+            console.print("[bold cyan]🔍 VQA Analysis:[/bold cyan]")
+            vqa_answer = await run_vqa_analysis_stream(
+                question,
+                screenshot_bytes,
+            )
+        except Exception as exc:
+            emit_error(
+                f"Visual question answering failed: {exc}",
+                message_group=group_id,
+            )
+            return {
+                "success": False,
+                "error": f"Visual analysis failed: {exc}",
+                "question": question,
+                "screenshot_info": {
+                    "path": screenshot_result.get("screenshot_path"),
+                    "timestamp": screenshot_result.get("timestamp"),
+                    "full_page": full_page,
+                    "element_selector": element_selector,
+                },
+            }
+        emit_success(
+            "Visual analysis complete",
+            message_group=group_id,
+        )
+        return {
+            "success": True,
+            "question": question,
+            "answer": vqa_answer,
+            "screenshot_info": {
+                "path": screenshot_result.get("screenshot_path"),
+                "size": len(screenshot_bytes),
+                "timestamp": screenshot_result.get("timestamp"),
+                "full_page": full_page,
+                "element_selector": element_selector,
+            },
+        }
+    except Exception as e:
+        error_msg = f"Screenshot analysis failed: {str(e)}"
+        emit_error(error_msg, message_group=group_id)
+        return {"success": False, "error": error_msg, "question": question}
+def register_take_screenshot_and_analyze_vqa(agent):
+    """Register the VQA-based screenshot tool.
+    This tool takes a screenshot and analyzes it using a separate VQA agent.
+    Use this for agents where context management is important (like qa-kitten).
+    """
+    @agent.tool
+    async def browser_screenshot_vqa(
+        context: RunContext,
+        question: str,
+        full_page: bool = False,
+        element_selector: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """
+        Take a screenshot and analyze it with VQA.
+        Captures a screenshot of the browser and uses a visual AI to
+        answer your question about what's visible on the page.
+        Args:
+            question: What you want to know about the screenshot.
+                Examples:
+                - "What buttons are visible?"
+                - "Is there an error message?"
+                - "What is the page title?"
+                - "Is the form filled out correctly?"
+            full_page: Capture full page (True) or just viewport (False).
+            element_selector: Optional CSS selector to screenshot specific element.
+        Returns:
+            Dict with:
+            - answer: The streamed answer to your question
+            - screenshot_info: Where the screenshot was saved, etc.
+        """
+        return await take_screenshot_and_analyze(
+            question=question,
+            full_page=full_page,
+            element_selector=element_selector,
+        )

code_puppy/tools/browser/browser_scripts.py CHANGED Viewed

@@ -7,7 +7,7 @@ from pydantic_ai import RunContext
 from code_puppy.messaging import emit_error, emit_info, emit_success
 from code_puppy.tools.common import generate_group_id
-from .camoufox_manager import get_camoufox_manager
+from .camoufox_manager import get_session_browser_manager
 async def execute_javascript(
@@ -21,14 +21,16 @@ async def execute_javascript(
         message_group=group_id,
     )
     try:
-        browser_manager = get_camoufox_manager()
+        browser_manager = get_session_browser_manager()
         page = await browser_manager.get_current_page()
         if not page:
             return {"success": False, "error": "No active browser page available"}
         # Execute JavaScript
-        result = await page.evaluate(script, timeout=timeout)
+        # Note: page.evaluate() does NOT accept a timeout parameter
+        # The timeout arg to this function is kept for API compatibility but unused
+        result = await page.evaluate(script)
         emit_success("JavaScript executed successfully", message_group=group_id)
@@ -52,7 +54,7 @@ async def scroll_page(
         message_group=group_id,
     )
     try:
-        browser_manager = get_camoufox_manager()
+        browser_manager = get_session_browser_manager()
         page = await browser_manager.get_current_page()
         if not page:
@@ -60,7 +62,7 @@ async def scroll_page(
         if element_selector:
             # Scroll specific element
-            element = page.locator(element_selector)
+            element = page.locator(element_selector).first
             await element.scroll_into_view_if_needed()
             # Get element's current scroll position and dimensions
@@ -146,13 +148,13 @@ async def scroll_to_element(
         message_group=group_id,
     )
     try:
-        browser_manager = get_camoufox_manager()
+        browser_manager = get_session_browser_manager()
         page = await browser_manager.get_current_page()
         if not page:
             return {"success": False, "error": "No active browser page available"}
-        element = page.locator(selector)
+        element = page.locator(selector).first
         await element.wait_for(state="attached", timeout=timeout)
         await element.scroll_into_view_if_needed()
@@ -178,7 +180,7 @@ async def set_viewport_size(
         message_group=group_id,
     )
     try:
-        browser_manager = get_camoufox_manager()
+        browser_manager = get_session_browser_manager()
         page = await browser_manager.get_current_page()
         if not page:
@@ -209,13 +211,13 @@ async def wait_for_element(
         message_group=group_id,
     )
     try:
-        browser_manager = get_camoufox_manager()
+        browser_manager = get_session_browser_manager()
         page = await browser_manager.get_current_page()
         if not page:
             return {"success": False, "error": "No active browser page available"}
-        element = page.locator(selector)
+        element = page.locator(selector).first
         await element.wait_for(state=state, timeout=timeout)
         emit_success(f"Element {selector} is now {state}", message_group=group_id)
@@ -240,13 +242,13 @@ async def highlight_element(
         message_group=group_id,
     )
     try:
-        browser_manager = get_camoufox_manager()
+        browser_manager = get_session_browser_manager()
         page = await browser_manager.get_current_page()
         if not page:
             return {"success": False, "error": "No active browser page available"}
-        element = page.locator(selector)
+        element = page.locator(selector).first
         await element.wait_for(state="visible", timeout=timeout)
         # Add highlight style
@@ -277,7 +279,7 @@ async def clear_highlights() -> Dict[str, Any]:
         message_group=group_id,
     )
     try:
-        browser_manager = get_camoufox_manager()
+        browser_manager = get_session_browser_manager()
         page = await browser_manager.get_current_page()
         if not page:

code-puppy 0.0.356__py3-none-any.whl → 0.0.357__py3-none-any.whl

code-puppy 0.0.356__py3-none-any.whl → 0.0.357__py3-none-any.whl