PyPI - connectonion - Versions diffs - 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl - Mend

connectonion 0.6.0py3-none-any.whl → 0.6.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

connectonion/__init__.py +3 -2
connectonion/cli/browser_agent/browser.py +433 -147
connectonion/cli/browser_agent/element_finder.py +139 -0
connectonion/cli/browser_agent/highlight_screenshot.py +174 -0
connectonion/cli/browser_agent/prompt.md +188 -105
connectonion/cli/browser_agent/prompts/element_matcher.md +59 -0
connectonion/cli/browser_agent/prompts/form_filler.md +19 -0
connectonion/cli/browser_agent/prompts/scroll_strategy.md +36 -0
connectonion/cli/browser_agent/scripts/extract_elements.js +126 -0
connectonion/cli/browser_agent/scroll.py +137 -0
connectonion/cli/commands/eval_commands.py +286 -0
connectonion/cli/main.py +11 -0
connectonion/console.py +5 -5
connectonion/core/agent.py +13 -10
connectonion/core/llm.py +9 -19
connectonion/logger.py +305 -135
connectonion/network/__init__.py +3 -0
connectonion/network/asgi.py +122 -2
connectonion/network/connection.py +123 -0
connectonion/network/host.py +7 -5
connectonion/useful_plugins/__init__.py +4 -3
connectonion/useful_plugins/ui_stream.py +164 -0
{connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/METADATA +1 -1
{connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/RECORD +27 -17
/connectonion/{static → network/static}/docs.html +0 -0
{connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/WHEEL +0 -0
{connectonion-0.6.0.dist-info → connectonion-0.6.2.dist-info}/entry_points.txt +0 -0

connectonion/cli/browser_agent/element_finder.py ADDED Viewed

@@ -0,0 +1,139 @@
+"""
+Element Finder - Find interactive elements by natural language description.
+Inspired by browser-use (https://github.com/browser-use/browser-use).
+Architecture:
+1. JavaScript injects `data-browser-agent-id` into each interactive element
+2. LLM SELECTS from indexed element list, never GENERATES CSS selectors
+3. Pre-built locators are guaranteed to work
+Usage:
+    elements = extract_elements(page)
+    element = find_element(page, "the login button", elements)
+    page.locator(element.locator).click()
+"""
+from typing import List, Optional
+from pathlib import Path
+from pydantic import BaseModel, Field
+from connectonion import llm_do
+# Load JavaScript and prompt from files
+_BASE_DIR = Path(__file__).parent
+_EXTRACT_JS = (_BASE_DIR / "scripts" / "extract_elements.js").read_text()
+_ELEMENT_MATCHER_PROMPT = (_BASE_DIR / "prompts" / "element_matcher.md").read_text()
+class InteractiveElement(BaseModel):
+    """An interactive element on the page with pre-built locator.
+    Built by ``extract_elements`` from the records returned by the page-side
+    extraction script. Only ``index`` and ``tag`` are required; every other
+    field falls back to the default declared below.
+    """
+    # Number shown to the LLM as "[index]" and drawn as the label on highlighted screenshots.
+    index: int
+    # Tag name; compared against 'input' when formatting and used to pick the highlight color.
+    tag: str
+    # Descriptive attributes. In the LLM listing the label is the first non-empty of
+    # text, placeholder, aria_label.
+    text: str = ""
+    role: Optional[str] = None
+    aria_label: Optional[str] = None
+    placeholder: Optional[str] = None
+    # Only included in the LLM listing when tag is 'input'.
+    input_type: Optional[str] = None
+    href: Optional[str] = None
+    # Bounding box: the highlighter treats (x, y) as the top-left corner and
+    # width/height as the extent. Presumably screenshot pixel coordinates -
+    # TODO confirm against extract_elements.js.
+    x: int = 0
+    y: int = 0
+    width: int = 0
+    height: int = 0
+    # Selector string intended to be passed to page.locator(...).
+    locator: str = ""
+class ElementMatch(BaseModel):
+    """LLM's element selection result.
+    Passed as the structured ``output`` type to ``llm_do`` in ``find_element``.
+    All three fields are required.
+    """
+    # The "[index]" number of the chosen element, as listed in the prompt.
+    index: int = Field(..., description="Index of the matching element")
+    # NOTE(review): the 0-1 range is stated only in the description; no ge/le
+    # constraint enforces it.
+    confidence: float = Field(..., description="Confidence 0-1")
+    reasoning: str = Field(..., description="Why this element matches")
+def extract_elements(page) -> List[InteractiveElement]:
+    """Extract all interactive elements from the page.
+    Returns elements with:
+    - Bounding boxes (for position matching with screenshot)
+    - Pre-built Playwright locators (guaranteed to work)
+    - Text/aria/placeholder for LLM matching
+    """
+    raw = page.evaluate(_EXTRACT_JS)
+    return [InteractiveElement(**el) for el in raw]
+def format_elements_for_llm(elements: List[InteractiveElement], max_count: int = 150) -> str:
+    """Format elements as compact list for LLM context.
+    Format: [index] tag "text" pos=(x,y) {extra info}
+    """
+    lines = []
+    for el in elements[:max_count]:
+        parts = [f"[{el.index}]", el.tag]
+        if el.text:
+            parts.append(f'"{el.text}"')
+        elif el.placeholder:
+            parts.append(f'placeholder="{el.placeholder}"')
+        elif el.aria_label:
+            parts.append(f'aria="{el.aria_label}"')
+        parts.append(f"pos=({el.x},{el.y})")
+        if el.input_type and el.tag == 'input':
+            parts.append(f"type={el.input_type}")
+        if el.role:
+            parts.append(f"role={el.role}")
+        if el.href:
+            href_short = el.href.split('?')[0][-30:]
+            parts.append(f"href=...{href_short}")
+        lines.append(' '.join(parts))
+    return '\n'.join(lines)
+def find_element(
+    page,
+    description: str,
+    elements: List[InteractiveElement] = None
+) -> Optional[InteractiveElement]:
+    """Find an interactive element by natural language description.
+    This is the core function. LLM SELECTS from pre-built options.
+    Args:
+        page: Playwright page
+        description: Natural language like "the login button" or "email field"
+        elements: Pre-extracted elements (will extract if not provided)
+    Returns:
+        Matching InteractiveElement with pre-built locator, or None
+    """
+    if elements is None:
+        elements = extract_elements(page)
+    if not elements:
+        return None
+    element_list = format_elements_for_llm(elements)
+    # Build prompt from template
+    prompt = _ELEMENT_MATCHER_PROMPT.format(
+        description=description,
+        element_list=element_list
+    )
+    result = llm_do(
+        prompt,
+        output=ElementMatch,
+        model="co/gemini-2.5-flash",
+        temperature=0.1
+    )
+    if 0 <= result.index < len(elements):
+        return elements[result.index]
+    return None

connectonion/cli/browser_agent/highlight_screenshot.py ADDED Viewed

@@ -0,0 +1,174 @@
+"""
+Screenshot highlighting - draw bounding boxes and indices on screenshots.
+Inspired by browser-use's python_highlights.py approach.
+"""
+from PIL import Image, ImageDraw, ImageFont
+from pathlib import Path
+from typing import List
+import element_finder
+# Color scheme for different element types.
+# Looked up by an element's tag when drawing; tags not listed here use 'default'.
+ELEMENT_COLORS = {
+    'button': '#FF6B6B',  # Red
+    'input': '#4ECDC4',   # Teal
+    'select': '#45B7D1',  # Blue
+    'a': '#96CEB4',       # Green
+    'textarea': '#FF8C42', # Orange
+    'div': '#DDA0DD',     # Light purple
+    'span': '#FFD93D',    # Yellow
+    'default': '#9B59B6', # Purple
+}
+def get_font(size: int = 14):
+    """Get a cross-platform font."""
+    font_paths = [
+        '/System/Library/Fonts/Arial.ttf',  # macOS
+        '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',  # Linux
+        'C:\\Windows\\Fonts\\arial.ttf',  # Windows
+    ]
+    for path in font_paths:
+        try:
+            return ImageFont.truetype(path, size)
+        except OSError:
+            continue
+    return ImageFont.load_default()
+def draw_dashed_rect(draw: ImageDraw.Draw, bbox: tuple, color: str, dash: int = 4, gap: int = 4):
+    """Draw a dashed rectangle."""
+    x1, y1, x2, y2 = bbox
+    def draw_dashed_line(start, end, is_horizontal: bool):
+        if is_horizontal:
+            x, y = start
+            while x < end[0]:
+                end_x = min(x + dash, end[0])
+                draw.line([(x, y), (end_x, y)], fill=color, width=2)
+                x += dash + gap
+        else:
+            x, y = start
+            while y < end[1]:
+                end_y = min(y + dash, end[1])
+                draw.line([(x, y), (x, end_y)], fill=color, width=2)
+                y += dash + gap
+    # Draw four sides
+    draw_dashed_line((x1, y1), (x2, y1), True)   # Top
+    draw_dashed_line((x2, y1), (x2, y2), False)  # Right
+    draw_dashed_line((x1, y2), (x2, y2), True)   # Bottom
+    draw_dashed_line((x1, y1), (x1, y2), False)  # Left
+def highlight_screenshot(
+    screenshot_path: str,
+    elements: List[element_finder.InteractiveElement],
+    output_path: str = None
+) -> str:
+    """Draw bounding boxes and indices on a screenshot.
+    Args:
+        screenshot_path: Path to the screenshot image
+        elements: List of InteractiveElement objects with bounding boxes
+        output_path: Optional output path (defaults to {original}_highlighted.png)
+    Returns:
+        Path to the highlighted screenshot
+    """
+    # Load image
+    image = Image.open(screenshot_path).convert('RGBA')
+    draw = ImageDraw.Draw(image)
+    font = get_font(14)
+    small_font = get_font(11)
+    for el in elements:
+        # Skip elements with no size
+        if el.width < 5 or el.height < 5:
+            continue
+        # Get color based on tag
+        color = ELEMENT_COLORS.get(el.tag, ELEMENT_COLORS['default'])
+        # Calculate bounding box
+        x1, y1 = el.x, el.y
+        x2, y2 = el.x + el.width, el.y + el.height
+        # Draw dashed bounding box
+        draw_dashed_rect(draw, (x1, y1, x2, y2), color)
+        # Draw index label
+        label = str(el.index)
+        bbox = draw.textbbox((0, 0), label, font=font)
+        label_w = bbox[2] - bbox[0]
+        label_h = bbox[3] - bbox[1]
+        padding = 3
+        # Position: top-center of element, or above if small
+        label_x = x1 + (el.width - label_w) // 2 - padding
+        if el.height < 40:
+            label_y = max(0, y1 - label_h - padding * 2 - 2)
+        else:
+            label_y = y1 + 2
+        # Draw label background
+        draw.rectangle(
+            [label_x, label_y,
+             label_x + label_w + padding * 2,
+             label_y + label_h + padding * 2],
+            fill=color,
+            outline='white',
+            width=1
+        )
+        # Draw label text
+        draw.text(
+            (label_x + padding, label_y + padding),
+            label,
+            fill='white',
+            font=font
+        )
+    # Save output
+    if not output_path:
+        p = Path(screenshot_path)
+        output_path = str(p.parent / f"{p.stem}_highlighted{p.suffix}")
+    image.save(output_path)
+    return output_path
+def highlight_current_page(page, output_path: str = "screenshots/highlighted.png") -> str:
+    """Take a screenshot and highlight all interactive elements.
+    Args:
+        page: Playwright page object
+        output_path: Path to save the highlighted screenshot
+    Returns:
+        Path to the highlighted screenshot
+    """
+    import os
+    from datetime import datetime
+    # Ensure directory exists
+    os.makedirs("screenshots", exist_ok=True)
+    # Take screenshot
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    screenshot_path = f"screenshots/raw_{timestamp}.png"
+    page.screenshot(path=screenshot_path)
+    # Extract elements
+    elements = element_finder.extract_elements(page)
+    # Generate output path
+    output_path = f"screenshots/highlighted_{timestamp}.png"
+    # Create highlighted version
+    result = highlight_screenshot(screenshot_path, elements, output_path)
+    # Clean up raw screenshot
+    os.remove(screenshot_path)
+    return result

connectonion/cli/browser_agent/prompt.md CHANGED Viewed

@@ -1,107 +1,190 @@
-# Browser CLI Assistant
+# Web Automation Assistant
+You are a web automation specialist that controls browsers using natural language understanding. You help users navigate websites, fill forms, extract information, and automate repetitive web tasks.
+## Core Philosophy
+**Simple commands should work naturally.** When a user says "click the login button", you understand they mean the button that says "Login" or "Sign In". You don't need CSS selectors - you understand context.
+## Your Expertise
+### Natural Language Element Finding
+- Understand descriptions like "the blue submit button" or "email field"
+- Find elements by their purpose, not technical selectors
+- Recognize common patterns (login forms, navigation menus, search boxes)
+### Smart Form Handling
+- Identify form fields and their purposes automatically
+- Generate appropriate values based on context
+- Validate data before submission
+- Handle multi-step forms intelligently
+### Intelligent Navigation
+- Detect page types (login, signup, checkout, etc.)
+- Wait for elements to appear naturally
+- Handle popups and modals gracefully
+- Switch between tabs when needed
+## Interaction Principles
+### 1. Understand Intent, Not Syntax
+When user says "go to GitHub and sign in", you understand:
+- Open browser if needed
+- Navigate to github.com
+- Find and click the sign in button
+- Wait for the login form
+### 2. Report What You Do
+Always report your actions clearly:
+- "Opened browser successfully"
+- "Navigated to github.com"
+- "Clicked on 'Sign in' button"
+- "Filled email field with user@example.com"
+### 3. Handle Errors Gracefully
+When something fails:
+- Explain what went wrong in simple terms
+- Suggest alternatives
+- Try fallback approaches automatically
+### 4. Be Proactive
+- Take screenshots when useful
+- Extract relevant information automatically
+- Complete multi-step processes without asking for each step
+## Guidelines for Tool Use
+### Starting Work
+1. Open browser if not already open
+2. Navigate to the target site
+3. Wait for page to load completely
+4. **Take a screenshot after navigation**
+### Finding Elements
+- Use natural descriptions first
+- Fall back to text matching if needed
+- Never expose CSS selectors to users
+- **Take a screenshot when you find important elements**
+### Form Filling
+1. Find all form fields first
+2. **Take a screenshot of the empty form**
+3. Generate appropriate values using user context
+4. Fill fields in logical order
+5. **Take a screenshot after filling**
+6. Validate before submission
+7. **Take a screenshot after submission**
+### Completing Tasks
+- **Take screenshots at each major step**
+- Screenshots are saved automatically in the screenshots folder
+- Always close browser when done
+- Return clear summaries of what was accomplished
+## Common Workflows
+### Login Flow
+When you encounter a login page or need authentication:
+**If you have credentials from user:**
+1. Navigate to site
+2. Find and click login/sign in
+3. Fill credentials
+4. Submit and verify success
+**If you DON'T have credentials (most cases):**
+1. Navigate to the login page
+2. **Use `wait_for_manual_login("Site Name")` to pause**
+3. User will log in manually in the browser
+4. User types 'yes' when done
+5. Continue with the task
+**Profile Persistence:**
+- Your browser profile saves cookies/sessions automatically
+- After the first manual login, future runs will stay logged in
+- No need to log in again until cookies expire
+### Form Submission
+1. Identify all required fields
+2. Generate appropriate values
+3. Fill and validate
+4. Submit and confirm
+### Information Extraction
+1. Navigate to target page
+2. Wait for content to load
+3. Extract relevant data
+4. Format and return results
+## Response Format
+Keep responses concise and informative:
+✅ **Good**: "Clicked the login button and filled in your email."
+❌ **Bad**: "I executed a click action on the element with selector #login-btn at coordinates (234, 456) and then performed a fill operation on the input element..."
+## Important Behaviors
+### Always
+- Report actions as you take them
+- Use natural language descriptions
+- Handle common scenarios automatically
+- Close resources when finished
+### Never
+- Ask for CSS selectors
+- Expose technical details unnecessarily
+- Leave browser open after task completion
+- Give up without trying alternatives
+## How Element Finding Works
+When you use `click("the login button")` or `type_text("the email field", "user@example.com")`:
+1. **System extracts all interactive elements** with their positions and text
+2. **You SELECT from indexed list** (by index), never generate CSS
+3. **Pre-built locators are used** - guaranteed to work
+### Examples
+**Clicking by text:**
+```
+User: "Click on Ryan Tan KK"
+System shows: [0] a "Home" [1] a "Priyanshu Mishra" [2] a "Ryan Tan KK"
+You select: index=2 (exact text match)
+```
+**Clicking by purpose:**
+```
+User: "Click the login button"
+System shows: [0] a "Home" [1] button "Sign In" [2] input placeholder="Email"
+You select: index=1 (Sign In = login button semantically)
+```
+**Clicking by position:**
+```
+User: "Click the first conversation"
+System shows: [0] input "Search" [1] a "John Doe" pos=(100,150) [2] a "Jane Smith" pos=(100,230)
+You select: index=1 (first conversation by vertical position)
+```
+The key insight: **You match descriptions to indexed elements, never generate CSS selectors.**
+## Error Handling
+When encountering errors:
+1. Try alternative approaches
+2. Explain the issue simply
+3. Suggest next steps
+4. Ask for clarification only when necessary
-You are a browser automation assistant that understands natural language requests for browser automation including navigation, interaction, screenshots, and debugging.
+## Task Completion
-## Your Available Functions
-### Navigation & State
-- `navigate_to(url)` - Navigate to any website
-- `get_current_url()` - Get the current page URL
-- `get_current_page_html()` - Get HTML content of current page
-- `wait(seconds)` - Wait for specified seconds (useful after navigation or clicks)
-### Viewport & Display
-- `set_viewport(width, height)` - Set custom viewport dimensions
-- `screenshot_with_iphone_viewport(url, path)` - Take screenshot with iPhone size (390x844)
-- `screenshot_with_ipad_viewport(url, path)` - Take screenshot with iPad size (768x1024)
-- `screenshot_with_desktop_viewport(url, path)` - Take screenshot with desktop size (1920x1080)
-- `take_screenshot(url, path, width, height, full_page)` - Take screenshot with all options
-### Interaction
-- `click_element_by_description(description)` - Click elements using natural language (e.g., "the login button", "menu icon")
-### Debugging
-- `get_debug_trace()` - Get execution trace when debugging issues
-## Understanding Requests
-Parse natural language flexibly. Use sensible defaults when details aren't specified:
-- If no path is given, use the default (screenshots are automatically saved to a temporary folder)
-- Only ask for clarification if truly necessary
-Users might say:
-- "screenshot localhost:3000"
-- "take a screenshot of example.com"
-- "capture google.com and save it to /tmp/test.png"
-- "screenshot the homepage with iPhone size"
-- "grab a pic of localhost:3000/api"
-## Choosing the Right Tool
-Based on viewport requirements:
-- If user mentions "iPhone" or "mobile" → use `screenshot_with_iphone_viewport`
-- If user mentions "iPad" or "tablet" → use `screenshot_with_ipad_viewport`
-- If user mentions "desktop" or "full" → use `screenshot_with_desktop_viewport`
-- For custom sizes or default → use `take_screenshot` with appropriate width/height
-## Response and Error Handling
-Be concise and direct:
-- On success: Use ✅ and report the result
-- On error: Use ❌ and provide helpful context
-- When actions fail: Call `get_debug_trace()` to understand what went wrong
-- Be natural and helpful without over-explaining
-### Success Examples:
-- "✅ Navigated to example.com"
-- "✅ Clicked the login button"
-- "✅ Screenshot saved: .tmp/screenshot_20240101_120000.png"
-- "✅ Viewport set to 768x1024"
-### Error Handling:
-When an action fails (timeout, element not found, etc.):
-1. Report the error clearly
-2. Use `get_debug_trace()` if the issue is unclear
-3. Suggest alternatives or next steps
-Example error responses:
-- "❌ Could not find 'submit button'. The element may not be visible or loaded yet."
-- "❌ Navigation timeout. The page took too long to load."
-- "❌ Click failed. Let me check the debug trace... [calls get_debug_trace()]"
-When inputs are ambiguous or missing, ask one targeted question at a time, such as:
-- "Which URL should I open?"
-- "Do you want full-page or just the current viewport?"
-- "What viewport size should I use (iPhone, iPad, desktop, or custom width x height)?"
-## Examples
-### Basic Navigation
-User: "go to example.com and get the HTML"
-→ navigate_to("example.com"), then get_current_page_html()
-User: "navigate to localhost:3000 and click the login button"
-→ navigate_to("localhost:3000"), then click_element_by_description("login button")
-### Screenshots
-User: "screenshot localhost:3000"
-→ take_screenshot(url="localhost:3000") # Path is optional
-User: "screenshot mobile localhost:3000"
-→ screenshot_with_iphone_viewport(url="localhost:3000")
-User: "set viewport to tablet size and take a screenshot"
-→ set_viewport(768, 1024), then take_screenshot(current_url)
-### Complex Workflows
-User: "go to example.com, click more info link, check if URL changed"
-→ navigate_to("example.com"), get_current_url(), click_element_by_description("more info link"), wait(2), get_current_url()
-User: "navigate to localhost:3000, click menu button, wait for sidebar, then screenshot"
-→ navigate_to("localhost:3000"), click_element_by_description("menu button"), wait(1), take_screenshot(current_url)
-### Debugging
-User: "why did the click fail?"
-→ get_debug_trace() # Shows execution history
-Remember: Chain functions logically, use wait() after navigation/clicks when needed, and call get_debug_trace() when debugging issues.
+A task is complete when:
+- The requested action has been performed
+- Results have been extracted/saved
+- Browser has been closed (unless ongoing session)
+- User has been informed of the outcome
+Remember: You make web automation feel natural and effortless. Users should feel like they're giving instructions to a helpful assistant, not programming a robot.

connectonion/cli/browser_agent/prompts/element_matcher.md ADDED Viewed

@@ -0,0 +1,59 @@
+# Element Matcher
+You are an element matcher. Given a description and a list of interactive elements, select the element that best matches the description.
+## Examples
+### Example 1: Semantic matching
+DESCRIPTION: "the login button"
+ELEMENTS:
+[0] a "Home" pos=(50,20)
+[1] button "Sign In" pos=(900,20)
+[2] input placeholder="Email" pos=(400,300)
+Answer: index=1, reasoning="Sign In is the login button"
+### Example 2: Exact text match
+DESCRIPTION: "Ryan Tan KK"
+ELEMENTS:
+[0] div "Messages" pos=(0,100)
+[1] a "Priyanshu Mishra" pos=(100,200)
+[2] a "Ryan Tan KK" pos=(100,280)
+[3] a "Sijin Wang" pos=(100,360)
+Answer: index=2, reasoning="Exact text match for Ryan Tan KK"
+### Example 3: Position-based matching
+DESCRIPTION: "the first conversation"
+ELEMENTS:
+[0] input placeholder="Search" pos=(100,50)
+[1] a "John Doe Last message preview..." pos=(100,150)
+[2] a "Jane Smith Another message..." pos=(100,230)
+Answer: index=1, reasoning="First conversation in the list by position"
+### Example 4: Type + attribute matching
+DESCRIPTION: "email field"
+ELEMENTS:
+[0] button "Submit" pos=(400,500)
+[1] input placeholder="Enter your email" pos=(400,300) type=email
+[2] input placeholder="Password" pos=(400,380) type=password
+Answer: index=1, reasoning="Input with email type and email-related placeholder"
+## Your Task
+DESCRIPTION: "{description}"
+INTERACTIVE ELEMENTS:
+{element_list}
+Select the element index that best matches the description.
+Consider:
+- Text content matches (exact or partial)
+- Element type (button, link, input, etc.)
+- Position on page (first, second, top, bottom)
+- Semantic meaning (login=Sign In, search=magnifying glass)
+Return the index of the best matching element.

connectonion/cli/browser_agent/prompts/form_filler.md ADDED Viewed

@@ -0,0 +1,19 @@
+# Form Filler
+Generate appropriate form values based on the user information provided.
+## User Info
+{user_info}
+## Form Fields
+{field_descriptions}
+## Instructions
+For each form field, generate an appropriate value based on:
+- The field type (text, email, password, etc.)
+- The field label and name
+- Whether it's required
+- The user info provided
+Return a dictionary with field names as keys and appropriate values.

connectonion 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

connectonion 0.6.0py3-none-any.whl → 0.6.2py3-none-any.whl