npm - @minded-ai/mindedjs - Versions diffs - 2.0.13 → 2.0.14-beta-1 - Mend

@minded-ai/mindedjs 2.0.13 → 2.0.14-beta-1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/dist/browserTask/README.md +419 -0
package/dist/browserTask/browserAgent.py +632 -0
package/dist/browserTask/captcha_isolated.png +0 -0
package/dist/browserTask/executeBrowserTask.ts +79 -0
package/dist/browserTask/requirements.txt +8 -0
package/dist/browserTask/setup.sh +144 -0
package/dist/cli/index.js +0 -0
package/dist/internalTools/retell.d.ts +12 -0
package/dist/internalTools/retell.d.ts.map +1 -0
package/dist/internalTools/retell.js +54 -0
package/dist/internalTools/retell.js.map +1 -0
package/dist/internalTools/sendPlaceholderMessage.d.ts +14 -0
package/dist/internalTools/sendPlaceholderMessage.d.ts.map +1 -0
package/dist/internalTools/sendPlaceholderMessage.js +61 -0
package/dist/internalTools/sendPlaceholderMessage.js.map +1 -0
package/dist/toolsLibrary/classifier.d.ts.map +1 -1
package/dist/toolsLibrary/classifier.js +103 -33
package/dist/toolsLibrary/classifier.js.map +1 -1
package/dist/utils/extractStateMemoryResponse.d.ts +5 -0
package/dist/utils/extractStateMemoryResponse.d.ts.map +1 -0
package/dist/utils/extractStateMemoryResponse.js +91 -0
package/dist/utils/extractStateMemoryResponse.js.map +1 -0
package/package.json +2 -2
package/src/toolsLibrary/classifier.ts +115 -37

package/dist/browserTask/browserAgent.py ADDED Viewed

@@ -0,0 +1,632 @@
+#!/usr/bin/env python3
+"""
+Browser Use AI Agent with Captcha Bypass
+Uses the Python SDK instead of CLI for better control and captcha handling
+"""
+import asyncio
+import sys
+import json
+import argparse
+import base64
+import traceback
+from typing import Optional, Dict, Any
+from pathlib import Path
+try:
+    import cv2
+    import pytesseract
+    import numpy as np
+    from PIL import Image
+    from io import BytesIO
+    from browser_use import Agent, Controller, ActionResult
+    from browser_use.llm import ChatOpenAI
+    from browser_use.browser.types import Page
+    import logging
+except ImportError as e:
+    print(f"Error importing required packages: {e}")
+    print("Please install required dependencies:")
+    print("pip install browser-use opencv-python pytesseract pillow")
+    sys.exit(1)
+# Configure logging: INFO level, each record shows timestamp, logger name,
+# level and message.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+# Module-level logger used by every function in this file.
+logger = logging.getLogger(__name__)
+# Controller that collects the custom actions registered with
+# @controller.action below; it is passed to the Agent in run_browser_task.
+controller = Controller()
+@controller.action('Detect and solve CAPTCHA challenges on the current page')
+async def solve_captcha(page: Page) -> ActionResult:
+    """
+    Advanced CAPTCHA detection and solving tool using GPT-4 vision.
+    Fills CAPTCHA input fields but lets browser-use handle form submission.
+    """
+    try:
+        logger.info("Starting CAPTCHA detection and bypass...")
+        # Wait for page to load
+        await page.wait_for_timeout(3000)
+        # Try to find and screenshot the specific CAPTCHA image first
+        captcha_screenshot = await _capture_captcha_image(page)
+        if captcha_screenshot is None:
+            logger.warning("Could not find specific CAPTCHA image, falling back to full page screenshot")
+            captcha_screenshot = await page.screenshot(full_page=True)
+        # Use GPT-4 to solve the CAPTCHA
+        captcha_result = await _ai_solve_captcha_with_gpt4(captcha_screenshot)
+        if captcha_result.get('success'):
+            # Log which screenshot was analyzed
+            if 'screenshot_path' in captcha_result:
+                logger.info(f"🔍 Analysis used screenshot: {captcha_result['screenshot_path']}")
+            # Try to find input field and fill the solution
+            input_result = await _fill_captcha_solution(page, captcha_result['solution'])
+            return input_result
+        else:
+            # Log screenshot path even for failed attempts
+            if 'screenshot_path' in captcha_result:
+                logger.info(f"🔍 Failed analysis used screenshot: {captcha_result['screenshot_path']}")
+            return ActionResult(
+                extracted_content="No CAPTCHA detected or could not solve it",
+                error=captcha_result.get('message', 'CAPTCHA solving failed')
+            )
+    except Exception as e:
+        logger.error(f"Error in captcha bypass: {str(e)}")
+        return ActionResult(
+            extracted_content="CAPTCHA bypass failed",
+            error=f"CAPTCHA bypass error: {str(e)}"
+        )
+async def _capture_captcha_image(page: Page) -> bytes:
+    """
+    Capture only the CAPTCHA image element for more accurate AI analysis
+    """
+    try:
+        # List of common CAPTCHA image selectors
+        captcha_image_selectors = [
+            # Specific selector from the user's example
+            '#ContentUsersPage_rc1_CaptchaImageUP',
+            # Generic CAPTCHA image selectors
+            'img[id*="captcha" i]',
+            'img[id*="Captcha"]',
+            'img[class*="captcha" i]',
+            'img[src*="captcha" i]',
+            'img[alt*="captcha" i]',
+            # Telerik WebResource patterns (common in ASP.NET)
+            'img[src*="Telerik.Web.UI.WebResource.axd"]',
+            'img[src*="WebResource.axd"][src*="rca"]',
+            # reCAPTCHA and other common patterns
+            '.g-recaptcha img',
+            '.h-captcha img',
+            'canvas[aria-label*="captcha" i]',
+            # Generic patterns by size (typical CAPTCHA dimensions)
+            'img[style*="height:50px"]',
+            'img[style*="width:180px"]',
+        ]
+        captcha_element = None
+        successful_selector = None
+        for selector in captcha_image_selectors:
+            try:
+                captcha_element = await page.query_selector(selector)
+                if captcha_element:
+                    # Verify it's visible and has reasonable dimensions
+                    bounding_box = await captcha_element.bounding_box()
+                    if bounding_box and bounding_box['width'] > 10 and bounding_box['height'] > 10:
+                        successful_selector = selector
+                        logger.info(f"📸 Found CAPTCHA image using selector: {selector}")
+                        logger.info(f"📏 CAPTCHA dimensions: {bounding_box['width']}x{bounding_box['height']} pixels")
+                        break
+            except Exception as e:
+                logger.debug(f"Selector '{selector}' failed: {str(e)}")
+                continue
+        if not captcha_element:
+            logger.warning("❌ Could not find CAPTCHA image element")
+            return None
+        # Take screenshot of just the CAPTCHA element
+        logger.info("📸 Taking screenshot of CAPTCHA image element only...")
+        captcha_screenshot = await captcha_element.screenshot()
+        # Save a debug version to see what element was captured
+        screenshots_dir = Path("screenshots")
+        screenshots_dir.mkdir(exist_ok=True)
+        from datetime import datetime
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
+        debug_path = screenshots_dir / f"captcha_element_{timestamp}.png"
+        with open(debug_path, "wb") as f:
+            f.write(captcha_screenshot)
+        logger.info(f"🎯 CAPTCHA element screenshot saved: {debug_path}")
+        logger.info(f"✅ Successfully captured CAPTCHA using selector: {successful_selector}")
+        return captcha_screenshot
+    except Exception as e:
+        logger.error(f"Error capturing CAPTCHA image: {str(e)}")
+        return None
+async def _ai_solve_captcha_with_gpt4(screenshot: bytes) -> Dict[str, Any]:
+    """
+    Use GPT-4 Vision to solve CAPTCHA challenges
+    """
+    try:
+        from openai import AsyncOpenAI
+        import os
+        from datetime import datetime
+        # Initialize OpenAI client
+        client = AsyncOpenAI(api_key=os.getenv('OPENAI_API_KEY'))
+        # Save screenshot for debugging
+        screenshots_dir = Path("screenshots")
+        screenshots_dir.mkdir(exist_ok=True)
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]  # microseconds to milliseconds
+        # Determine if this is an element screenshot or full page
+        screenshot_type = "captcha_element" if len(screenshot) < 500000 else "full_page"  # rough size estimate
+        screenshot_path = screenshots_dir / f"{screenshot_type}_{timestamp}.png"
+        with open(screenshot_path, "wb") as f:
+            f.write(screenshot)
+        logger.info(f"📸 Screenshot saved to: {screenshot_path}")
+        # Convert screenshot to base64
+        screenshot_b64 = base64.b64encode(screenshot).decode('utf-8')
+        logger.info("🧠 Sending CAPTCHA image to GPT-4 Vision for analysis...")
+        # Create prompt for GPT-4 Vision
+        prompt = """
+        This image contains a CAPTCHA challenge. Please analyze it and provide the solution.
+        Common CAPTCHA types:
+        1. **Text CAPTCHA**: Distorted letters/numbers - read the exact text
+        2. **Math CAPTCHA**: Arithmetic problems (e.g., "3 + 7 = ?") - solve and provide the answer
+        3. **Simple Logic**: Basic questions or patterns
+        Instructions:
+        - Look carefully at all characters, including distorted or stylized text
+        - For math problems, calculate the answer
+        - Ignore background noise, lines, or visual distortions
+        - Focus only on the actual content that needs to be solved
+        Respond with ONLY the solution text/answer. Examples:
+        - "ABC123" (for text CAPTCHA showing these characters)
+        - "10" (for math problem "3 + 7 = ?")
+        - "HELLO" (for distorted text showing these letters)
+        If the image is unclear or you cannot determine the answer, respond with: "UNCLEAR"
+        """
+        response = await client.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{screenshot_b64}",
+                                "detail": "high"
+                            }
+                        }
+                    ]
+                }
+            ],
+            max_tokens=300,
+            temperature=0.1
+        )
+        solution = response.choices[0].message.content.strip()
+        logger.info(f"GPT-4 Vision response: {solution}")
+        # Save the analysis result alongside the screenshot
+        analysis_path = screenshots_dir / f"{screenshot_type}_{timestamp}_analysis.txt"
+        with open(analysis_path, "w") as f:
+            f.write(f"Screenshot: {screenshot_path}\n")
+            f.write(f"Timestamp: {datetime.now()}\n")
+            f.write(f"GPT-4 Response: {solution}\n")
+            f.write(f"Success: {solution.upper() != 'UNCLEAR'}\n")
+        logger.info(f"📝 Analysis result saved to: {analysis_path}")
+        if solution.upper() == "UNCLEAR":
+            return {
+                'success': False,
+                'message': 'Could not determine CAPTCHA solution from image',
+                'screenshot_path': str(screenshot_path)
+            }
+        else:
+            logger.info(f"🎯 GPT-4 solved CAPTCHA: '{solution}'")
+            return {
+                'success': True,
+                'solution': solution,
+                'method': 'gpt4-vision',
+                'screenshot_path': str(screenshot_path)
+            }
+    except Exception as e:
+        logger.error(f"Error in GPT-4 CAPTCHA solving: {str(e)}")
+        return {
+            'success': False,
+            'message': f'GPT-4 CAPTCHA solving error: {str(e)}'
+        }
+async def _fill_captcha_solution(page, solution: str) -> ActionResult:
+    """
+    Find CAPTCHA input field and fill the solution (without submitting)
+    """
+    try:
+        # Common CAPTCHA input field selectors
+        captcha_selectors = [
+            # Case-insensitive CAPTCHA selectors
+            'input[name*="captcha" i]',
+            'input[id*="captcha" i]',
+            'input[class*="captcha" i]',
+            'input[placeholder*="captcha" i]',
+            # ASP.NET specific patterns
+            'input[name*="CaptchaTextBox"]',
+            'input[id*="CaptchaTextBox"]',
+            'input[name*="Captcha"]',
+            'input[id*="Captcha"]',
+            # Hebrew title attribute (for Israeli/Hebrew sites)
+            'input[title*="תווים בתמונה"]',  # "characters in the image"
+            'input[title*="תמונה"]',         # "image"
+            'input[title*="אימות"]',          # "verification"
+            # Generic patterns
+            'input[type="text"][name*="code"]',
+            'input[type="text"][id*="code"]',
+            'input[type="text"][maxlength="6"]',  # Common CAPTCHA length
+            'input[type="text"][maxlength="5"]',  # Common CAPTCHA length
+            'input[type="text"][maxlength="4"]',  # Common CAPTCHA length
+            # Class and structure patterns
+            '.captcha input',
+            '#captcha',
+            '[data-captcha] input',
+            'input[autocomplete="off"][maxlength]',  # Often CAPTCHAs disable autocomplete
+            # Specific selector for your element
+            '#ContentUsersPage_rc1_CaptchaTextBox',
+            'input[name="ctl00\\$ContentUsersPage\\$rc1\\$CaptchaTextBox"]',  # Escaped $ for CSS
+            'input[id="ContentUsersPage_rc1_CaptchaTextBox"]'
+        ]
+        captcha_input = None
+        successful_selector = None
+        for selector in captcha_selectors:
+            try:
+                captcha_input = await page.query_selector(selector)
+                if captcha_input:
+                    successful_selector = selector
+                    logger.info(f"✅ Found CAPTCHA input using selector: {selector}")
+                    # Log element details for debugging
+                    element_name = await captcha_input.get_attribute('name') or 'no-name'
+                    element_id = await captcha_input.get_attribute('id') or 'no-id'
+                    element_maxlength = await captcha_input.get_attribute('maxlength') or 'no-limit'
+                    logger.info(f"📝 CAPTCHA element details - Name: {element_name}, ID: {element_id}, MaxLength: {element_maxlength}")
+                    break
+            except Exception as e:
+                logger.debug(f"Selector '{selector}' failed: {str(e)}")
+                continue
+        if not captcha_input:
+            return ActionResult(
+                extracted_content="Could not find CAPTCHA input field",
+                error="CAPTCHA input field not found"
+            )
+        # Clear the input and enter the solution
+        await captcha_input.fill("")
+        await captcha_input.fill(solution)
+        logger.info(f"✅ Entered CAPTCHA solution: '{solution}' into field")
+        # Verify the value was set correctly
+        filled_value = await captcha_input.input_value()
+        if filled_value == solution:
+            logger.info(f"✅ Confirmed CAPTCHA field contains: '{filled_value}'")
+        else:
+            logger.warning(f"⚠️  CAPTCHA field shows '{filled_value}' but expected '{solution}'")
+        # Wait a moment for the input to register
+        await page.wait_for_timeout(500)
+        return ActionResult(
+            extracted_content=f"Successfully filled CAPTCHA field with: {solution}",
+            include_in_memory=True
+        )
+    except Exception as e:
+        logger.error(f"Error filling CAPTCHA solution: {str(e)}")
+        return ActionResult(
+            extracted_content=f"Error filling CAPTCHA: {str(e)}",
+            error=str(e)
+        )
+async def captcha_detection_hook(agent):
+    """
+    Lifecycle hook to automatically detect and solve CAPTCHAs
+    """
+    try:
+        page = await agent.browser_session.get_current_page()
+        # Check for CAPTCHA error messages that indicate we need to retry
+        captcha_error_indicators = [
+            'הקלדה שגוייה',  # Hebrew: "wrong input"
+            'incorrect captcha',
+            'invalid captcha',
+            'wrong captcha',
+            'captcha error',
+            'try again'
+        ]
+        page_content = await page.content()
+        page_content_lower = page_content.lower()
+        has_captcha_error = any(indicator in page_content_lower for indicator in captcha_error_indicators)
+        # Check for common CAPTCHA indicators in the page
+        captcha_indicators = [
+            'captcha',
+            'recaptcha',
+            'hcaptcha',
+            'verification',
+            'security check',
+            'prove you are human'
+        ]
+        has_captcha = any(indicator in page_content_lower for indicator in captcha_indicators)
+        if has_captcha:
+            # If there's a CAPTCHA error message, we should retry regardless of field content
+            if has_captcha_error:
+                logger.warning("🔄 CAPTCHA error detected! Clearing field and retrying...")
+                await _clear_captcha_field(page)
+            else:
+                # Check if CAPTCHA is already filled before attempting to solve
+                captcha_already_filled = await _is_captcha_already_filled(page)
+                if captcha_already_filled:
+                    logger.info("✅ CAPTCHA appears to be already filled and no errors detected, skipping automatic solving")
+                    return
+            # Initialize attempt tracking if not exists
+            if not hasattr(agent, '_captcha_attempts'):
+                agent._captcha_attempts = {}
+            # Get current page URL as a key for attempt tracking
+            current_url = page.url
+            attempt_count = agent._captcha_attempts.get(current_url, 0)
+            # Limit attempts to prevent infinite loops
+            max_attempts = 10
+            if attempt_count >= max_attempts:
+                logger.error(f"❌ Maximum CAPTCHA attempts ({max_attempts}) reached for {current_url}")
+                return
+            agent._captcha_attempts[current_url] = attempt_count + 1
+            logger.info(f"🎯 CAPTCHA attempt {attempt_count + 1}/{max_attempts} for page")
+            # Use the captcha solving action directly
+            result = await solve_captcha(page=page)
+            # Check if the result was successful by examining the content and error
+            if result.error is None and "Successfully filled CAPTCHA" in (result.extracted_content or ""):
+                logger.info(f"CAPTCHA filled successfully: {result.extracted_content}")
+                logger.info("CAPTCHA field is ready - browser-use can continue with the task")
+            else:
+                error_msg = result.error or result.extracted_content or "Unknown error"
+                logger.warning(f"CAPTCHA solving failed: {error_msg}")
+    except Exception as e:
+        logger.error(f"Error in CAPTCHA detection hook: {str(e)}")
+async def _is_captcha_already_filled(page: Page) -> bool:
+    """
+    Check if CAPTCHA input field is already filled to avoid solving it repeatedly
+    """
+    try:
+        # Common CAPTCHA input field selectors (same as in _fill_captcha_solution)
+        captcha_selectors = [
+            # Case-insensitive CAPTCHA selectors
+            'input[name*="captcha" i]',
+            'input[id*="captcha" i]',
+            'input[class*="captcha" i]',
+            'input[placeholder*="captcha" i]',
+            # ASP.NET specific patterns
+            'input[name*="CaptchaTextBox"]',
+            'input[id*="CaptchaTextBox"]',
+            'input[name*="Captcha"]',
+            'input[id*="Captcha"]',
+            # Hebrew title attribute (for Israeli/Hebrew sites)
+            'input[title*="תווים בתמונה"]',  # "characters in the image"
+            'input[title*="תמונה"]',         # "image"
+            'input[title*="אימות"]',          # "verification"
+            # Generic patterns
+            'input[type="text"][name*="code"]',
+            'input[type="text"][id*="code"]',
+            'input[type="text"][maxlength="6"]',  # Common CAPTCHA length
+            'input[type="text"][maxlength="5"]',  # Common CAPTCHA length
+            'input[type="text"][maxlength="4"]',  # Common CAPTCHA length
+            # Class and structure patterns
+            '.captcha input',
+            '#captcha',
+            '[data-captcha] input',
+            'input[autocomplete="off"][maxlength]',  # Often CAPTCHAs disable autocomplete
+            # Specific selector for your element
+            '#ContentUsersPage_rc1_CaptchaTextBox',
+            'input[name="ctl00\\$ContentUsersPage\\$rc1\\$CaptchaTextBox"]',  # Escaped $ for CSS
+            'input[id="ContentUsersPage_rc1_CaptchaTextBox"]'
+        ]
+        for selector in captcha_selectors:
+            try:
+                captcha_input = await page.query_selector(selector)
+                if captcha_input:
+                    # Check if the input field has a value
+                    current_value = await captcha_input.input_value()
+                    if current_value and len(current_value.strip()) > 0:
+                        logger.info(f"🔍 CAPTCHA field '{selector}' already contains: '{current_value}'")
+                        return True
+            except Exception as e:
+                logger.debug(f"Error checking selector '{selector}': {str(e)}")
+                continue
+        return False
+    except Exception as e:
+        logger.error(f"Error checking if CAPTCHA is filled: {str(e)}")
+        return False  # If we can't check, assume it's not filled to be safe
+async def _clear_captcha_field(page: Page) -> None:
+    """
+    Clear the CAPTCHA input field when there's an error and we need to retry
+    """
+    try:
+        # Same selectors as used elsewhere
+        captcha_selectors = [
+            '#ContentUsersPage_rc1_CaptchaTextBox',
+            'input[name*="captcha" i]',
+            'input[id*="captcha" i]',
+            'input[name*="CaptchaTextBox"]',
+            'input[id*="CaptchaTextBox"]',
+        ]
+        for selector in captcha_selectors:
+            try:
+                captcha_input = await page.query_selector(selector)
+                if captcha_input:
+                    await captcha_input.fill("")
+                    logger.info(f"🧹 Cleared CAPTCHA field: {selector}")
+                    return
+            except Exception as e:
+                logger.debug(f"Error clearing selector '{selector}': {str(e)}")
+                continue
+    except Exception as e:
+        logger.error(f"Error clearing CAPTCHA field: {str(e)}")
+async def run_browser_task(task: str, max_steps: int = 30) -> str:
+    """
+    Execute a browser task using the Python SDK with CAPTCHA bypass
+    """
+    try:
+        # Initialize the LLM
+        llm = ChatOpenAI(model="gpt-4o")
+        # Create the agent with CAPTCHA bypass controller
+        agent = Agent(
+            task=task,
+            llm=llm,
+            controller=controller,
+            max_actions_per_step=3
+        )
+        # Run the agent with lifecycle hooks for automatic CAPTCHA detection
+        logger.info(f"Starting browser task: {task}")
+        result = await agent.run(
+            max_steps=max_steps,
+            on_step_start=captcha_detection_hook
+        )
+        logger.info("Task completed successfully")
+        # Extract the final result
+        if hasattr(result, 'final_result') and callable(result.final_result):
+            final_output = result.final_result()
+        elif hasattr(result, 'message'):
+            final_output = result.message
+        else:
+            final_output = str(result)
+        return final_output or "Task completed successfully"
+    except Exception as e:
+        error_msg = f"Error executing browser task: {str(e)}\n{traceback.format_exc()}"
+        logger.error(error_msg)
+        raise Exception(error_msg)
+async def main():
+    """
+    Main entry point for the browser agent
+    """
+    parser = argparse.ArgumentParser(description='Browser Use Agent with CAPTCHA Bypass')
+    parser.add_argument('-p', '--prompt', required=True, help='Task prompt for the browser agent')
+    parser.add_argument('--max-steps', type=int, default=30, help='Maximum number of steps')
+    parser.add_argument('--output-format', choices=['text', 'json'], default='text', help='Output format')
+    args = parser.parse_args()
+    try:
+        result = await run_browser_task(args.prompt, args.max_steps)
+        if args.output_format == 'json':
+            output = {
+                'success': True,
+                'result': result,
+                'task': args.prompt
+            }
+            print(json.dumps(output, indent=2))
+        else:
+            print(result)
+    except Exception as e:
+        if args.output_format == 'json':
+            output = {
+                'success': False,
+                'error': str(e),
+                'task': args.prompt
+            }
+            print(json.dumps(output, indent=2))
+        else:
+            print(f"Error: {str(e)}", file=sys.stderr)
+        sys.exit(1)
+if __name__ == "__main__":
+    asyncio.run(main())

package/dist/browserTask/captcha_isolated.png ADDED Viewed

Binary file