PyPI - minitap-mobile-use - Versions diffs - 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl - Mend

minitap-mobile-use 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of minitap-mobile-use might be problematic. Click here for more details.

Files changed (24) hide show

minitap/mobile_use/tools/mobile/clear_text.py CHANGED Viewed

@@ -23,6 +23,7 @@ from minitap.mobile_use.tools.utils import (
 )
 from minitap.mobile_use.utils.logger import get_logger
 from minitap.mobile_use.utils.ui_hierarchy import (
+    ElementBounds,
     find_element_by_resource_id,
     get_element_text,
     text_input_is_empty,
@@ -50,16 +51,20 @@ class TextClearer:
         screen_data = get_screen_data(screen_api_client=self.ctx.screen_api_client)
         self.state.latest_ui_hierarchy = screen_data.elements
-    def _get_element_info(self, resource_id: str) -> tuple[object | None, str | None, str | None]:
+    def _get_element_info(
+        self, resource_id: str | None
+    ) -> tuple[object | None, str | None, str | None]:
         if not self.state.latest_ui_hierarchy:
             self._refresh_ui_hierarchy()
         if not self.state.latest_ui_hierarchy:
             return None, None, None
-        element = find_element_by_resource_id(
-            ui_hierarchy=self.state.latest_ui_hierarchy, resource_id=resource_id
-        )
+        element = None
+        if resource_id:
+            element = find_element_by_resource_id(
+                ui_hierarchy=self.state.latest_ui_hierarchy, resource_id=resource_id
+            )
         if not element:
             return None, None, None
@@ -83,11 +88,27 @@ class TextClearer:
     def _should_clear_text(self, current_text: str | None, hint_text: str | None) -> bool:
         return current_text is not None and current_text != "" and current_text != hint_text
-    def _prepare_element_for_clearing(self, resource_id: str) -> bool:
-        if not focus_element_if_needed(ctx=self.ctx, resource_id=resource_id):
+    def _prepare_element_for_clearing(
+        self,
+        text_input_resource_id: str | None,
+        text_input_coordinates: ElementBounds | None,
+        text_input_text: str | None,
+    ) -> bool:
+        if not focus_element_if_needed(
+            ctx=self.ctx,
+            input_resource_id=text_input_resource_id,
+            input_coordinates=text_input_coordinates,
+            input_text=text_input_text,
+        ):
             return False
-        move_cursor_to_end_if_bounds(ctx=self.ctx, state=self.state, resource_id=resource_id)
+        move_cursor_to_end_if_bounds(
+            ctx=self.ctx,
+            state=self.state,
+            text_input_resource_id=text_input_resource_id,
+            text_input_coordinates=text_input_coordinates,
+            text_input_text=text_input_text,
+        )
         return True
     def _erase_text_attempt(self, text_length: int) -> str | None:
@@ -102,7 +123,12 @@ class TextClearer:
         return None
     def _clear_with_retries(
-        self, resource_id: str, initial_text: str, hint_text: str | None
+        self,
+        text_input_resource_id: str | None,
+        text_input_coordinates: ElementBounds | None,
+        text_input_text: str | None,
+        initial_text: str,
+        hint_text: str | None,
     ) -> tuple[bool, str | None, int]:
         current_text = initial_text
         erased_chars = 0
@@ -118,18 +144,25 @@ class TextClearer:
             erased_chars += chars_to_erase
             self._refresh_ui_hierarchy()
-            elt = find_element_by_resource_id(
-                ui_hierarchy=self.state.latest_ui_hierarchy or [],
-                resource_id=resource_id,
-            )
-            if elt:
-                current_text = get_element_text(elt)
-                logger.info(f"Current text: {current_text}")
-                if text_input_is_empty(text=current_text, hint_text=hint_text):
-                    break
+            elt = None
+            if text_input_resource_id:
+                elt = find_element_by_resource_id(
+                    ui_hierarchy=self.state.latest_ui_hierarchy or [],
+                    resource_id=text_input_resource_id,
+                )
+                if elt:
+                    current_text = get_element_text(elt)
+                    logger.info(f"Current text: {current_text}")
+                    if text_input_is_empty(text=current_text, hint_text=hint_text):
+                        break
             move_cursor_to_end_if_bounds(
-                ctx=self.ctx, state=self.state, resource_id=resource_id, elt=elt
+                ctx=self.ctx,
+                state=self.state,
+                text_input_resource_id=text_input_resource_id,
+                text_input_coordinates=text_input_coordinates,
+                text_input_text=text_input_text,
+                elt=elt,
             )
         return True, current_text, erased_chars
@@ -162,7 +195,9 @@ class TextClearer:
             hint_text=hint_text,
         )
-    def _handle_element_not_found(self, resource_id: str, hint_text: str | None) -> ClearTextResult:
+    def _handle_element_not_found(
+        self, resource_id: str | None, hint_text: str | None
+    ) -> ClearTextResult:
         error = erase_text_controller(ctx=self.ctx)
         self._refresh_ui_hierarchy()
@@ -176,16 +211,23 @@ class TextClearer:
             hint_text=hint_text,
         )
-    def clear_text_by_resource_id(self, resource_id: str) -> ClearTextResult:
-        element, current_text, hint_text = self._get_element_info(resource_id)
+    def clear_input_text(
+        self,
+        text_input_resource_id: str | None,
+        text_input_coordinates: ElementBounds | None,
+        text_input_text: str | None,
+    ) -> ClearTextResult:
+        element, current_text, hint_text = self._get_element_info(text_input_resource_id)
         if not element:
-            return self._handle_element_not_found(resource_id, hint_text)
+            return self._handle_element_not_found(text_input_resource_id, hint_text)
         if not self._should_clear_text(current_text, hint_text):
             return self._handle_no_clearing_needed(current_text, hint_text)
-        if not self._prepare_element_for_clearing(resource_id):
+        if not self._prepare_element_for_clearing(
+            text_input_resource_id, text_input_coordinates, text_input_text
+        ):
             return self._create_result(
                 success=False,
                 error_message="Failed to focus element",
@@ -195,7 +237,9 @@ class TextClearer:
             )
         success, final_text, chars_erased = self._clear_with_retries(
-            resource_id=resource_id,
+            text_input_resource_id=text_input_resource_id,
+            text_input_coordinates=text_input_coordinates,
+            text_input_text=text_input_text,
             initial_text=current_text or "",
             hint_text=hint_text,
         )
@@ -218,12 +262,16 @@ def get_clear_text_tool(ctx: MobileUseContext):
         state: Annotated[State, InjectedState],
         agent_thought: str,
         text_input_resource_id: str,
+        text_input_coordinates: ElementBounds | None,
+        text_input_text: str | None,
     ):
         """
         Clears all the text from the text field, by focusing it if needed.
         """
         clearer = TextClearer(ctx, state)
-        result = clearer.clear_text_by_resource_id(text_input_resource_id)
+        result = clearer.clear_input_text(
+            text_input_resource_id, text_input_coordinates, text_input_text
+        )
         content = (
             clear_text_wrapper.on_failure_fn(result.error_message)

minitap/mobile_use/tools/mobile/copy_text_from.py CHANGED Viewed

@@ -1,18 +1,20 @@
+from typing import Annotated
 from langchain_core.messages import ToolMessage
 from langchain_core.tools import tool
 from langchain_core.tools.base import InjectedToolCallId
+from langgraph.prebuilt import InjectedState
 from langgraph.types import Command
+from pydantic import Field
 from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
+from minitap.mobile_use.context import MobileUseContext
 from minitap.mobile_use.controllers.mobile_command_controller import SelectorRequest
 from minitap.mobile_use.controllers.mobile_command_controller import (
     copy_text_from as copy_text_from_controller,
 )
-from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
-from pydantic import Field
-from typing import Annotated
-from minitap.mobile_use.context import MobileUseContext
 from minitap.mobile_use.graph.state import State
-from langgraph.prebuilt import InjectedState
+from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
 def get_copy_text_from_tool(ctx: MobileUseContext):

minitap/mobile_use/tools/mobile/{take_screenshot.py → glimpse_screen.py} RENAMED Viewed

@@ -1,8 +1,11 @@
+from typing import Annotated
 from langchain_core.messages import ToolMessage
 from langchain_core.tools import tool
 from langchain_core.tools.base import InjectedToolCallId
 from langgraph.prebuilt import InjectedState
 from langgraph.types import Command
 from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
 from minitap.mobile_use.context import MobileUseContext
 from minitap.mobile_use.controllers.mobile_command_controller import (
@@ -11,18 +14,18 @@ from minitap.mobile_use.controllers.mobile_command_controller import (
 from minitap.mobile_use.graph.state import State
 from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
 from minitap.mobile_use.utils.media import compress_base64_jpeg
-from typing import Annotated
-def get_take_screenshot_tool(ctx: MobileUseContext):
+def get_glimpse_screen_tool(ctx: MobileUseContext):
     @tool
-    def take_screenshot(
+    def glimpse_screen(
         tool_call_id: Annotated[str, InjectedToolCallId],
         state: Annotated[State, InjectedState],
         agent_thought: str,
     ):
         """
-        Take a screenshot of the device.
+        Captures the current screen as an image.
+        The resulting screenshot is added to the context for the next reasoning step.
         """
         compressed_image_base64 = None
         has_failed = False
@@ -36,9 +39,9 @@ def get_take_screenshot_tool(ctx: MobileUseContext):
         tool_message = ToolMessage(
             tool_call_id=tool_call_id,
-            content=take_screenshot_wrapper.on_failure_fn()
+            content=glimpse_screen_wrapper.on_failure_fn()
             if has_failed
-            else take_screenshot_wrapper.on_success_fn(),
+            else glimpse_screen_wrapper.on_success_fn(),
             additional_kwargs={"error": output} if has_failed else {},
             status="error" if has_failed else "success",
         )
@@ -56,11 +59,12 @@ def get_take_screenshot_tool(ctx: MobileUseContext):
             ),
         )
-    return take_screenshot
+    return glimpse_screen
-take_screenshot_wrapper = ToolWrapper(
-    tool_fn_getter=get_take_screenshot_tool,
-    on_success_fn=lambda: "Screenshot taken successfully.",
-    on_failure_fn=lambda: "Failed to take screenshot.",
+glimpse_screen_wrapper = ToolWrapper(
+    tool_fn_getter=get_glimpse_screen_tool,
+    on_success_fn=lambda: "Visual context captured successfully."
+    + "It is now available for immediate analysis.",
+    on_failure_fn=lambda: "Failed to capture visual context.",
 )

minitap/mobile_use/tools/mobile/input_text.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-from typing import Literal
+from typing import Annotated, Literal
 from langchain_core.messages import ToolMessage
 from langchain_core.tools import tool
@@ -8,10 +8,12 @@ from langchain_core.tools.base import InjectedToolCallId
 from langgraph.prebuilt import InjectedState
 from langgraph.types import Command
 from pydantic import BaseModel
-from typing import Annotated
 from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
 from minitap.mobile_use.context import MobileUseContext
+from minitap.mobile_use.controllers.mobile_command_controller import (
+    get_screen_data,
+)
 from minitap.mobile_use.controllers.mobile_command_controller import (
     input_text as input_text_controller,
 )
@@ -19,6 +21,11 @@ from minitap.mobile_use.graph.state import State
 from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
 from minitap.mobile_use.tools.utils import focus_element_if_needed, move_cursor_to_end_if_bounds
 from minitap.mobile_use.utils.logger import get_logger
+from minitap.mobile_use.utils.ui_hierarchy import (
+    ElementBounds,
+    find_element_by_resource_id,
+    get_element_text,
+)
 logger = get_logger(__name__)
@@ -47,7 +54,9 @@ def get_input_text_tool(ctx: MobileUseContext):
         state: Annotated[State, InjectedState],
         agent_thought: str,
         text: str,
-        text_input_resource_id: str,
+        text_input_resource_id: str | None,
+        text_input_coordinates: ElementBounds | None,
+        text_input_text: str | None,
     ):
         """
         Focus a text field and type text into it.
@@ -55,23 +64,83 @@ def get_input_text_tool(ctx: MobileUseContext):
         - Ensure the corresponding element is focused (tap if necessary).
         - If bounds are available, tap near the end to place the cursor at the end.
         - Type the provided `text` using the controller.
+        Args:
+            tool_call_id: The ID of the tool call.
+            state: The state of the agent.
+            agent_thought: The thought of the agent.
+            text: The text to type.
+            text_input_resource_id: The resource ID of the text input (if available).
+            text_input_coordinates: The bounds (ElementBounds) of the text input (if available).
+            text_input_text: The current text content of the text input (if available).
         """
-        focused = focus_element_if_needed(ctx=ctx, resource_id=text_input_resource_id)
-        if focused:
-            move_cursor_to_end_if_bounds(ctx=ctx, state=state, resource_id=text_input_resource_id)
+        focused = focus_element_if_needed(
+            ctx=ctx,
+            input_resource_id=text_input_resource_id,
+            input_coordinates=text_input_coordinates,
+            input_text=text_input_text,
+        )
+        if not focused:
+            error_message = "Failed to focus the text input element before typing."
+            tool_message = ToolMessage(
+                tool_call_id=tool_call_id,
+                content=input_text_wrapper.on_failure_fn(text, error_message),
+                additional_kwargs={"error": error_message},
+                status="error",
+            )
+            return Command(
+                update=state.sanitize_update(
+                    ctx=ctx,
+                    update={
+                        "agents_thoughts": [agent_thought, error_message],
+                        EXECUTOR_MESSAGES_KEY: [tool_message],
+                    },
+                    agent="executor",
+                ),
+            )
+        move_cursor_to_end_if_bounds(
+            ctx=ctx,
+            state=state,
+            text_input_resource_id=text_input_resource_id,
+            text_input_coordinates=text_input_coordinates,
+            text_input_text=text_input_text,
+        )
         result = _controller_input_text(ctx=ctx, text=text)
         status: Literal["success", "error"] = "success" if result.ok else "error"
-        content_msg = (
-            input_text_wrapper.on_success_fn(text)
+        text_input_content = ""
+        if status == "success":
+            if text_input_resource_id is not None:
+                # Verification phase for elements with resource_id
+                screen_data = get_screen_data(screen_api_client=ctx.screen_api_client)
+                state.latest_ui_hierarchy = screen_data.elements
+                element = find_element_by_resource_id(
+                    ui_hierarchy=state.latest_ui_hierarchy, resource_id=text_input_resource_id
+                )
+                if not element:
+                    result = InputResult(ok=False, error="Element not found")
+                if element:
+                    text_input_content = get_element_text(element)
+            else:
+                # For elements without resource_id, skip verification and use direct message
+                pass
+        agent_outcome = (
+            input_text_wrapper.on_success_fn(text, text_input_content, text_input_resource_id)
             if result.ok
-            else input_text_wrapper.on_failure_fn(text)
+            else input_text_wrapper.on_failure_fn(text, result.error)
         )
         tool_message = ToolMessage(
             tool_call_id=tool_call_id,
-            content=content_msg,
+            content=agent_outcome,
             additional_kwargs={"error": result.error} if not result.ok else {},
             status=status,
         )
@@ -80,7 +149,7 @@ def get_input_text_tool(ctx: MobileUseContext):
             update=state.sanitize_update(
                 ctx=ctx,
                 update={
-                    "agents_thoughts": [agent_thought],
+                    "agents_thoughts": [agent_thought, agent_outcome],
                     EXECUTOR_MESSAGES_KEY: [tool_message],
                 },
                 agent="executor",
@@ -90,8 +159,20 @@ def get_input_text_tool(ctx: MobileUseContext):
     return input_text
+def _on_input_success(text, text_input_content, text_input_resource_id):
+    """Success message handler for input text operations."""
+    if text_input_resource_id is not None:
+        return (
+            f"Typed {repr(text)}.\n"
+            f"Here is the whole content of input with id {repr(text_input_resource_id)}: "
+            f"{repr(text_input_content)}"
+        )
+    else:
+        return "Typed text, should now verify before moving forward"
 input_text_wrapper = ToolWrapper(
     tool_fn_getter=get_input_text_tool,
-    on_success_fn=lambda text: f"Successfully typed {text}",
-    on_failure_fn=lambda text: f"Failed to input text {text}",
+    on_success_fn=_on_input_success,
+    on_failure_fn=lambda text, error: f"Failed to input text {repr(text)}. Reason: {error}",
 )

minitap/mobile_use/tools/mobile/paste_text.py CHANGED Viewed

@@ -1,16 +1,22 @@
+from typing import Annotated
 from langchain_core.messages import ToolMessage
 from langchain_core.tools import tool
 from langchain_core.tools.base import InjectedToolCallId
+from langgraph.prebuilt import InjectedState
 from langgraph.types import Command
 from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
 from minitap.mobile_use.context import MobileUseContext
+from minitap.mobile_use.controllers.mobile_command_controller import (
+    get_screen_data,
+)
 from minitap.mobile_use.controllers.mobile_command_controller import (
     paste_text as paste_text_controller,
 )
 from minitap.mobile_use.graph.state import State
-from langgraph.prebuilt import InjectedState
 from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
-from typing import Annotated
+from minitap.mobile_use.utils.ui_hierarchy import find_element_by_resource_id, get_element_text
 def get_paste_text_tool(ctx: MobileUseContext):
@@ -19,6 +25,7 @@ def get_paste_text_tool(ctx: MobileUseContext):
         tool_call_id: Annotated[str, InjectedToolCallId],
         state: Annotated[State, InjectedState],
         agent_thought: str,
+        focused_element_resource_id: str,
     ):
         """
         Pastes text previously copied via `copyTextFrom` into the currently focused field.
@@ -32,12 +39,29 @@ def get_paste_text_tool(ctx: MobileUseContext):
             - pasteText
         """
         output = paste_text_controller(ctx=ctx)
+        text_input_content = ""
+        screen_data = get_screen_data(screen_api_client=ctx.screen_api_client)
+        state.latest_ui_hierarchy = screen_data.elements
+        element = find_element_by_resource_id(
+            ui_hierarchy=state.latest_ui_hierarchy, resource_id=focused_element_resource_id
+        )
+        if element:
+            text_input_content = get_element_text(element)
         has_failed = output is not None
+        agent_outcome = (
+            paste_text_wrapper.on_success_fn(text_input_content)
+            if not has_failed
+            else paste_text_wrapper.on_failure_fn(text_input_content)
+        )
         tool_message = ToolMessage(
             tool_call_id=tool_call_id,
-            content=paste_text_wrapper.on_failure_fn()
-            if has_failed
-            else paste_text_wrapper.on_success_fn(),
+            content=agent_outcome,
             additional_kwargs={"error": output} if has_failed else {},
             status="error" if has_failed else "success",
         )
@@ -45,7 +69,7 @@ def get_paste_text_tool(ctx: MobileUseContext):
             update=state.sanitize_update(
                 ctx=ctx,
                 update={
-                    "agents_thoughts": [agent_thought],
+                    "agents_thoughts": [agent_thought, agent_outcome],
                     EXECUTOR_MESSAGES_KEY: [tool_message],
                 },
                 agent="executor",
@@ -57,6 +81,8 @@ def get_paste_text_tool(ctx: MobileUseContext):
 paste_text_wrapper = ToolWrapper(
     tool_fn_getter=get_paste_text_tool,
-    on_success_fn=lambda: "Text pasted successfully.",
-    on_failure_fn=lambda: "Failed to paste text.",
+    on_success_fn=lambda input_content: "Text pasted successfully. Here is the actual"
+    + f"content of the text field : {repr(input_content)}",
+    on_failure_fn=lambda input_content: "Failed to paste text."
+    + f"Here is the actual content of the text field : {repr(input_content)}",
 )

minitap-mobile-use 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

Potentially problematic release.

minitap-mobile-use 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl