minitap-mobile-use 2.5.3__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (43)
  1. minitap/mobile_use/agents/contextor/contextor.py +0 -8
  2. minitap/mobile_use/agents/cortex/cortex.md +122 -36
  3. minitap/mobile_use/agents/cortex/cortex.py +32 -17
  4. minitap/mobile_use/agents/cortex/types.py +18 -4
  5. minitap/mobile_use/agents/executor/executor.md +3 -3
  6. minitap/mobile_use/agents/executor/executor.py +10 -3
  7. minitap/mobile_use/agents/hopper/hopper.md +30 -2
  8. minitap/mobile_use/agents/hopper/hopper.py +19 -15
  9. minitap/mobile_use/agents/orchestrator/orchestrator.py +14 -5
  10. minitap/mobile_use/agents/outputter/outputter.py +13 -3
  11. minitap/mobile_use/agents/planner/planner.md +20 -9
  12. minitap/mobile_use/agents/planner/planner.py +12 -5
  13. minitap/mobile_use/agents/screen_analyzer/human.md +16 -0
  14. minitap/mobile_use/agents/screen_analyzer/screen_analyzer.py +111 -0
  15. minitap/mobile_use/clients/ios_client.py +7 -3
  16. minitap/mobile_use/config.py +87 -24
  17. minitap/mobile_use/controllers/mobile_command_controller.py +354 -88
  18. minitap/mobile_use/controllers/platform_specific_commands_controller.py +41 -27
  19. minitap/mobile_use/controllers/types.py +95 -0
  20. minitap/mobile_use/graph/graph.py +55 -11
  21. minitap/mobile_use/graph/state.py +10 -3
  22. minitap/mobile_use/main.py +12 -4
  23. minitap/mobile_use/sdk/agent.py +113 -72
  24. minitap/mobile_use/sdk/examples/smart_notification_assistant.py +59 -10
  25. minitap/mobile_use/sdk/services/platform.py +15 -1
  26. minitap/mobile_use/sdk/types/platform.py +1 -0
  27. minitap/mobile_use/sdk/types/task.py +10 -1
  28. minitap/mobile_use/servers/device_hardware_bridge.py +13 -6
  29. minitap/mobile_use/services/llm.py +5 -2
  30. minitap/mobile_use/tools/index.py +7 -9
  31. minitap/mobile_use/tools/mobile/{clear_text.py → focus_and_clear_text.py} +7 -7
  32. minitap/mobile_use/tools/mobile/{input_text.py → focus_and_input_text.py} +8 -8
  33. minitap/mobile_use/tools/mobile/long_press_on.py +130 -15
  34. minitap/mobile_use/tools/mobile/swipe.py +3 -26
  35. minitap/mobile_use/tools/mobile/tap.py +41 -28
  36. minitap/mobile_use/tools/mobile/wait_for_delay.py +84 -0
  37. minitap/mobile_use/utils/cli_helpers.py +10 -6
  38. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/METADATA +1 -1
  39. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/RECORD +41 -39
  40. minitap/mobile_use/tools/mobile/glimpse_screen.py +0 -74
  41. minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +0 -64
  42. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/WHEEL +0 -0
  43. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,4 +1,3 @@
- from minitap.mobile_use.agents.executor.utils import is_last_tool_message_take_screenshot
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data
  from minitap.mobile_use.controllers.platform_specific_commands_controller import (
@@ -26,16 +25,9 @@ class ContextorNode:
  focused_app_info = get_focused_app_info(self.ctx)
  device_date = get_device_date(self.ctx)

- should_add_screenshot_context = is_last_tool_message_take_screenshot(
- list(state.executor_messages)
- )
-
  return await state.asanitize_update(
  ctx=self.ctx,
  update={
- "latest_screenshot_base64": device_data.base64
- if should_add_screenshot_context
- else None,
  "latest_ui_hierarchy": device_data.elements,
  "focused_app_info": focused_app_info,
  "screen_size": (device_data.width, device_data.height),
@@ -21,16 +21,22 @@ If you detect a cycle, you are **FORBIDDEN** from repeating it. You must pivot y
  To understand the device state, you have two senses, each with its purpose:

  1. **UI Hierarchy (Your sense of "Touch"):**
- * **What it is:** A structured list of all elements on the screen.
- * **Use it for:** Finding elements by `resource-id`, checking for specific text, and understanding the layout structure.
- * **Limitation:** It does NOT tell you what the screen *looks* like. It can be incomplete, and it contains no information about images, colors, or whether an element is visually obscured.

- 2. **`glimpse_screen` (Your sense of "Sight"):**
- * **What it is:** A tool that provides a real, up-to-date image of the screen.
- * **Use it for:** Confirming what is actually visible. This is your source of TRUTH for all visual information (icons, images, element positions, colors).
- * **Golden Rule:** When the UI hierarchy is ambiguous, seems incomplete, or when you need to verify a visual detail before acting, **`glimpse_screen` is always the most effective and reliable action.** Never guess what the screen looks like; use your sight to be sure.
+ - **What it is:** A structured list of all elements on the screen.
+ - **Use it for:** Finding elements by `resource-id`, checking for specific text, and understanding the layout structure.
+ - **Limitation:** It does NOT tell you what the screen _looks_ like. It can be incomplete, and it contains no information about images, colors, or whether an element is visually obscured.

- **CRITICAL NOTE ON SIGHT:** The visual information from `glimpse_screen` is **ephemeral**. It is available for **THIS decision turn ONLY**. You MUST extract all necessary information from it IMMEDIATELY, as it will be cleared before the next step.
+ 2. **`screen_analyzer` (Your sense of "Sight"):**
+ - **What it is:** A specialized agent that captures the screen and uses a vision model to answer specific questions about what is visible.
+ - **When to use it:** ONLY when the UI hierarchy is insufficient to make a decision. Use it sparingly for:
+ - Verifying visual elements that are not in the UI hierarchy (images, icons, colors)
+ - Confirming element visibility when hierarchy seems incomplete or ambiguous
+ - Identifying visual content that cannot be determined from text alone
+ - **When NOT to use it:** If the UI hierarchy contains the information you need (resource-ids, text, bounds), use that instead. Screen analysis is slower and should be a last resort.
+ - **How to use it:** Set the `screen_analysis_prompt` field in your output with a specific, focused question (e.g., "Is there a red notification badge on the Messages icon?", "What color is the submit button?").
+ - **Golden Rule:** Prefer the UI hierarchy first. Only request screen analysis when you genuinely cannot proceed without visual confirmation.
+
+ **CRITICAL NOTE ON SIGHT:** Screen analysis adds latency and is mutually exclusive with execution decisions. When you set `screen_analysis_prompt` WITHOUT providing `Structured Decisions`, the screen_analyzer agent will run and its analysis will appear in the subsequent agent thoughts. However, if you provide both `screen_analysis_prompt` and `Structured Decisions`, the execution decisions take priority and screen analysis is discarded. Use this capability judiciously—only when the UI hierarchy truly lacks the information needed for your decision.

  ### CRITICAL ACTION DIRECTIVES
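
For reference, the `screen_analysis_prompt` field described in this prompt corresponds to the new `CortexOutput` schema in `agents/cortex/types.py` further down this diff. A minimal analysis-only turn against that schema might look roughly like this (values are illustrative; the badge question is taken from the prompt text above):

```json
{
  "decisions": null,
  "decisions_reason": null,
  "goals_completion_reason": null,
  "complete_subgoals_by_ids": [],
  "screen_analysis_prompt": "Is there a red notification badge on the Messages icon?"
}
```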
@@ -40,8 +46,9 @@ To understand the device state, you have two senses, each with its purpose:
  ### Context You Receive:

  - 📱 **Device state**:
- - Latest **UI hierarchy** and (if available) a **screenshot**.
- - **CRITICAL NOTE ON SIGHT:** The visual information from `glimpse_screen` is **ephemeral**. It is available for **THIS decision turn ONLY**. You MUST extract all necessary information from it IMMEDIATELY, as it will be cleared before the next step.
+
+ - Latest **UI hierarchy**
+ - Results from the **screen_analyzer** agent (if you previously requested analysis via `screen_analysis_prompt`, you'll see the result in agent thoughts)

  - 🧭 **Task context**:
  - The user's **initial goal**
@@ -55,6 +62,7 @@ To understand the device state, you have two senses, each with its purpose:
  Focus on the **current PENDING subgoal and the next subgoals not yet started**.

  **CRITICAL: Before making any decision, you MUST thoroughly analyze the agent thoughts history to:**
+
  - **Detect patterns of failure or repeated attempts** that suggest the current approach isn't working
  - **Identify contradictions** between what was planned and what actually happened
  - **Spot errors in previous reasoning** that need to be corrected
@@ -62,6 +70,7 @@ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
  - **Avoid repeating failed approaches** by recognizing when to change strategy

  1. **Analyze the agent thoughts first** - Review all previous agent thoughts to understand:
+
  - What strategies have been tried and their outcomes
  - Any errors or misconceptions in previous reasoning
  - Patterns that indicate success or failure
@@ -76,18 +85,17 @@ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
  - Recent tool effects and whether they matched expectations from agent thoughts
  - **Any corrections needed to previous reasoning or strategy**

-
  ### The Rule of Element Interaction

  **You MUST follow it for every element interaction.**

- When you target a UI element (for a `tap`, `input_text`, `clear_text`, etc.), you **MUST** provide a comprehensive `target` object containing every piece of information you can find about **that single element**.
+ When you target a UI element (for a `tap`, `focus_and_input_text`, `focus_and_clear_text`, etc.), you **MUST** provide a comprehensive `target` object containing every piece of information you can find about **that single element**.

- * **1. `resource_id`**: Include this if it is present in the UI hierarchy.
- * **2. `resource_id_index`**: If there are multiple elements with the same `resource_id`, provide the zero-based index of the specific one you are targeting.
- * **3. `coordinates`**: Include the full bounds (`x`, `y`, `width`, `height`) if they are available.
- * **4. `text`**: Include the *current text* content of the element (e.g., placeholder text for an input).
- * **5. `text_index`**: If there are multiple elements with the same `text`, provide the zero-based index of the specific one you are targeting.
+ - **1. `resource_id`**: Include this if it is present in the UI hierarchy.
+ - **2. `resource_id_index`**: If there are multiple elements with the same `resource_id`, provide the zero-based index of the specific one you are targeting.
+ - **3. `coordinates`**: Include the full bounds (`x`, `y`, `width`, `height`) if they are available.
+ - **4. `text`**: Include the _current text_ content of the element (e.g., placeholder text for an input).
+ - **5. `text_index`**: If there are multiple elements with the same `text`, provide the zero-based index of the specific one you are targeting.

  **CRITICAL: The index must correspond to its identifier.** `resource_id_index` is only used when targeting by `resource_id`. `text_index` is only used when targeting by `text`. This ensures the fallback logic targets the correct element.
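
For reference, a fully populated `target` object following the five fields above might look like this (the element and its values are hypothetical, not taken from the package):

```json
{
  "resource_id": "com.whatsapp:id/entry",
  "resource_id_index": 0,
  "coordinates": { "x": 80, "y": 1720, "width": 880, "height": 120 },
  "text": "Type a message",
  "text_index": 0
}
```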
@@ -96,18 +104,17 @@ When you target a UI element (for a `tap`, `input_text`, `clear_text`, etc.), yo
  ### The Rule of Unpredictable Actions

  Certain actions have outcomes that can significantly and sometimes unpredictably change the UI. These include:
+
  - `back`
  - `launch_app`
  - `stop_app`
  - `open_link`
  - `tap` on an element that is clearly for navigation (e.g., a "Back" button, a menu item, a link to another screen).

- **CRITICAL RULE: If your decision includes one of these unpredictable actions, it MUST be the only action in your `Structured Decisions` for this turn. Else, use flows to group actions together.**
+ **CRITICAL RULE: If your decision includes one of these unpredictable actions, it MUST be the only action in your `Structured Decisions` for this turn. Else, provide multiple decisions in your `Structured Decisions`, in the right order, to group actions together.**

  This is not optional. Failing to isolate these actions will cause the system to act on an outdated understanding of the screen, leading to catastrophic errors. For example, after a `back` command, you MUST wait to see the new screen before deciding what to tap next.

- You may only group simple, predictable actions together, such as tapping a text field and then immediately typing into it (`tap` followed by `input_text`).
-
  ### Outputting Your Decisions

  If you decide to act, output a **valid JSON stringified structured set of instructions** for the Executor.
@@ -117,9 +124,9 @@ If you decide to act, output a **valid JSON stringified structured set of instru
  - Your goal is to achieve subgoals **fast** - so you must put as much actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
  - If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `resource-id-index: 0`, `text: "Alice"`, `resource-id-index: 0`, `x: 100, y: 200, width: 100, height: 100`).
  - **The structure is up to you**, but it must be valid **JSON stringified output**. You will accompany this output with a **natural-language summary** of your reasoning and approach in your agent thought.
- - **Always use a single `input_text` action** to type in a field. This tool handles focusing the element and placing the cursor correctly. If the tool feedback indicates verification is needed or shows None/empty content, perform verification before proceeding.
+ - **Always use a single `focus_and_input_text` action** to type in a field. This tool handles focusing the element, placing the cursor correctly and typing the text. If the tool feedback indicates verification is needed or shows None/empty content, perform verification before proceeding.
  - **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed**.
- - **For text clearing**: When you need to completely clear text from an input field, always call the `clear_text` tool with the correct resource_id. This tool automatically focuses the element, and ensures the field is emptied. If you notice this tool fails to clear the text, try to long press the input, select all, and call `erase_one_char`.
+ - **For text clearing**: When you need to completely clear text from an input field, always call the `focus_and_clear_text` tool with the correct resource_id. This tool automatically focuses the element, and ensures the field is emptied. If you notice this tool fails to clear the text, try to long press the input, select all, and call `erase_one_char`.

  ### Output

@@ -129,15 +136,27 @@ If you decide to act, output a **valid JSON stringified structured set of instru
  - **Structured Decisions** _(optional)_:
  A **valid stringified JSON** describing what should be executed **right now** to advance through the subgoals as much as possible.

- - **Agent Thought** _(2-4 sentences)_:
- **MANDATORY: Start by analyzing previous agent thoughts** - Did previous reasoning contain errors? Are we repeating failed approaches? What worked before in similar situations?
-
- Then explain your current decision based on this analysis. If there is any information you need to remember for later steps, you must include it here, because only the agent thoughts will be used to produce the final structured output.
+ - **Decisions Reason** _(2-4 sentences)_:
+ Start by analyzing previous agent thoughts. Then explain your current decision. Explicitly mention if correcting errors or changing strategy. Include checkpoints for indefinite actions (e.g., "Swiping up - last seen recipe was X").

- This also helps other agents understand your decision and learn from future failures. **Explicitly mention if you're correcting a previous error or changing strategy based on agent thoughts analysis.**
- You must also use this field to mention checkpoints when you perform actions without definite ending: for instance "Swiping up to reveal more recipes - last seen recipe was <ID or NAME>, stop when no more".
+ - **Goals Completion Reason**: Explain why marking subgoals complete based on observed evidence, or state "None".

- **Important:** `complete_subgoals_by_ids` and the structured decisions are mutually exclusive: if you provide both, the structured decisions will be ignored. Therefore, you must always prioritize completing subgoals over providing structured decisions.
+ - **Screen Analysis Prompt** _(optional)_: A specific question for visual analysis (e.g., "Is there a search icon visible?"). Leave empty if not needed.
+
+ **Important Decision Rules:**
+
+ 1. **Goal Completion + Execution Decisions**: You CAN provide both `complete_subgoals_by_ids` AND `Structured Decisions` in the same turn. This is the PREFERRED approach when:
+
+ - Agent thoughts show a previous action has ALREADY succeeded → Complete that subgoal
+ - The current screen requires new actions → Provide structured decisions
+ - **CRITICAL**: Only complete goals based on OBSERVED evidence from agent thoughts. NEVER complete goals "in advance" assuming an action will succeed.
+
+ 2. **Screen Analysis + Execution Decisions ARE MUTUALLY EXCLUSIVE**: If you provide both `screen_analysis_prompt` AND `Structured Decisions`, the execution decisions will take priority and screen analysis will be ignored. This should NEVER happen. Use screen analysis only when you need visual insights for the NEXT turn, not the current one.
+
+ 3. **Maximum Decisions Per Turn**: You can make up to 2 types of decisions simultaneously (never all 3):
+ - Complete examined subgoals (based on agent thoughts showing completion) + Execute actions on the current screen
+ - OR Complete examined subgoals + Request screen analysis (only when no execution decisions are needed)
+ - **Note:** Screen analysis and execution decisions cannot coexist—execution always takes priority if both are provided.

  ---
@@ -153,25 +172,92 @@ If you decide to act, output a **valid JSON stringified structured set of instru
  "{\"action\": \"launch_app\", \"app_name\": \"WhatsApp\"}"
  ```

- #### Agent Thought:
+ #### Decisions Reason:
+
+ > I need to launch the WhatsApp app to achieve the current subgoal. The `launch_app` tool is the most reliable method for opening applications.

- > I need to launch the WhatsApp app. I will use the `launch_app` tool to open it.
+ #### Goals Completion Reason:

- ### Exemple 2
+ > None
+
+ ### Example 2: Execution Decisions + Goal Completion

  #### Current Subgoal:

- > "Search for Alice in WhatsApp"
+ > "Send 'Hello!' to Alice on WhatsApp"
+
+ #### Context:
+
+ - **Agent thoughts history shows**: Previous turn executed `input_text` to type "Hello!" in the message field. Executor feedback confirms the text was successfully entered.
+ - **Current UI state**: The UI hierarchy shows the message "Hello!" is in the input field, and a send button with resource_id `com.whatsapp:id/send` is present.
+
+ #### Complete Subgoals By IDs:
+
+ ```text
+ ["subgoal-4-type-message"]
+ ```

  #### Structured Decisions:

  ```text
- "[{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/menuitem_search\", \"resource_id_index\": 1, \"text\": \"Search\", \"text_index\": 0, \"coordinates\": {\"x\": 880, \"y\": 150, \"width\": 120, \"height\": 120}}}]"
+ "[{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/send\", \"resource_id_index\": 0, \"coordinates\": {\"x\": 950, \"y\": 1800, \"width\": 100, \"height\": 100}}}]"
  ```

- #### Agent Thought:
+ #### Decisions Reason:
+
+ > Analysis: Agent thoughts confirm the text "Hello!" was successfully entered in the previous turn (executor feedback showed successful input). The current UI shows the message in the field and the send button is visible. I am completing the typing subgoal based on OBSERVED evidence, and tapping send to proceed. Providing full target information following the element rule.
+
+ #### Goals Completion Reason:
+
+ > Completing "type-message" subgoal because agent thoughts show the Executor successfully entered "Hello!" in the previous turn, and the current UI hierarchy confirms the text is present in the message field.
+
+ #### Screen Analysis Prompt:
+
+ ```text
+ None
+ ```
+
+ **Why this makes sense:** We're completing a goal that ALREADY happened (typing) based on observed evidence from agent thoughts, while simultaneously executing the next action (sending). We're not anticipating the send will succeed—we're only completing what has been confirmed.
+
+ ### Example 3: Screen Analysis + Goal Completion
+
+ #### Current Subgoal:
+
+ > "Verify the message was delivered to Alice"
+
+ #### Context:
+
+ - **Agent thoughts history shows**: Previous turn executed `tap` on the send button. Executor feedback confirms the tap was successful.
+ - **Current UI state**: The UI hierarchy shows we're still in the WhatsApp chat with Alice. The hierarchy contains text elements but doesn't clearly indicate delivery status.
+ - **Next step consideration**: We need visual confirmation of delivery checkmarks, which are not reliably exposed in the UI hierarchy.
+
+ #### Complete Subgoals By IDs:
+
+ ```text
+ ["subgoal-5-send-message"]
+ ```
+
+ #### Structured Decisions:
+
+ ```text
+ None
+ ```
+
+ #### Decisions Reason:
+
+ > None
+
+ #### Goals Completion Reason:
+
+ > Completing "send-message" subgoal because agent thoughts show the send button tap was executed successfully in the previous turn, and we remain in the chat screen (not an error state).
+
+ #### Screen Analysis Prompt:
+
+ ```text
+ Are there delivery checkmarks (single or double) visible next to the message "Hello!" in the chat? Describe their appearance.
+ ```

- > Analysis: No previous attempts, this is a fresh approach. I will tap the search icon to begin searching. I am providing its resource_id, coordinates, and text content to ensure the Executor can find it reliably, following the element rule.
+ **Why this makes sense:** We're completing the goal that ALREADY happened (sending the message) based on observed evidence from agent thoughts. We need screen analysis to verify delivery status for the next subgoal, but we have no execution decisions to make on the current screen. This respects the mutual exclusivity between execution decisions and screen analysis.

  ### Input
@@ -18,7 +18,6 @@ from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
  from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
  from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
- from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
  from minitap.mobile_use.utils.logger import get_logger

@@ -62,10 +61,6 @@ class CortexNode:
  for thought in state.agents_thoughts:
  messages.append(AIMessage(content=thought))

- if state.latest_screenshot_base64:
- messages.append(get_screenshot_message_for_llm(state.latest_screenshot_base64))
- logger.info("Added screenshot to context")
-
  if state.latest_ui_hierarchy:
  ui_hierarchy_dict: list[dict] = state.latest_ui_hierarchy
  ui_hierarchy_str = json.dumps(ui_hierarchy_dict, indent=2, ensure_ascii=False)
@@ -86,27 +81,47 @@ class CortexNode:
  ),
  ) # type: ignore

- is_subgoal_completed = (
- response.complete_subgoals_by_ids is not None
- and len(response.complete_subgoals_by_ids) > 0
- and (len(response.decisions) == 0 or response.decisions in ["{}", "[]", "null", ""])
- )
- if not is_subgoal_completed:
- response.complete_subgoals_by_ids = []
+ EMPTY_STRING_TOKENS = ["{}", "[]", "null", "", "None"]
+
+ if response.decisions in EMPTY_STRING_TOKENS:
+ response.decisions = None
+ if response.goals_completion_reason in EMPTY_STRING_TOKENS:
+ response.goals_completion_reason = None
+ if response.screen_analysis_prompt in EMPTY_STRING_TOKENS:
+ response.screen_analysis_prompt = None
+
+ # Enforce mutual exclusivity: screen_analysis_prompt and decisions cannot coexist
+ # If both are provided, prioritize decisions and discard screen_analysis_prompt
+ if response.decisions is not None and response.screen_analysis_prompt is not None:
+ logger.warning(
+ "Both 'decisions' and 'screen_analysis_prompt' were provided. "
+ "Prioritizing execution decisions and discarding screen analysis request."
+ )
+ response.screen_analysis_prompt = None
+
+ thought_parts = []
+ if response.decisions_reason:
+ thought_parts.append(f"Decisions reason: {response.decisions_reason}")
+ if response.goals_completion_reason:
+ thought_parts.append(f"Goals completion reason: {response.goals_completion_reason}")
+ if response.screen_analysis_prompt:
+ thought_parts.append(f"Screen analysis query: {response.screen_analysis_prompt}")
+
+ agent_thought = "\n\n".join(thought_parts)

  return await state.asanitize_update(
  ctx=self.ctx,
  update={
- "agents_thoughts": [response.agent_thought],
- "structured_decisions": response.decisions if not is_subgoal_completed else None,
- "complete_subgoals_by_ids": response.complete_subgoals_by_ids or [],
- "latest_screenshot_base64": None,
+ "agents_thoughts": [agent_thought],
+ "structured_decisions": response.decisions,
+ "complete_subgoals_by_ids": response.complete_subgoals_by_ids,
+ "screen_analysis_prompt": response.screen_analysis_prompt,
  "latest_ui_hierarchy": None,
  "focused_app_info": None,
  "device_date": None,
  # Executor related fields
  EXECUTOR_MESSAGES_KEY: [RemoveMessage(id=REMOVE_ALL_MESSAGES)],
- "cortex_last_thought": response.agent_thought,
+ "cortex_last_thought": agent_thought,
  },
  agent="cortex",
  )
@@ -2,8 +2,22 @@ from pydantic import BaseModel, Field


  class CortexOutput(BaseModel):
- decisions: str = Field(..., description="The decisions to be made. A stringified JSON object")
- agent_thought: str = Field(..., description="The agent's thought")
- complete_subgoals_by_ids: list[str] | None = Field(
- [], description="List of subgoal IDs to complete"
+ decisions: str | None = Field(
+ default=None, description="The decisions to be made. A stringified JSON object"
+ )
+ decisions_reason: str | None = Field(default=None, description="The reason for the decisions")
+ goals_completion_reason: str | None = Field(
+ default=None,
+ description="The reason for the goals completion, if there are any goals to be completed.",
+ )
+ complete_subgoals_by_ids: list[str] = Field(
+ default_factory=list, description="List of subgoal IDs to complete"
+ )
+ screen_analysis_prompt: str | None = Field(
+ default=None,
+ description=(
+ "Optional prompt for the screen_analyzer agent. "
+ "Set this if you need visual analysis of the current screen. "
+ "The screen_analyzer will take a screenshot and answer your specific question."
+ ),
  )
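
With this change every field on `CortexOutput` is optional or defaulted, so completion-only and analysis-only turns validate without a `decisions` payload. A quick sketch using the field names from the diff above (the values themselves are illustrative):

```python
from minitap.mobile_use.agents.cortex.types import CortexOutput

# Analysis-only turn: no decisions, no completed subgoals.
analysis_turn = CortexOutput(
    screen_analysis_prompt="Is there a search icon visible in the top bar?"
)
assert analysis_turn.decisions is None
assert analysis_turn.complete_subgoals_by_ids == []

# Completion + execution turn, as allowed by the updated cortex prompt.
action_turn = CortexOutput(
    decisions='[{"action": "tap", "target": {"resource_id": "com.whatsapp:id/send"}}]',
    decisions_reason="Send button is visible; tapping it to submit the typed message.",
    goals_completion_reason="Typing was confirmed by previous executor feedback.",
    complete_subgoals_by_ids=["subgoal-4-type-message"],
)
```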
@@ -50,7 +50,7 @@ Call the `tap_on_element` tool with:

  #### 📝 Text Input Best Practice

- When using the `input_text` tool:
+ When using the `focus_and_input_text` tool:

  - **Provide all available information** in the target object to identify text input element
  - `resource_id`: The resource ID of the text input element (when available)
@@ -69,11 +69,11 @@ When using the `input_text` tool:

  #### 🔄 Text Clearing Best Practice

- When you need to completely clear text from an input field, always use the clear_text tool with the correct resource_id.
+ When you need to completely clear text from an input field, always use the focus_and_clear_text tool with the correct resource_id.

  This tool automatically takes care of focusing the element (if needed), and ensuring the field is fully emptied.

- Only and if only the clear_text tool fails to clear the text, try to long press the input, select all, and call erase_one_char.
+ Only and if only the focus_and_clear_text tool fails to clear the text, try to long press the input, select all, and call erase_one_char.

  #### 🔁 Final Notes

@@ -8,7 +8,7 @@ from langchain_google_vertexai.chat_models import ChatVertexAI
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
- from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
  from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, get_tools_from_wrappers
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
  from minitap.mobile_use.utils.logger import get_logger
@@ -53,6 +53,7 @@ class ExecutorNode:
  ]

  llm = get_llm(ctx=self.ctx, name="executor")
+ llm_fallback = get_llm(ctx=self.ctx, name="executor", use_fallback=True)
  llm_bind_tools_kwargs: dict = {
  "tools": get_tools_from_wrappers(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
  }
@@ -62,8 +63,14 @@ class ExecutorNode:
  llm_bind_tools_kwargs["parallel_tool_calls"] = True

  llm = llm.bind_tools(**llm_bind_tools_kwargs)
- response = await invoke_llm_with_timeout_message(
- llm.ainvoke(messages), agent_name="Executor"
+ llm_fallback = llm_fallback.bind_tools(**llm_bind_tools_kwargs)
+ response = await with_fallback(
+ main_call=lambda: invoke_llm_with_timeout_message(
+ llm.ainvoke(messages), agent_name="Executor"
+ ),
+ fallback_call=lambda: invoke_llm_with_timeout_message(
+ llm_fallback.ainvoke(messages), agent_name="Executor (Fallback)"
+ ),
  )
  return await state.asanitize_update(
  ctx=self.ctx,
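
The `with_fallback` helper now used here (and by the Hopper, Orchestrator, and Outputter below) comes from `minitap/mobile_use/services/llm.py`, whose change (+5 -2) is not expanded in this diff. Judging from the call sites, it is roughly a try-the-primary-model-then-retry-with-the-fallback wrapper; a hedged sketch of such a helper, not the package's actual implementation:

```python
import logging
from collections.abc import Awaitable, Callable
from typing import TypeVar

logger = logging.getLogger(__name__)
T = TypeVar("T")


async def with_fallback(
    main_call: Callable[[], Awaitable[T]],
    fallback_call: Callable[[], Awaitable[T]],
) -> T:
    """Run the primary LLM call; if it raises, retry once with the fallback model."""
    try:
        return await main_call()
    except Exception as exc:  # any provider or timeout error triggers the fallback
        logger.warning("Primary LLM call failed (%s); retrying with fallback.", exc)
        return await fallback_call()
```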
@@ -1,5 +1,33 @@
  ## Hopper

- The user will send you a batch of data you must dig in order to extract the most relevant information to reach the user's goal. Keep the information as is, do not modify it since the user will trigger actions based on it.
+ The user will send you a **batch of data**. Your role is to **dig through it** and extract the most relevant information needed to reach the user's goal.

- You'll need to output the extracted information in the `output` field, and you will describe what you did in the `step` field.
+ - **Keep the extracted information exactly as it appears** in the input. Do not reformat, paraphrase, or alter it.
+ - The user may rely on this raw data for triggering actions, so fidelity matters.
+
+ ---
+
+ ### Output Fields
+
+ - **output**: the extracted information.
+ - **reason**: a short explanation of what you looked for and how you decided what to extract.
+
+ ---
+
+ ### Rules
+
+ 1. **Search thoroughly**: The data may contain hundreds of entries. Scan the entire input carefully before concluding.
+
+ 2. **Match app names to package names**: When looking for an app package, look for package names where the app name (or a close variation) appears in the package identifier. Common patterns:
+ - App name in lowercase as part of the package
+ - Company/developer name followed by app name
+ - Brand name or abbreviated form of the app name
+ - Sometimes a codename or internal name related to the app
+
+ 3. **Prefer the most direct match**: If multiple packages contain similar terms, prefer the one where the app name appears most directly in the package identifier.
+
+ 4. **Consider variations**: App names may appear in different forms (abbreviated, translated, or with slight modifications) in package names.
+
+ 5. If the relevant information is **not found**, return `None`.
+
+ 6. If multiple plausible matches exist and you cannot determine which is correct, return `None` instead of guessing.
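
Rule 2 essentially asks for a fuzzy substring match between the app name and installed package identifiers. A rough illustration of that heuristic (the package names are only examples, and in the agent the matching is performed by the LLM, not by code like this):

```python
def candidate_packages(app_name: str, packages: list[str]) -> list[str]:
    """Return package identifiers that contain the app name (or a close variation)."""
    needle = app_name.lower().replace(" ", "")
    return [pkg for pkg in packages if needle in pkg.lower().replace(".", "")]


installed = ["com.whatsapp", "com.android.settings", "org.mozilla.firefox"]
print(candidate_packages("WhatsApp", installed))  # ['com.whatsapp']
# Per rules 5-6, an empty or ambiguous result should lead the Hopper to answer None.
```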
@@ -5,17 +5,15 @@ from langchain_core.messages import HumanMessage, SystemMessage
  from pydantic import BaseModel, Field

  from minitap.mobile_use.context import MobileUseContext
- from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback


  class HopperOutput(BaseModel):
- step: str = Field(
- description=(
- "The step that has been done, must be a valid one following the "
- "current steps and the current goal to achieve."
- )
- )
  output: str = Field(description="The interesting data extracted from the input data.")
+ reason: str = Field(
+ description="A short explanation of what you looked for"
+ + " and how you decided what to extract."
+ )


  async def hopper(
@@ -32,12 +30,18 @@ async def hopper(
  HumanMessage(content=f"{request}\nHere is the data you must dig:\n{data}"),
  ]

- llm = get_llm(ctx=ctx, name="hopper", is_utils=True, temperature=0)
- structured_llm = llm.with_structured_output(HopperOutput)
- response: HopperOutput = await invoke_llm_with_timeout_message(
- structured_llm.ainvoke(messages), agent_name="Hopper"
- ) # type: ignore
- return HopperOutput(
- step=response.step,
- output=response.output,
+ llm = get_llm(ctx=ctx, name="hopper", is_utils=True, temperature=0).with_structured_output(
+ HopperOutput
  )
+ llm_fallback = get_llm(
+ ctx=ctx, name="hopper", is_utils=True, use_fallback=True, temperature=0
+ ).with_structured_output(HopperOutput)
+ response: HopperOutput = await with_fallback(
+ main_call=lambda: invoke_llm_with_timeout_message(
+ llm.ainvoke(messages), agent_name="Hopper"
+ ),
+ fallback_call=lambda: invoke_llm_with_timeout_message(
+ llm_fallback.ainvoke(messages), agent_name="Hopper (Fallback)"
+ ),
+ ) # type: ignore
+ return response
@@ -15,7 +15,7 @@ from minitap.mobile_use.agents.planner.utils import (
  )
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
- from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
  from minitap.mobile_use.utils.logger import get_logger

@@ -74,10 +74,19 @@ class OrchestratorNode:
  HumanMessage(content=human_message),
  ]

- llm = get_llm(ctx=self.ctx, name="orchestrator", temperature=1)
- llm = llm.with_structured_output(OrchestratorOutput)
- response: OrchestratorOutput = await invoke_llm_with_timeout_message(
- llm.ainvoke(messages), agent_name="Orchestrator"
+ llm = get_llm(ctx=self.ctx, name="orchestrator", temperature=1).with_structured_output(
+ OrchestratorOutput
+ )
+ llm_fallback = get_llm(
+ ctx=self.ctx, name="orchestrator", use_fallback=True, temperature=1
+ ).with_structured_output(OrchestratorOutput)
+ response: OrchestratorOutput = await with_fallback(
+ main_call=lambda: invoke_llm_with_timeout_message(
+ llm.ainvoke(messages), agent_name="Orchestrator"
+ ),
+ fallback_call=lambda: invoke_llm_with_timeout_message(
+ llm_fallback.ainvoke(messages), agent_name="Orchestrator (Fallback)"
+ ),
  ) # type: ignore
  if response.needs_replaning:
  thoughts = [response.reason]
@@ -8,7 +8,7 @@ from pydantic import BaseModel
  from minitap.mobile_use.config import OutputConfig
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
- from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
  from minitap.mobile_use.utils.conversations import is_ai_message
  from minitap.mobile_use.utils.logger import get_logger

@@ -46,7 +46,11 @@ async def outputter(
  messages.append(HumanMessage(content=output_config.output_description))

  llm = get_llm(ctx=ctx, name="outputter", is_utils=True, temperature=1)
+ llm_fallback = get_llm(
+ ctx=ctx, name="outputter", is_utils=True, use_fallback=True, temperature=1
+ )
  structured_llm = llm
+ structured_llm_fallback = llm_fallback

  if output_config.structured_output:
  schema: dict | type[BaseModel] | None = None
@@ -61,9 +65,15 @@ async def outputter(

  if schema is not None:
  structured_llm = llm.with_structured_output(schema)
+ structured_llm_fallback = llm_fallback.with_structured_output(schema)

- response = await invoke_llm_with_timeout_message(
- structured_llm.ainvoke(messages), agent_name="Outputter"
+ response = await with_fallback(
+ main_call=lambda: invoke_llm_with_timeout_message(
+ structured_llm.ainvoke(messages), agent_name="Outputter"
+ ),
+ fallback_call=lambda: invoke_llm_with_timeout_message(
+ structured_llm_fallback.ainvoke(messages), agent_name="Outputter (Fallback)"
+ ),
  ) # type: ignore
  if isinstance(response, BaseModel):
  if output_config.output_description and hasattr(response, "content"):