PyPI - minitap-mobile-use - Versions diffs - 2.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl - Mend

minitap-mobile-use 2.0.0py3-none-any.whl → 2.0.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of minitap-mobile-use might be problematic. Click here for more details.

Files changed (55) hide show

minitap/mobile_use/agents/cortex/cortex.md +17 -10
minitap/mobile_use/agents/cortex/cortex.py +12 -2
minitap/mobile_use/agents/cortex/types.py +2 -2
minitap/mobile_use/agents/executor/executor.md +16 -10
minitap/mobile_use/agents/executor/executor.py +6 -18
minitap/mobile_use/agents/executor/tool_node.py +105 -0
minitap/mobile_use/agents/hopper/hopper.md +2 -10
minitap/mobile_use/agents/hopper/hopper.py +4 -9
minitap/mobile_use/agents/orchestrator/human.md +3 -4
minitap/mobile_use/agents/orchestrator/orchestrator.md +25 -7
minitap/mobile_use/agents/orchestrator/orchestrator.py +56 -56
minitap/mobile_use/agents/orchestrator/types.py +5 -8
minitap/mobile_use/agents/planner/planner.md +14 -13
minitap/mobile_use/agents/planner/planner.py +4 -1
minitap/mobile_use/agents/planner/types.py +8 -2
minitap/mobile_use/agents/planner/utils.py +11 -0
minitap/mobile_use/clients/device_hardware_client.py +3 -0
minitap/mobile_use/config.py +2 -0
minitap/mobile_use/constants.py +1 -0
minitap/mobile_use/controllers/mobile_command_controller.py +10 -11
minitap/mobile_use/graph/graph.py +9 -31
minitap/mobile_use/graph/state.py +26 -6
minitap/mobile_use/main.py +6 -2
minitap/mobile_use/sdk/agent.py +54 -39
minitap/mobile_use/sdk/builders/agent_config_builder.py +17 -4
minitap/mobile_use/sdk/types/agent.py +5 -0
minitap/mobile_use/servers/stop_servers.py +10 -15
minitap/mobile_use/services/llm.py +1 -0
minitap/mobile_use/tools/index.py +2 -4
minitap/mobile_use/tools/mobile/back.py +7 -11
minitap/mobile_use/tools/mobile/copy_text_from.py +7 -11
minitap/mobile_use/tools/mobile/erase_text.py +7 -9
minitap/mobile_use/tools/mobile/find_packages.py +69 -0
minitap/mobile_use/tools/mobile/input_text.py +131 -32
minitap/mobile_use/tools/mobile/launch_app.py +7 -11
minitap/mobile_use/tools/mobile/long_press_on.py +7 -9
minitap/mobile_use/tools/mobile/open_link.py +7 -11
minitap/mobile_use/tools/mobile/paste_text.py +7 -11
minitap/mobile_use/tools/mobile/press_key.py +7 -11
minitap/mobile_use/tools/mobile/stop_app.py +7 -9
minitap/mobile_use/tools/mobile/swipe.py +7 -11
minitap/mobile_use/tools/mobile/take_screenshot.py +7 -11
minitap/mobile_use/tools/mobile/tap.py +7 -9
minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +7 -9
minitap/mobile_use/tools/tool_wrapper.py +1 -23
minitap/mobile_use/utils/recorder.py +11 -10
minitap/mobile_use/utils/ui_hierarchy.py +88 -1
{minitap_mobile_use-2.0.0.dist-info → minitap_mobile_use-2.0.1.dist-info}/METADATA +2 -2
minitap_mobile_use-2.0.1.dist-info/RECORD +94 -0
minitap/mobile_use/agents/executor/executor_context_cleaner.py +0 -27
minitap/mobile_use/tools/mobile/list_packages.py +0 -78
minitap/mobile_use/tools/mobile/run_flow.py +0 -57
minitap_mobile_use-2.0.0.dist-info/RECORD +0 -95
{minitap_mobile_use-2.0.0.dist-info → minitap_mobile_use-2.0.1.dist-info}/WHEEL +0 -0
{minitap_mobile_use-2.0.0.dist-info → minitap_mobile_use-2.0.1.dist-info}/entry_points.txt +0 -0

minitap/mobile_use/agents/cortex/cortex.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ## You are the **Cortex**
-Your job is to **analyze the current {{ platform }} mobile device state** and produce **structured decisions** to achieve the current subgoal.
+Your job is to **analyze the current {{ platform }} mobile device state** and produce **structured decisions** to achieve the current subgoal and more consecutive subgoals if possible.
 You must act like a human brain, responsible for giving instructions to your hands (the **Executor** agent). Therefore, you must act with the same imprecision and uncertainty as a human when performing swipe actions: humans don't know where exactly they are swiping (always prefer percentages of width and height instead of absolute coordinates), they just know they are swiping up or down, left or right, and with how much force (usually amplified compared to what's truly needed - go overboard of sliders for instance).
@@ -19,33 +19,41 @@ You are provided with:
   - The user's **initial goal**
   - The **subgoal plan** with their statuses
-  - The **current subgoal** to act on (the one in `PENDING` in the plan)
+  - The **current subgoal** (the one in `PENDING` in the plan)
   - A list of **agent thoughts** (previous reasoning, observations about the environment)
   - **Executor agent feedback** on the latest UI decisions
 ### Your Mission:
-Focus on the **current subgoal**.
+Focus on the **current PENDING subgoal and the next subgoals not yet started**.
 1. **Analyze the UI** and environment to understand what action is required.
-   2.1. If the **subgoal is completed**, set the `complete_subgoal` field to `True`. To justify your conclusion, you will fill in the `agent_thought` field based on:
+2.1. If some of the subgoals must be **completed** based on your observations, add them to `complete_subgoals_by_ids`. To justify your conclusion, you will fill in the `agent_thought` field based on:
 - The current UI state
 - Past agent thoughts
 - Recent tool effects
-  2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
-- These must be **concrete low-level actions**: back,tap, swipe, launch app, list packages, close app, input text, paste, erase, text, copy, etc.
+2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
+- These must be **concrete low-level actions**: back, tap, swipe, launch app, find packages, close app, input text, paste, erase text, copy, etc.
+- Your goal is to achieve subgoals **fast** - so you must put as many actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
+- When you need to open an app, use the `find_packages` low-level action to try and get its name.
 - If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `text: "Alice"`, `x: 100, y: 200`).
 - **The structure is up to you**, but it must be valid **JSON stringified output**. You will accompany this output with a **natural-language summary** of your reasoning and approach in your agent thought.
+- **Never use a sequence of `tap` + `input_text` to type into a field. Always use a single `input_text` action** with the correct `resource_id` (this already ensures the element is focused and the cursor is moved to the end).
 - When you want to launch/stop an app, prefer using its package name.
 - **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed**.
 - **For text clearing**: When you need to completely clear text from an input field, always use **LONG PRESS** first to select the text field, then erase. Do NOT use tap + erase as this only clears from cursor position.
 ### Output
-- **Structured Decisions**:
-  A **valid stringified JSON** describing what should be executed **right now** to advance the current subgoal **IF THE SUBGOAL IS NOT COMPLETED**.
+- **complete_subgoals_by_ids** _(optional)_:
+  A list of subgoal IDs that should be marked as completed.
+- **Structured Decisions** _(optional)_:
+  A **valid stringified JSON** describing what should be executed **right now** to advance through the subgoals as much as possible.
 - **Agent Thought** _(1-2 sentences)_:
   If there is any information you need to remember for later steps, you must include it here, because only the agent thoughts will be used to produce the final structured output.
@@ -53,8 +61,7 @@ Focus on the **current subgoal**.
   This also helps other agents understand your decision and learn from future failures.
   You must also use this field to mention checkpoints when you perform actions without definite ending: for instance "Swiping up to reveal more recipes - last seen recipe was <ID or NAME>, stop when no more".
-- **Subgoal Completion** _(boolean)_:
-  Set to true if the current subgoal has been successfully completed - you **cannot set it to true and provide structured decisions at the same time**. You must base your decision ONLY on what you have as input (device state, agent thoughts, executor feedback, etc) - NEVER based on the decisions you have produced.
+**Important:** `complete_subgoals_by_ids` and the structured decisions are mutually exclusive: if you provide both, the structured decisions will be ignored. Therefore, you must always prioritize completing subgoals over providing structured decisions.
 ---

minitap/mobile_use/agents/cortex/cortex.py CHANGED Viewed

@@ -12,6 +12,7 @@ from langchain_core.messages import (
 from langgraph.graph.message import REMOVE_ALL_MESSAGES
 from minitap.mobile_use.agents.cortex.types import CortexOutput
 from minitap.mobile_use.agents.planner.utils import get_current_subgoal
+from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
 from minitap.mobile_use.context import MobileUseContext
 from minitap.mobile_use.graph.state import State
 from minitap.mobile_use.services.llm import get_llm, with_fallback
@@ -79,20 +80,29 @@ class CortexNode:
             fallback_call=lambda: llm_fallback.ainvoke(messages),
         )  # type: ignore
-        is_subgoal_completed = response.complete_current_subgoal
+        is_subgoal_completed = (
+            response.complete_subgoals_by_ids is not None
+            and len(response.complete_subgoals_by_ids) > 0
+            and len(response.decisions) == 0
+        )
+        if not is_subgoal_completed:
+            response.complete_subgoals_by_ids = []
         return state.sanitize_update(
             ctx=self.ctx,
             update={
                 "agents_thoughts": [response.agent_thought],
                 "structured_decisions": response.decisions if not is_subgoal_completed else None,
+                "complete_subgoals_by_ids": response.complete_subgoals_by_ids or [],
                 "latest_screenshot_base64": None,
                 "latest_ui_hierarchy": None,
                 "focused_app_info": None,
                 "device_date": None,
                 # Executor related fields
-                "executor_messages": [RemoveMessage(id=REMOVE_ALL_MESSAGES)],
+                EXECUTOR_MESSAGES_KEY: [RemoveMessage(id=REMOVE_ALL_MESSAGES)],
                 "cortex_last_thought": response.agent_thought,
             },
+            agent="cortex",
         )

minitap/mobile_use/agents/cortex/types.py CHANGED Viewed

@@ -6,6 +6,6 @@ from pydantic import BaseModel, Field
 class CortexOutput(BaseModel):
     decisions: str = Field(..., description="The decisions to be made. A stringified JSON object")
     agent_thought: str = Field(..., description="The agent's thought")
-    complete_current_subgoal: Optional[bool] = Field(
-        False, description="Whether the current subgoal is complete"
+    complete_subgoals_by_ids: Optional[list[str]] = Field(
+        [], description="List of subgoal IDs to complete"
     )

minitap/mobile_use/agents/executor/executor.md CHANGED Viewed

@@ -5,21 +5,17 @@ Your job is to **interpret the structured decisions** provided by the **Cortex**
 ### 🎯 Your Objective:
 Given the `structured_decisions` (a stringified object) from the **Cortex** agent
-and the previous tool calls, you must:
+and your previous actions, you must:
 1. **Parse the structured decisions** into usable Python objects.
-2. **Determine the most appropriate tool** to execute the intended action - **you can ONLY USE ONE**
-3. **Invoke tool accurately**, passing the required parameters.
-4. For **the tool you invoke**, always provide a clear `agent_thought` argument:
+2. **Determine the appropriate tools** to execute the intended action - **the order of the tools you return is the order in which they will be executed**
+3. **Invoke tools accurately**, passing the required parameters.
+4. For **each tool you invoke**, always provide a clear `agent_thought` argument:
    - This is a natural-language sentence (or two) **explaining why** this tool is being invoked.
    - Keep it short but informative.
    - This is essential for debugging, traceability, and adaptation by other agents.
-5. For **the tool you invoke**, always provide the `executor_metadata` argument:
-   - If you know you won't be able to achieve all Cortex decisions using the tool call you've chosen, set `retrigger` to `true` - otherwise set it to `false`
 ---
 ### 🧠 Example
@@ -45,17 +41,27 @@ Call the `tap_on_element` tool with:
 - `resource_id = "com.whatsapp:id/conversation_item"`
 - `text = "Alice"`
 - `agent_thought = "I'm tapping on the chat item labeled 'Alice' to open the conversation."`
-- `executor_metadata = {"retrigger": false}`
 ---
 ### ⚙️ Tools
-- Tools may include actions like: `tap`, `swipe`, `start_app`, `stop_app`, `list_packages`, `get_current_focus`, etc.
+- Tools may include actions like: `tap`, `swipe`, `start_app`, `stop_app`, `find_packages`, `get_current_focus`, etc.
 - You **must not hardcode tool definitions** here.
 - Just use the right tool based on what the `structured_decisions` requires.
 - The tools are provided dynamically via LangGraph's tool binding mechanism.
+#### 📝 Text Input Best Practice
+When using the `input_text` tool:
+- **Always provide the `resource_id` of the element** you want to type into.
+- The tool will automatically:
+  1. **Focus the element first**
+  2. **Move the cursor to the end** of the existing text
+  3. **Then type the new text**
 #### 🔄 Text Clearing Best Practice
 When you need to completely clear text from an input field, **DO NOT** simply use `erase_text` alone, as it only erases from the cursor position, backward. Instead:

minitap/mobile_use/agents/executor/executor.py CHANGED Viewed

@@ -2,8 +2,8 @@ from pathlib import Path
 from jinja2 import Template
 from langchain_core.messages import HumanMessage, SystemMessage
-from langchain_core.messages.ai import AIMessage
 from langchain_google_genai import ChatGoogleGenerativeAI
+from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
 from minitap.mobile_use.context import MobileUseContext
 from minitap.mobile_use.graph.state import State
 from minitap.mobile_use.services.llm import get_llm
@@ -34,20 +34,9 @@ class ExecutorNode:
                         "No structured decisions found, I cannot execute anything."
                     ],
                 },
+                agent="executor",
             )
-        if len(state.executor_messages) > 0 and isinstance(state.executor_messages[-1], AIMessage):
-            if len(state.executor_messages[-1].tool_calls) > 0:  # type: ignore
-                # A previous tool call raised an uncaught exception while retrigerring the executor
-                return state.sanitize_update(
-                    ctx=self.ctx,
-                    update={
-                        "executor_retrigger": False,
-                        "executor_failed": True,
-                        "executor_messages": [state.messages[-1]],
-                    },
-                )
         system_message = Template(
             Path(__file__).parent.joinpath("executor.md").read_text(encoding="utf-8")
         ).render(platform=self.ctx.device.mobile_platform.value)
@@ -62,14 +51,13 @@ class ExecutorNode:
         ]
         llm = get_llm(ctx=self.ctx, name="executor")
-        llm_bind_tools_kwargs = {
+        llm_bind_tools_kwargs: dict = {
             "tools": get_tools_from_wrappers(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
-            "tool_choice": "auto",  # automatically select a tool call or none
         }
         # ChatGoogleGenerativeAI does not support the "parallel_tool_calls" keyword
         if not isinstance(llm, ChatGoogleGenerativeAI):
-            llm_bind_tools_kwargs["parallel_tool_calls"] = False
+            llm_bind_tools_kwargs["parallel_tool_calls"] = True
         llm = llm.bind_tools(**llm_bind_tools_kwargs)
         response = await llm.ainvoke(messages)
@@ -78,7 +66,7 @@ class ExecutorNode:
             ctx=self.ctx,
             update={
                 "cortex_last_thought": cortex_last_thought,
-                "executor_messages": [response],
-                "messages": [response],
+                EXECUTOR_MESSAGES_KEY: [response],
             },
+            agent="executor",
         )

minitap/mobile_use/agents/executor/tool_node.py ADDED Viewed

@@ -0,0 +1,105 @@
+import asyncio
+from typing import Any, Optional
+from langgraph.types import Command
+from pydantic import BaseModel
+from typing_extensions import override
+from langchain_core.runnables import RunnableConfig
+from langgraph.store.base import BaseStore
+from langchain_core.messages import AnyMessage, ToolCall, ToolMessage
+from langgraph.prebuilt import ToolNode
+class ExecutorToolNode(ToolNode):
+    """
+    ToolNode that runs tool calls one after the other - not simultaneously.
+    If one error occurs, the remaining tool calls are aborted!
+    """
+    @override
+    async def _afunc(
+        self,
+        input: list[AnyMessage] | dict[str, Any] | BaseModel,
+        config: RunnableConfig,
+        *,
+        store: Optional[BaseStore],
+    ):
+        return await self.__func(is_async=True, input=input, config=config, store=store)
+    @override
+    def _func(
+        self,
+        input: list[AnyMessage] | dict[str, Any] | BaseModel,
+        config: RunnableConfig,
+        *,
+        store: Optional[BaseStore],
+    ) -> Any:
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.__func(is_async=False, input=input, config=config, store=store)
+        )
+    async def __func(
+        self,
+        is_async: bool,
+        input: list[AnyMessage] | dict[str, Any] | BaseModel,
+        config: RunnableConfig,
+        *,
+        store: Optional[BaseStore],
+    ) -> Any:
+        tool_calls, input_type = self._parse_input(input, store)
+        outputs: list[Command | ToolMessage] = []
+        failed = False
+        for call in tool_calls:
+            if failed:
+                output = self._get_erroneous_command(
+                    call=call,
+                    message="Aborted: a previous tool call failed!",
+                )
+            else:
+                if is_async:
+                    output = await self._arun_one(call, input_type, config)
+                else:
+                    output = self._run_one(call, input_type, config)
+                failed = self._has_tool_call_failed(call, output)
+                if failed is None:
+                    output = self._get_erroneous_command(
+                        call=call,
+                        message=f"Unexpected tool output type: {type(output)}",
+                    )
+                    failed = True
+            outputs.append(output)
+        return self._combine_tool_outputs(outputs, input_type)  # type: ignore
+    def _has_tool_call_failed(
+        self,
+        call: ToolCall,
+        output: ToolMessage | Command,
+    ) -> Optional[bool]:
+        if isinstance(output, ToolMessage):
+            return output.status == "error"
+        if isinstance(output, Command):
+            output_msg = self._get_tool_message(output)
+            return output_msg.status == "error"
+        return None
+    def _get_erroneous_command(self, call: ToolCall, message: str) -> Command:
+        tool_message = ToolMessage(
+            name=call["name"], tool_call_id=call["id"], content=message, status="error"
+        )
+        return Command(update={self.messages_key: [tool_message]})
+    def _get_tool_message(self, cmd: Command) -> ToolMessage:
+        if isinstance(cmd.update, dict):
+            msg = cmd.update.get(self.messages_key)
+            if isinstance(msg, list):
+                if len(msg) == 0:
+                    raise ValueError("No messages found in command update")
+                if not isinstance(msg[-1], ToolMessage):
+                    raise ValueError("Last message in command update is not a tool message")
+                return msg[-1]
+            elif isinstance(msg, ToolMessage):
+                return msg
+            elif msg is None:
+                raise ValueError(f"Missing '{self.messages_key}' in command update")
+            raise ValueError(f"Unexpected message type in command update: {type(msg)}")
+        raise ValueError("Command update is not a dict")

minitap/mobile_use/agents/hopper/hopper.md CHANGED Viewed

@@ -1,13 +1,5 @@
 ## Hopper
-Your goal is to analyze the input data and to pick only the most relevant information based on the current steps. We aim to reach the goal defined by the user as : {{ initial_goal }}
+The user will send you a batch of data you must dig in order to extract the most relevant information to reach the user's goal. Keep the information as is, do not modify it since the user will trigger actions based on it.
-### Input
-You have the list of steps we've done so far. We use those steps to track our progress to reach our goal. Here they are : {{ messages }}
-Finally, here is the data that we receive form executing the last task. We will dig this data to pick only the most relevant information to reach our goal. Keep the information as is, do not modify it since we will trigger actions based on it. Output this information in the output field, and you will describe what you did in the step field.
-Here is the data you must dig :
-{{ data }}
+You'll need to output the extracted information in the `output` field, and you will describe what you did in the `step` field.

minitap/mobile_use/agents/hopper/hopper.py CHANGED Viewed

@@ -1,8 +1,7 @@
 from pathlib import Path
-from typing import Sequence
 from jinja2 import Template
-from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
+from langchain_core.messages import HumanMessage, SystemMessage
 from minitap.mobile_use.context import MobileUseContext
 from minitap.mobile_use.services.llm import get_llm
 from pydantic import BaseModel, Field
@@ -20,20 +19,16 @@ class HopperOutput(BaseModel):
 async def hopper(
     ctx: MobileUseContext,
-    initial_goal: str,
-    messages: Sequence[BaseMessage],
+    request: str,
     data: str,
 ) -> HopperOutput:
     print("Starting Hopper Agent", flush=True)
     system_message = Template(
         Path(__file__).parent.joinpath("hopper.md").read_text(encoding="utf-8")
-    ).render(
-        initial_goal=initial_goal,
-        messages=messages,
-    )
+    ).render()
     messages = [
         SystemMessage(content=system_message),
-        HumanMessage(content=data),
+        HumanMessage(content=f"{request}\nHere is the data you must dig:\n{data}"),
     ]
     llm = get_llm(ctx=ctx, name="hopper", is_utils=True, temperature=0)

minitap/mobile_use/agents/orchestrator/human.md CHANGED Viewed

@@ -1,13 +1,12 @@
-Here is your input.
----
+Here is the input for your analysis:
 **Initial goal** : {{ initial_goal }}
 **Subgoal plan**
 {{ subgoal_plan }}
-**Current subgoal** : {{ current_subgoal }}
+**Subgoals to examine (provided by the Cortex)**
+{{ subgoals_to_examine }}
 **Agent thoughts**
 {{ agent_thoughts }}

minitap/mobile_use/agents/orchestrator/orchestrator.md CHANGED Viewed

@@ -1,18 +1,36 @@
 You are the **Orchestrator**.
-Your role is to **decide what to do next**, based on the current execution state of a plan running on an **{{ platform }} mobile device**. You must assess the situation and choose between resuming, continuing, or replanning.
+Your role is to **decide what to do next**, based on the current execution state of a plan running on an **{{ platform }} mobile device**. You must assess the situation and determine whether the provided subgoals have been completed, or if they need to remain pending.
+Based on the input data, you must also determine if the subgoal plan must be replanned.
 ### Responsibilities
-You're given:
+You will be given:
 - The current **subgoal plan**
-- The current **subgoal** (which is marked as **PENDING** in the plan, but repeated here for your convenience)
+- The **subgoals to examine** (which are marked as **PENDING** and **NOT STARTED** in the plan)
 - A list of **agent thoughts** (insights, obstacles, or reasoning gathered during execution)
 - The original **initial goal**
-You must then **choose what to do next**:
+You must then:
-- `"resume"`: The current subgoal is clearly not finished, let's resume it. The status of the current subgoal will stay as `PENDING`.
-- `"continue"`: Move to the next subgoal in the list. The current subgoal will be marked as `SUCCESS`. If the current subgoal is the final step of the plan: The "reason" field must contain the final answer to the user’s initial goal. If the current subgoal is not the final step: The "reason" field must explain why this subgoal is now considered complete before moving on.
-- `"replan"`: The current plan no longer fits : the current subgoal will be marked as `FAILURE`. we need to define a new plan.
+1. For **each subgoal to examine provided by the user** (not all subgoals):
+    - if it's clearly finished and can be marked as complete, regardless of whether it was started or not -> add its ID to `completed_subgoal_ids`
+    Then fill the `reason` field with:
+    - the final answer to the initial goal if all subgoals are expected to be completed, OR
+    - an explanation of your decisions for the report.
+2. Set `needs_replaning` to `TRUE` if the current plan no longer fits because of repeated failed attempts. In that case, the current subgoal will be marked as `FAILURE`, and a new plan will be defined. Explain in the `reason` field why the plan no longer fits.
+### Agent Roles & Thought Ownership
+All thoughts belong to the specific agent that generated them. There are four collaborating agents:
+- **Orchestrator (You):** Coordinates the entire process. Decides what to do next based on the execution state and whether the plan needs replanning.
+- **Planner:** Designs the subgoal plan and updates it when necessary (replanning). Does not execute actions.
+- **Cortex (Brain & Eyes):** It does not directly interact with the device, but it has full awareness of the screen state. Its role is to reason about this state and determine the next actions (e.g., tap, swipe, scroll) required to advance through the plan.
+- **Executor (Hands):** It executes the Cortex’s chosen actions on the device.
+The cortex has the ability to complete multiple subgoals (the PENDING one and NOT STARTED ones), which are the ones you'll need to examine. Although the plan should normally be completed in order - this is not a strict requirement based on the context.
+In its agent thoughts, the cortex may talk as if it were the one taking the action (e.g. "Tapping the button", ...) - but remember that only the executor can interact with the device.

minitap/mobile_use/agents/orchestrator/orchestrator.py CHANGED Viewed

@@ -3,12 +3,13 @@ from pathlib import Path
 from jinja2 import Template
 from langchain_core.messages import HumanMessage, SystemMessage
-from minitap.mobile_use.agents.orchestrator.types import OrchestratorOutput, OrchestratorStatus
+from minitap.mobile_use.agents.orchestrator.types import OrchestratorOutput
 from minitap.mobile_use.agents.planner.utils import (
     all_completed,
-    complete_current_subgoal,
+    complete_subgoals_by_ids,
     fail_current_subgoal,
     get_current_subgoal,
+    get_subgoals_by_ids,
     nothing_started,
     start_next_subgoal,
 )
@@ -31,24 +32,27 @@ class OrchestratorNode:
         on_failure=lambda _: logger.error("Orchestrator Agent"),
     )
     async def __call__(self, state: State):
-        if nothing_started(state.subgoal_plan):
+        no_subgoal_started = nothing_started(state.subgoal_plan)
+        current_subgoal = get_current_subgoal(state.subgoal_plan)
+        if no_subgoal_started or not current_subgoal:
             state.subgoal_plan = start_next_subgoal(state.subgoal_plan)
             new_subgoal = get_current_subgoal(state.subgoal_plan)
-            return state.sanitize_update(
-                ctx=self.ctx,
-                update={
-                    "agents_thoughts": [f"Starting the first subgoal: {new_subgoal}"],
-                    "subgoal_plan": state.subgoal_plan,
-                },
-            )
-        current_subgoal = get_current_subgoal(state.subgoal_plan)
+            thoughts = [
+                (
+                    f"Starting the first subgoal: {new_subgoal}"
+                    if no_subgoal_started
+                    else f"Starting the next subgoal: {new_subgoal}"
+                )
+            ]
+            return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
-        if not current_subgoal:
-            return state.sanitize_update(
-                ctx=self.ctx,
-                update={"agents_thoughts": ["No subgoal to go for."]},
-            )
+        subgoals_to_examine = get_subgoals_by_ids(
+            subgoals=state.subgoal_plan,
+            ids=state.complete_subgoals_by_ids,
+        )
+        if len(subgoals_to_examine) <= 0:
+            return _get_state_update(ctx=self.ctx, state=state, thoughts=["No subgoal to examine."])
         system_message = Template(
             Path(__file__).parent.joinpath("orchestrator.md").read_text(encoding="utf-8")
@@ -58,7 +62,7 @@ class OrchestratorNode:
         ).render(
             initial_goal=state.initial_goal,
             subgoal_plan="\n".join(str(s) for s in state.subgoal_plan),
-            current_subgoal=str(current_subgoal),
+            subgoals_to_examine="\n".join(str(s) for s in subgoals_to_examine),
             agent_thoughts="\n".join(state.agents_thoughts),
         )
         messages = [
@@ -70,45 +74,41 @@ class OrchestratorNode:
         llm = llm.with_structured_output(OrchestratorOutput)
         response: OrchestratorOutput = await llm.ainvoke(messages)  # type: ignore
-        if response.status == OrchestratorStatus.CONTINUE:
-            state.subgoal_plan = complete_current_subgoal(state.subgoal_plan)
-            thoughts = [response.reason]
-            if all_completed(state.subgoal_plan):
-                logger.success("All the subgoals have been completed successfully.")
-                return state.sanitize_update(
-                    ctx=self.ctx,
-                    update={
-                        "subgoal_plan": state.subgoal_plan,
-                        "agents_thoughts": thoughts,
-                    },
-                )
-            state.subgoal_plan = start_next_subgoal(state.subgoal_plan)
-            new_subgoal = get_current_subgoal(state.subgoal_plan)
-            thoughts.append(f"==== NEXT SUBGOAL: {new_subgoal} ====")
-            return state.sanitize_update(
-                ctx=self.ctx,
-                update={
-                    "agents_thoughts": thoughts,
-                    "subgoal_plan": state.subgoal_plan,
-                },
-            )
-        elif response.status == OrchestratorStatus.REPLAN:
+        if response.needs_replaning:
             thoughts = [response.reason]
             state.subgoal_plan = fail_current_subgoal(state.subgoal_plan)
             thoughts.append("==== END OF PLAN, REPLANNING ====")
-            return state.sanitize_update(
-                ctx=self.ctx,
-                update={
-                    "agents_thoughts": thoughts,
-                    "subgoal_plan": state.subgoal_plan,
-                },
-            )
-        return state.sanitize_update(
-            ctx=self.ctx,
-            update={
-                "agents_thoughts": [response.reason],
-            },
+            return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
+        state.subgoal_plan = complete_subgoals_by_ids(
+            subgoals=state.subgoal_plan,
+            ids=response.completed_subgoal_ids,
         )
+        thoughts = [response.reason]
+        if all_completed(state.subgoal_plan):
+            logger.success("All the subgoals have been completed successfully.")
+            return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
+        if current_subgoal.id not in response.completed_subgoal_ids:
+            # The current subgoal is not yet complete.
+            return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
+        state.subgoal_plan = start_next_subgoal(state.subgoal_plan)
+        new_subgoal = get_current_subgoal(state.subgoal_plan)
+        thoughts.append(f"==== NEXT SUBGOAL: {new_subgoal} ====")
+        return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
+def _get_state_update(
+    ctx: MobileUseContext,
+    state: State,
+    thoughts: list[str],
+    update_plan: bool = False,
+):
+    update = {
+        "agents_thoughts": thoughts,
+        "complete_subgoals_by_ids": [],
+    }
+    if update_plan:
+        update["subgoal_plan"] = state.subgoal_plan
+    return state.sanitize_update(ctx=ctx, update=update, agent="orchestrator")

minitap/mobile_use/agents/orchestrator/types.py CHANGED Viewed

@@ -1,14 +1,11 @@
-from enum import Enum
+from typing import Annotated
 from pydantic import BaseModel
-class OrchestratorStatus(Enum):
-    CONTINUE = "continue"
-    RESUME = "resume"
-    REPLAN = "replan"
 class OrchestratorOutput(BaseModel):
-    status: OrchestratorStatus
+    completed_subgoal_ids: Annotated[
+        list[str], "IDs of subgoals that can now be marked as complete"
+    ] = []
+    needs_replaning: Annotated[bool, "Whether the orchestrator needs to replan the subgoal plan"]
     reason: str

minitap-mobile-use 2.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

Potentially problematic release.

minitap-mobile-use 2.0.0py3-none-any.whl → 2.0.1py3-none-any.whl