minitap-mobile-use 2.0.1__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of minitap-mobile-use might be problematic. Click here for more details.

Files changed (62) hide show
  1. minitap/mobile_use/agents/cortex/cortex.md +7 -5
  2. minitap/mobile_use/agents/cortex/cortex.py +4 -1
  3. minitap/mobile_use/agents/cortex/types.py +1 -3
  4. minitap/mobile_use/agents/executor/executor.md +4 -5
  5. minitap/mobile_use/agents/executor/executor.py +3 -1
  6. minitap/mobile_use/agents/executor/tool_node.py +6 -6
  7. minitap/mobile_use/agents/outputter/outputter.py +1 -2
  8. minitap/mobile_use/agents/planner/planner.md +11 -2
  9. minitap/mobile_use/agents/planner/planner.py +7 -2
  10. minitap/mobile_use/agents/planner/types.py +3 -4
  11. minitap/mobile_use/agents/summarizer/summarizer.py +2 -1
  12. minitap/mobile_use/config.py +31 -16
  13. minitap/mobile_use/context.py +3 -4
  14. minitap/mobile_use/controllers/mobile_command_controller.py +36 -24
  15. minitap/mobile_use/controllers/platform_specific_commands_controller.py +3 -4
  16. minitap/mobile_use/graph/graph.py +1 -0
  17. minitap/mobile_use/graph/state.py +9 -9
  18. minitap/mobile_use/main.py +7 -8
  19. minitap/mobile_use/sdk/agent.py +25 -26
  20. minitap/mobile_use/sdk/builders/agent_config_builder.py +9 -10
  21. minitap/mobile_use/sdk/builders/task_request_builder.py +9 -9
  22. minitap/mobile_use/sdk/examples/smart_notification_assistant.py +1 -2
  23. minitap/mobile_use/sdk/types/agent.py +5 -5
  24. minitap/mobile_use/sdk/types/task.py +19 -18
  25. minitap/mobile_use/sdk/utils.py +4 -3
  26. minitap/mobile_use/servers/config.py +1 -2
  27. minitap/mobile_use/servers/device_hardware_bridge.py +3 -4
  28. minitap/mobile_use/servers/start_servers.py +4 -4
  29. minitap/mobile_use/servers/stop_servers.py +2 -3
  30. minitap/mobile_use/services/llm.py +24 -6
  31. minitap/mobile_use/tools/index.py +26 -14
  32. minitap/mobile_use/tools/mobile/back.py +1 -1
  33. minitap/mobile_use/tools/mobile/clear_text.py +277 -0
  34. minitap/mobile_use/tools/mobile/copy_text_from.py +1 -1
  35. minitap/mobile_use/tools/mobile/erase_one_char.py +56 -0
  36. minitap/mobile_use/tools/mobile/find_packages.py +1 -1
  37. minitap/mobile_use/tools/mobile/input_text.py +4 -80
  38. minitap/mobile_use/tools/mobile/launch_app.py +1 -1
  39. minitap/mobile_use/tools/mobile/long_press_on.py +2 -4
  40. minitap/mobile_use/tools/mobile/open_link.py +1 -1
  41. minitap/mobile_use/tools/mobile/paste_text.py +1 -1
  42. minitap/mobile_use/tools/mobile/press_key.py +1 -1
  43. minitap/mobile_use/tools/mobile/stop_app.py +2 -4
  44. minitap/mobile_use/tools/mobile/swipe.py +107 -9
  45. minitap/mobile_use/tools/mobile/take_screenshot.py +1 -1
  46. minitap/mobile_use/tools/mobile/tap.py +2 -4
  47. minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +2 -4
  48. minitap/mobile_use/tools/tool_wrapper.py +6 -1
  49. minitap/mobile_use/tools/utils.py +86 -0
  50. minitap/mobile_use/utils/cli_helpers.py +1 -2
  51. minitap/mobile_use/utils/cli_selection.py +5 -6
  52. minitap/mobile_use/utils/decorators.py +21 -20
  53. minitap/mobile_use/utils/logger.py +3 -4
  54. minitap/mobile_use/utils/media.py +1 -1
  55. minitap/mobile_use/utils/recorder.py +2 -9
  56. minitap/mobile_use/utils/ui_hierarchy.py +13 -5
  57. {minitap_mobile_use-2.0.1.dist-info → minitap_mobile_use-2.2.0.dist-info}/METADATA +35 -5
  58. minitap_mobile_use-2.2.0.dist-info/RECORD +96 -0
  59. minitap/mobile_use/tools/mobile/erase_text.py +0 -122
  60. minitap_mobile_use-2.0.1.dist-info/RECORD +0 -94
  61. {minitap_mobile_use-2.0.1.dist-info → minitap_mobile_use-2.2.0.dist-info}/WHEEL +0 -0
  62. {minitap_mobile_use-2.0.1.dist-info → minitap_mobile_use-2.2.0.dist-info}/entry_points.txt +0 -0
@@ -35,17 +35,19 @@ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
35
35
  - Past agent thoughts
36
36
  - Recent tool effects
37
37
 
38
- 2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
38
+ 2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
39
39
 
40
- - These must be **concrete low-level actions**: back, tap, swipe, launch app, find packages, close app, input text, paste, erase text, copy, etc.
41
- - Your goal is to achieve subgoals **fast** - so you must put as much actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
42
- - When you need to open an app, use the `find_packages` low-level action to try and get its name.
40
+ - These must be **concrete low-level actions**.
41
+ - The executor has the following available tools: {{ executor_tools_list }}.
42
+ - Your goal is to achieve subgoals **fast** - so you must put as many actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
43
+ - To open URLs/links directly, use the `open_link` tool - it will automatically handle opening in the appropriate browser. It also handles deep links.
44
+ - When you need to open an app, use the `find_packages` low-level action to try and get its name. Then, simply use the `launch_app` low-level action to launch it.
43
45
  - If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `text: "Alice"`, `x: 100, y: 200`).
44
46
  - **The structure is up to you**, but it must be valid **JSON stringified output**. You will accompany this output with a **natural-language summary** of your reasoning and approach in your agent thought.
45
47
  - **Never use a sequence of `tap` + `input_text` to type into a field. Always use a single `input_text` action** with the correct `resource_id` (this already ensures the element is focused and the cursor is moved to the end).
46
48
  - When you want to launch/stop an app, prefer using its package name.
47
49
  - **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed**.
48
- - **For text clearing**: When you need to completely clear text from an input field, always use **LONG PRESS** first to select the text field, then erase. Do NOT use tap + erase as this only clears from cursor position.
50
+ - **For text clearing**: When you need to completely clear text from an input field, always call the `clear_text` tool with the correct resource_id. This tool automatically focuses the element, and ensures the field is emptied. If you notice this tool fails to clear the text, try to long press the input, select all, and call `erase_one_char`.
49
51
 
50
52
  ### Output
51
53
 
@@ -10,12 +10,14 @@ from langchain_core.messages import (
10
10
  ToolMessage,
11
11
  )
12
12
  from langgraph.graph.message import REMOVE_ALL_MESSAGES
13
+
13
14
  from minitap.mobile_use.agents.cortex.types import CortexOutput
14
15
  from minitap.mobile_use.agents.planner.utils import get_current_subgoal
15
16
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
16
17
  from minitap.mobile_use.context import MobileUseContext
17
18
  from minitap.mobile_use.graph.state import State
18
19
  from minitap.mobile_use.services.llm import get_llm, with_fallback
20
+ from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
19
21
  from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
20
22
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
21
23
  from minitap.mobile_use.utils.logger import get_logger
@@ -44,6 +46,7 @@ class CortexNode:
44
46
  current_subgoal=get_current_subgoal(state.subgoal_plan),
45
47
  agents_thoughts=state.agents_thoughts,
46
48
  executor_feedback=executor_feedback,
49
+ executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
47
50
  )
48
51
  messages = [
49
52
  SystemMessage(content=system_message),
@@ -83,7 +86,7 @@ class CortexNode:
83
86
  is_subgoal_completed = (
84
87
  response.complete_subgoals_by_ids is not None
85
88
  and len(response.complete_subgoals_by_ids) > 0
86
- and len(response.decisions) == 0
89
+ and (len(response.decisions) == 0 or response.decisions in ["{}", "[]", "null", ""])
87
90
  )
88
91
  if not is_subgoal_completed:
89
92
  response.complete_subgoals_by_ids = []
@@ -1,11 +1,9 @@
1
- from typing import Optional
2
-
3
1
  from pydantic import BaseModel, Field
4
2
 
5
3
 
6
4
  class CortexOutput(BaseModel):
7
5
  decisions: str = Field(..., description="The decisions to be made. A stringified JSON object")
8
6
  agent_thought: str = Field(..., description="The agent's thought")
9
- complete_subgoals_by_ids: Optional[list[str]] = Field(
7
+ complete_subgoals_by_ids: list[str] | None = Field(
10
8
  [], description="List of subgoal IDs to complete"
11
9
  )
@@ -64,14 +64,13 @@ When using the `input_text` tool:
64
64
 
65
65
  #### 🔄 Text Clearing Best Practice
66
66
 
67
- When you need to completely clear text from an input field, **DO NOT** simply use `erase_text` alone, as it only erases from the cursor position, backward. Instead:
67
+ When you need to completely clear text from an input field, always use the clear_text tool with the correct resource_id.
68
68
 
69
- 1. **Use `long_press_on` first** to select the text field and bring up selection options
70
- 2. **Then use `erase_text`** to clear the selected content
69
+ This tool automatically takes care of focusing the element (if needed), and ensuring the field is fully emptied.
71
70
 
72
- This approach ensures the **entire text content** is removed, not just the portion before the cursor position. The long press will typically select all text in the field, making the subsequent erase operation more effective.
71
+ If and only if the clear_text tool fails to clear the text, try to long press the input, select all, and call erase_one_char.
73
72
 
74
- ### 🔁 Final Notes
73
+ #### 🔁 Final Notes
75
74
 
76
75
  - **You do not need to reason or decide strategy** — that's the Cortex's job.
77
76
  - You simply interpret and execute — like hands following the brain.
@@ -3,6 +3,8 @@ from pathlib import Path
3
3
  from jinja2 import Template
4
4
  from langchain_core.messages import HumanMessage, SystemMessage
5
5
  from langchain_google_genai import ChatGoogleGenerativeAI
6
+ from langchain_google_vertexai.chat_models import ChatVertexAI
7
+
6
8
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
7
9
  from minitap.mobile_use.context import MobileUseContext
8
10
  from minitap.mobile_use.graph.state import State
@@ -56,7 +58,7 @@ class ExecutorNode:
56
58
  }
57
59
 
58
60
  # ChatGoogleGenerativeAI does not support the "parallel_tool_calls" keyword
59
- if not isinstance(llm, ChatGoogleGenerativeAI):
61
+ if not isinstance(llm, ChatGoogleGenerativeAI | ChatVertexAI):
60
62
  llm_bind_tools_kwargs["parallel_tool_calls"] = True
61
63
 
62
64
  llm = llm.bind_tools(**llm_bind_tools_kwargs)
@@ -1,8 +1,8 @@
1
1
  import asyncio
2
- from typing import Any, Optional
2
+ from typing import Any
3
3
  from langgraph.types import Command
4
4
  from pydantic import BaseModel
5
- from typing_extensions import override
5
+ from typing import override
6
6
  from langchain_core.runnables import RunnableConfig
7
7
  from langgraph.store.base import BaseStore
8
8
  from langchain_core.messages import AnyMessage, ToolCall, ToolMessage
@@ -21,7 +21,7 @@ class ExecutorToolNode(ToolNode):
21
21
  input: list[AnyMessage] | dict[str, Any] | BaseModel,
22
22
  config: RunnableConfig,
23
23
  *,
24
- store: Optional[BaseStore],
24
+ store: BaseStore | None,
25
25
  ):
26
26
  return await self.__func(is_async=True, input=input, config=config, store=store)
27
27
 
@@ -31,7 +31,7 @@ class ExecutorToolNode(ToolNode):
31
31
  input: list[AnyMessage] | dict[str, Any] | BaseModel,
32
32
  config: RunnableConfig,
33
33
  *,
34
- store: Optional[BaseStore],
34
+ store: BaseStore | None,
35
35
  ) -> Any:
36
36
  loop = asyncio.get_event_loop()
37
37
  return loop.run_until_complete(
@@ -44,7 +44,7 @@ class ExecutorToolNode(ToolNode):
44
44
  input: list[AnyMessage] | dict[str, Any] | BaseModel,
45
45
  config: RunnableConfig,
46
46
  *,
47
- store: Optional[BaseStore],
47
+ store: BaseStore | None,
48
48
  ) -> Any:
49
49
  tool_calls, input_type = self._parse_input(input, store)
50
50
  outputs: list[Command | ToolMessage] = []
@@ -74,7 +74,7 @@ class ExecutorToolNode(ToolNode):
74
74
  self,
75
75
  call: ToolCall,
76
76
  output: ToolMessage | Command,
77
- ) -> Optional[bool]:
77
+ ) -> bool | None:
78
78
  if isinstance(output, ToolMessage):
79
79
  return output.status == "error"
80
80
  if isinstance(output, Command):
@@ -1,6 +1,5 @@
1
1
  import json
2
2
  from pathlib import Path
3
- from typing import Dict, Type, Union
4
3
 
5
4
  from jinja2 import Template
6
5
  from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
@@ -49,7 +48,7 @@ async def outputter(
49
48
  structured_llm = llm
50
49
 
51
50
  if output_config.structured_output:
52
- schema: Union[Dict, Type[BaseModel], None] = None
51
+ schema: dict | type[BaseModel] | None = None
53
52
  so = output_config.structured_output
54
53
 
55
54
  if isinstance(so, dict):
@@ -12,7 +12,9 @@ You work like an agile tech lead: defining the key milestones without locking in
12
12
  - Subgoals should reflect real interactions with mobile UIs (e.g. "Open app", "Tap search bar", "Scroll to item", "Send message to Bob", etc).
13
13
  - Don't assume the full UI is visible yet. Plan based on how most mobile apps work, and keep flexibility.
14
14
  - List of agents thoughts is empty which is expected, since it is the first plan.
15
- - Don't use precise UI actions when formulating subgoals like "copy", "paste", "tap", "swipe", ... unless explicitly asked in the initial goal.
15
+ - Avoid overly granular subgoals based on individual UI actions (e.g. "tap", "swipe", "copy", "paste") unless explicitly required.
16
+ - The executor has the following available tools: {{ executor_tools_list }}.
17
+ When one of these tools offers a direct shortcut (e.g. `open_link` instead of manually launching a browser and typing a URL), prefer it over decomposed manual steps.
16
18
 
17
19
  2. **Replanning**
18
20
  If you're asked to **revise a previous plan**, you'll also receive:
@@ -47,12 +49,19 @@ If you're replaning and need to keep a previous subgoal, you **must keep the sam
47
49
  - Type the message "I’m running late" (ID: None)
48
50
  - Send the message (ID: None)
49
51
 
52
+ #### **Initial Goal**: "Go on https://tesla.com, and tell me what is the first car being displayed"
53
+
54
+ **Plan**:
55
+
56
+ - Open the link https://tesla.com (ID: None)
57
+ - Find the first car displayed on the home page (ID: None)
58
+
50
59
  #### **Replanning Example**
51
60
 
52
61
  **Original Plan**: same as above with IDs set
53
62
  **Agent Thoughts**:
54
63
 
55
- - Couldnt find Alice in recent chats
64
+ - Couldn't find Alice in recent chats
56
65
  - Search bar was present on top of the chat screen
57
66
  - Keyboard appeared after tapping search
58
67
 
@@ -1,13 +1,15 @@
1
- from pathlib import Path
2
1
  import uuid
2
+ from pathlib import Path
3
3
 
4
4
  from jinja2 import Template
5
5
  from langchain_core.messages import HumanMessage, SystemMessage
6
+
6
7
  from minitap.mobile_use.agents.planner.types import PlannerOutput, Subgoal, SubgoalStatus
7
8
  from minitap.mobile_use.agents.planner.utils import one_of_them_is_failure
8
9
  from minitap.mobile_use.context import MobileUseContext
9
10
  from minitap.mobile_use.graph.state import State
10
11
  from minitap.mobile_use.services.llm import get_llm
12
+ from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
11
13
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
12
14
  from minitap.mobile_use.utils.logger import get_logger
13
15
 
@@ -28,7 +30,10 @@ class PlannerNode:
28
30
 
29
31
  system_message = Template(
30
32
  Path(__file__).parent.joinpath("planner.md").read_text(encoding="utf-8")
31
- ).render(platform=self.ctx.device.mobile_platform.value)
33
+ ).render(
34
+ platform=self.ctx.device.mobile_platform.value,
35
+ executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
36
+ )
32
37
  human_message = Template(
33
38
  Path(__file__).parent.joinpath("human.md").read_text(encoding="utf-8")
34
39
  ).render(
@@ -1,12 +1,11 @@
1
1
  from enum import Enum
2
- from typing import Optional
3
2
 
4
3
  from pydantic import BaseModel
5
- from typing_extensions import Annotated
4
+ from typing import Annotated
6
5
 
7
6
 
8
7
  class PlannerSubgoalOutput(BaseModel):
9
- id: Annotated[Optional[str], "If not provided, it will be generated"] = None
8
+ id: Annotated[str | None, "If not provided, it will be generated"] = None
10
9
  description: str
11
10
 
12
11
 
@@ -25,7 +24,7 @@ class Subgoal(BaseModel):
25
24
  id: Annotated[str, "Unique identifier of the subgoal"]
26
25
  description: Annotated[str, "Description of the subgoal"]
27
26
  completion_reason: Annotated[
28
- Optional[str], "Reason why the subgoal was completed (failure or success)"
27
+ str | None, "Reason why the subgoal was completed (failure or success)"
29
28
  ] = None
30
29
  status: SubgoalStatus
31
30
 
@@ -3,6 +3,7 @@ from langchain_core.messages import (
3
3
  RemoveMessage,
4
4
  ToolMessage,
5
5
  )
6
+
6
7
  from minitap.mobile_use.constants import MAX_MESSAGES_IN_HISTORY
7
8
  from minitap.mobile_use.context import MobileUseContext
8
9
  from minitap.mobile_use.graph.state import State
@@ -22,7 +23,7 @@ class SummarizerNode:
22
23
  start_removal = False
23
24
 
24
25
  for msg in reversed(state.messages[:nb_removal_candidates]):
25
- if isinstance(msg, (ToolMessage, HumanMessage)):
26
+ if isinstance(msg, ToolMessage | HumanMessage):
26
27
  start_removal = True
27
28
  if start_removal and msg.id:
28
29
  remove_messages.append(RemoveMessage(id=msg.id))
@@ -1,9 +1,11 @@
1
1
  import json
2
2
  import os
3
3
  from pathlib import Path
4
- from typing import Annotated, Any, Literal, Optional, Union
4
+ from typing import Annotated, Any, Literal
5
5
 
6
+ import google.auth
6
7
  from dotenv import load_dotenv
8
+ from google.auth.exceptions import DefaultCredentialsError
7
9
  from pydantic import BaseModel, Field, SecretStr, ValidationError, model_validator
8
10
  from pydantic_settings import BaseSettings
9
11
 
@@ -17,17 +19,17 @@ logger = get_logger(__name__)
17
19
 
18
20
 
19
21
  class Settings(BaseSettings):
20
- OPENAI_API_KEY: Optional[SecretStr] = None
21
- GOOGLE_API_KEY: Optional[SecretStr] = None
22
- XAI_API_KEY: Optional[SecretStr] = None
23
- OPEN_ROUTER_API_KEY: Optional[SecretStr] = None
22
+ OPENAI_API_KEY: SecretStr | None = None
23
+ GOOGLE_API_KEY: SecretStr | None = None
24
+ XAI_API_KEY: SecretStr | None = None
25
+ OPEN_ROUTER_API_KEY: SecretStr | None = None
24
26
 
25
- OPENAI_BASE_URL: Optional[str] = None
27
+ OPENAI_BASE_URL: str | None = None
26
28
 
27
- DEVICE_SCREEN_API_BASE_URL: Optional[str] = None
28
- DEVICE_HARDWARE_BRIDGE_BASE_URL: Optional[str] = None
29
- ADB_HOST: Optional[str] = None
30
- ADB_PORT: Optional[int] = None
29
+ DEVICE_SCREEN_API_BASE_URL: str | None = None
30
+ DEVICE_HARDWARE_BRIDGE_BASE_URL: str | None = None
31
+ ADB_HOST: str | None = None
32
+ ADB_PORT: int | None = None
31
33
 
32
34
  model_config = {"env_file": ".env", "extra": "ignore"}
33
35
 
@@ -71,7 +73,7 @@ def prepare_output_files() -> tuple[str | None, str | None]:
71
73
  return validated_events_path, validated_results_path
72
74
 
73
75
 
74
- def record_events(output_path: Path | None, events: Union[list[str], BaseModel, Any]):
76
+ def record_events(output_path: Path | None, events: list[str] | BaseModel | Any):
75
77
  if not output_path:
76
78
  return
77
79
 
@@ -88,7 +90,7 @@ def record_events(output_path: Path | None, events: Union[list[str], BaseModel,
88
90
 
89
91
  ### LLM Configuration
90
92
 
91
- LLMProvider = Literal["openai", "google", "openrouter", "xai"]
93
+ LLMProvider = Literal["openai", "google", "openrouter", "xai", "vertexai"]
92
94
  LLMUtilsNode = Literal["outputter", "hopper"]
93
95
  AgentNode = Literal["planner", "orchestrator", "cortex", "executor"]
94
96
  AgentNodeWithFallback = Literal["cortex"]
@@ -98,6 +100,17 @@ DEFAULT_LLM_CONFIG_FILENAME = "llm-config.defaults.jsonc"
98
100
  OVERRIDE_LLM_CONFIG_FILENAME = "llm-config.override.jsonc"
99
101
 
100
102
 
103
+ def validate_vertex_ai_credentials():
104
+ try:
105
+ _, project = google.auth.default()
106
+ if not project:
107
+ raise Exception("VertexAI requires a Google Cloud project to be set.")
108
+ except DefaultCredentialsError as e:
109
+ raise Exception(
110
+ f"VertexAI requires valid Google Application Default Credentials (ADC): {e}"
111
+ )
112
+
113
+
101
114
  class LLM(BaseModel):
102
115
  provider: LLMProvider
103
116
  model: str
@@ -110,6 +123,8 @@ class LLM(BaseModel):
110
123
  case "google":
111
124
  if not settings.GOOGLE_API_KEY:
112
125
  raise Exception(f"{name} requires GOOGLE_API_KEY in .env")
126
+ case "vertexai":
127
+ validate_vertex_ai_credentials()
113
128
  case "openrouter":
114
129
  if not settings.OPEN_ROUTER_API_KEY:
115
130
  raise Exception(f"{name} requires OPEN_ROUTER_API_KEY in .env")
@@ -170,7 +185,7 @@ def get_default_llm_config() -> LLMConfig:
170
185
  try:
171
186
  if not os.path.exists(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME):
172
187
  raise Exception("Default llm config not found")
173
- with open(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME, "r") as f:
188
+ with open(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME) as f:
174
189
  default_config_dict = load_jsonc(f)
175
190
  return LLMConfig.model_validate(default_config_dict["default"])
176
191
  except Exception as e:
@@ -211,7 +226,7 @@ def parse_llm_config() -> LLMConfig:
211
226
  override_config_dict = {}
212
227
  if os.path.exists(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME):
213
228
  logger.info("Loading custom llm config...")
214
- with open(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME, "r") as f:
229
+ with open(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME) as f:
215
230
  override_config_dict = load_jsonc(f)
216
231
  else:
217
232
  logger.warning("Custom llm config not found, loading default config")
@@ -237,7 +252,7 @@ def initialize_llm_config() -> LLMConfig:
237
252
 
238
253
  class OutputConfig(BaseModel):
239
254
  structured_output: Annotated[
240
- Optional[Union[type[BaseModel], dict]],
255
+ type[BaseModel] | dict | None,
241
256
  Field(
242
257
  default=None,
243
258
  description=(
@@ -247,7 +262,7 @@ class OutputConfig(BaseModel):
247
262
  ),
248
263
  ]
249
264
  output_description: Annotated[
250
- Optional[str],
265
+ str | None,
251
266
  Field(
252
267
  default=None,
253
268
  description=(
@@ -6,12 +6,11 @@ Uses ContextVar to avoid prop drilling and maintain clean function signatures.
6
6
 
7
7
  from enum import Enum
8
8
  from pathlib import Path
9
- from typing import Optional
10
9
 
11
10
  from adbutils import AdbClient
12
11
  from openai import BaseModel
13
12
  from pydantic import ConfigDict
14
- from typing_extensions import Literal
13
+ from typing import Literal
15
14
 
16
15
  from minitap.mobile_use.clients.device_hardware_client import DeviceHardwareClient
17
16
  from minitap.mobile_use.clients.screen_api_client import ScreenApiClient
@@ -56,8 +55,8 @@ class MobileUseContext(BaseModel):
56
55
  hw_bridge_client: DeviceHardwareClient
57
56
  screen_api_client: ScreenApiClient
58
57
  llm_config: LLMConfig
59
- adb_client: Optional[AdbClient] = None
60
- execution_setup: Optional[ExecutionSetup] = None
58
+ adb_client: AdbClient | None = None
59
+ execution_setup: ExecutionSetup | None = None
61
60
 
62
61
  def get_adb_client(self) -> AdbClient:
63
62
  if self.adb_client is None:
@@ -1,6 +1,6 @@
1
1
  import uuid
2
2
  from enum import Enum
3
- from typing import Annotated, Literal, Optional, Union
3
+ from typing import Annotated, Literal
4
4
 
5
5
  import yaml
6
6
  from langgraph.types import Command
@@ -43,7 +43,7 @@ class RunFlowRequest(BaseModel):
43
43
  dry_run: bool = Field(default=False, alias="dryRun")
44
44
 
45
45
 
46
- def run_flow(ctx: MobileUseContext, flow_steps: list, dry_run: bool = False) -> Optional[dict]:
46
+ def run_flow(ctx: MobileUseContext, flow_steps: list, dry_run: bool = False) -> dict | None:
47
47
  """
48
48
  Run a flow, i.e. a sequence of commands.
49
49
  Returns None on success, or the response body of the failed command.
@@ -137,20 +137,20 @@ class SelectorRequestWithPercentages(BaseModel):
137
137
  return {"point": self.percentages.to_str()}
138
138
 
139
139
 
140
- SelectorRequest = Union[
141
- IdSelectorRequest,
142
- SelectorRequestWithCoordinates,
143
- SelectorRequestWithPercentages,
144
- TextSelectorRequest,
145
- IdWithTextSelectorRequest,
146
- ]
140
+ SelectorRequest = (
141
+ IdSelectorRequest
142
+ | SelectorRequestWithCoordinates
143
+ | SelectorRequestWithPercentages
144
+ | TextSelectorRequest
145
+ | IdWithTextSelectorRequest
146
+ )
147
147
 
148
148
 
149
149
  def tap(
150
150
  ctx: MobileUseContext,
151
151
  selector_request: SelectorRequest,
152
152
  dry_run: bool = False,
153
- index: Optional[int] = None,
153
+ index: int | None = None,
154
154
  ):
155
155
  """
156
156
  Tap on a selector.
@@ -171,7 +171,7 @@ def long_press_on(
171
171
  ctx: MobileUseContext,
172
172
  selector_request: SelectorRequest,
173
173
  dry_run: bool = False,
174
- index: Optional[int] = None,
174
+ index: int | None = None,
175
175
  ):
176
176
  long_press_on_body = selector_request.to_dict()
177
177
  if not long_press_on_body:
@@ -211,7 +211,7 @@ SwipeDirection = Annotated[
211
211
  class SwipeRequest(BaseModel):
212
212
  model_config = ConfigDict(extra="forbid")
213
213
  swipe_mode: SwipeStartEndCoordinatesRequest | SwipeStartEndPercentagesRequest | SwipeDirection
214
- duration: Optional[int] = None # in ms, default is 400ms
214
+ duration: int | None = None # in ms, default is 400ms
215
215
 
216
216
  def to_dict(self):
217
217
  res = {}
@@ -257,7 +257,7 @@ def paste_text(ctx: MobileUseContext, dry_run: bool = False):
257
257
  return run_flow(ctx, ["pasteText"], dry_run=dry_run)
258
258
 
259
259
 
260
- def erase_text(ctx: MobileUseContext, nb_chars: Optional[int] = None, dry_run: bool = False):
260
+ def erase_text(ctx: MobileUseContext, nb_chars: int | None = None, dry_run: bool = False):
261
261
  """
262
262
  Removes characters from the currently selected textfield (if any)
263
263
  Removes 50 characters if nb_chars is not specified.
@@ -275,7 +275,7 @@ def launch_app(ctx: MobileUseContext, package_name: str, dry_run: bool = False):
275
275
  return run_flow_with_wait_for_animation_to_end(ctx, flow_input, dry_run=dry_run)
276
276
 
277
277
 
278
- def stop_app(ctx: MobileUseContext, package_name: Optional[str] = None, dry_run: bool = False):
278
+ def stop_app(ctx: MobileUseContext, package_name: str | None = None, dry_run: bool = False):
279
279
  if package_name is None:
280
280
  flow_input = ["stopApp"]
281
281
  else:
@@ -311,13 +311,13 @@ def press_key(ctx: MobileUseContext, key: Key, dry_run: bool = False):
311
311
 
312
312
 
313
313
  class WaitTimeout(Enum):
314
- SHORT = 500
315
- MEDIUM = 1000
316
- LONG = 5000
314
+ SHORT = "500"
315
+ MEDIUM = "1000"
316
+ LONG = "5000"
317
317
 
318
318
 
319
319
  def wait_for_animation_to_end(
320
- ctx: MobileUseContext, timeout: Optional[WaitTimeout] = None, dry_run: bool = False
320
+ ctx: MobileUseContext, timeout: WaitTimeout | None = None, dry_run: bool = False
321
321
  ):
322
322
  if timeout is None:
323
323
  return run_flow(ctx, ["waitForAnimationToEnd"], dry_run=dry_run)
@@ -327,7 +327,7 @@ def wait_for_animation_to_end(
327
327
  def run_flow_with_wait_for_animation_to_end(
328
328
  ctx: MobileUseContext, base_flow: list, dry_run: bool = False
329
329
  ):
330
- base_flow.append({"waitForAnimationToEnd": {"timeout": WaitTimeout.MEDIUM.value}})
330
+ base_flow.append({"waitForAnimationToEnd": {"timeout": int(WaitTimeout.MEDIUM.value)}})
331
331
  return run_flow(ctx, base_flow, dry_run=dry_run)
332
332
 
333
333
 
@@ -362,15 +362,27 @@ if __name__ == "__main__":
362
362
  agents_thoughts=[],
363
363
  )
364
364
 
365
- from minitap.mobile_use.tools.mobile.input_text import get_input_text_tool
366
-
367
- input_resource_id = "com.google.android.apps.nexuslauncher:id/search_container_hotseat"
368
- command_output: Command = get_input_text_tool(ctx=ctx).invoke(
365
+ # from minitap.mobile_use.tools.mobile.input_text import get_input_text_tool
366
+
367
+ # input_resource_id = "com.google.android.apps.nexuslauncher:id/search_container_hotseat"
368
+ # command_output: Command = get_input_text_tool(ctx=ctx).invoke(
369
+ # {
370
+ # "tool_call_id": uuid.uuid4().hex,
371
+ # "agent_thought": "",
372
+ # "text_input_resource_id": input_resource_id,
373
+ # "text": "Hello World",
374
+ # "state": dummy_state,
375
+ # "executor_metadata": None,
376
+ # }
377
+ # )
378
+ from minitap.mobile_use.tools.mobile.clear_text import get_clear_text_tool
379
+
380
+ input_resource_id = "com.google.android.apps.nexuslauncher:id/input"
381
+ command_output: Command = get_clear_text_tool(ctx=ctx).invoke(
369
382
  {
370
383
  "tool_call_id": uuid.uuid4().hex,
371
384
  "agent_thought": "",
372
385
  "text_input_resource_id": input_resource_id,
373
- "text": "Hello World",
374
386
  "state": dummy_state,
375
387
  "executor_metadata": None,
376
388
  }
@@ -1,6 +1,5 @@
1
1
  from datetime import date
2
2
  import json
3
- from typing import Optional
4
3
 
5
4
  from adbutils import AdbDevice
6
5
  from minitap.mobile_use.utils.logger import MobileUseLogger
@@ -20,8 +19,8 @@ def get_adb_device(ctx: MobileUseContext) -> AdbDevice:
20
19
 
21
20
 
22
21
  def get_first_device(
23
- logger: Optional[MobileUseLogger] = None,
24
- ) -> tuple[Optional[str], Optional[DevicePlatform]]:
22
+ logger: MobileUseLogger | None = None,
23
+ ) -> tuple[str | None, DevicePlatform | None]:
25
24
  """Gets the first available device."""
26
25
  try:
27
26
  android_output = run_shell_command_on_host("adb devices")
@@ -50,7 +49,7 @@ def get_first_device(
50
49
  return None, None
51
50
 
52
51
 
53
- def get_focused_app_info(ctx: MobileUseContext) -> Optional[str]:
52
+ def get_focused_app_info(ctx: MobileUseContext) -> str | None:
54
53
  if ctx.device.mobile_platform == DevicePlatform.IOS:
55
54
  return None
56
55
  device = get_adb_device(ctx)
@@ -6,6 +6,7 @@ from langchain_core.messages import (
6
6
  from langgraph.constants import END, START
7
7
  from langgraph.graph import StateGraph
8
8
  from langgraph.graph.state import CompiledStateGraph
9
+
9
10
  from minitap.mobile_use.agents.contextor.contextor import ContextorNode
10
11
  from minitap.mobile_use.agents.cortex.cortex import CortexNode
11
12
  from minitap.mobile_use.agents.executor.executor import ExecutorNode
@@ -1,7 +1,7 @@
1
1
  from langchain_core.messages import AIMessage, AnyMessage
2
2
  from langgraph.graph import add_messages
3
3
  from langgraph.prebuilt.chat_agent_executor import AgentStatePydantic
4
- from typing_extensions import Annotated, Optional
4
+ from typing import Annotated
5
5
 
6
6
  from minitap.mobile_use.agents.planner.types import Subgoal
7
7
  from minitap.mobile_use.config import AgentNode
@@ -24,16 +24,16 @@ class State(AgentStatePydantic):
24
24
  subgoal_plan: Annotated[list[Subgoal], "The current plan, made of subgoals"]
25
25
 
26
26
  # contextor related keys
27
- latest_screenshot_base64: Annotated[Optional[str], "Latest screenshot of the device", take_last]
27
+ latest_screenshot_base64: Annotated[str | None, "Latest screenshot of the device", take_last]
28
28
  latest_ui_hierarchy: Annotated[
29
- Optional[list[dict]], "Latest UI hierarchy of the device", take_last
29
+ list[dict] | None, "Latest UI hierarchy of the device", take_last
30
30
  ]
31
- focused_app_info: Annotated[Optional[str], "Focused app info", take_last]
32
- device_date: Annotated[Optional[str], "Date of the device", take_last]
31
+ focused_app_info: Annotated[str | None, "Focused app info", take_last]
32
+ device_date: Annotated[str | None, "Date of the device", take_last]
33
33
 
34
34
  # cortex related keys
35
35
  structured_decisions: Annotated[
36
- Optional[str],
36
+ str | None,
37
37
  "Structured decisions made by the cortex, for the executor to follow",
38
38
  take_last,
39
39
  ]
@@ -45,7 +45,7 @@ class State(AgentStatePydantic):
45
45
 
46
46
  # executor related keys
47
47
  executor_messages: Annotated[list[AnyMessage], "Sequential Executor messages", add_messages]
48
- cortex_last_thought: Annotated[Optional[str], "Last thought of the cortex for the executor"]
48
+ cortex_last_thought: Annotated[str | None, "Last thought of the cortex for the executor"]
49
49
 
50
50
  # common keys
51
51
  agents_thoughts: Annotated[
@@ -58,13 +58,13 @@ class State(AgentStatePydantic):
58
58
  self,
59
59
  ctx: MobileUseContext,
60
60
  update: dict,
61
- agent: Optional[AgentNode] = None,
61
+ agent: AgentNode | None = None,
62
62
  ):
63
63
  """
64
64
  Sanitizes the state update to ensure it is valid and apply side effect logic where required.
65
65
  The agent is required if the update contains the "agents_thoughts" key.
66
66
  """
67
- updated_agents_thoughts: Optional[str | list[str]] = update.get("agents_thoughts", None)
67
+ updated_agents_thoughts: str | list[str] | None = update.get("agents_thoughts", None)
68
68
  if updated_agents_thoughts is not None:
69
69
  if isinstance(updated_agents_thoughts, str):
70
70
  updated_agents_thoughts = [updated_agents_thoughts]