minitap-mobile-use 2.0.1__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of minitap-mobile-use might be problematic. Click here for more details.

Files changed (97) hide show
  1. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/PKG-INFO +35 -5
  2. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/README.md +29 -0
  3. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/cortex/cortex.md +7 -5
  4. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/cortex/cortex.py +4 -1
  5. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/cortex/types.py +1 -3
  6. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/executor/executor.md +4 -5
  7. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/executor/executor.py +3 -1
  8. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/executor/tool_node.py +6 -6
  9. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/outputter/outputter.py +1 -2
  10. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/planner/planner.md +11 -2
  11. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/planner/planner.py +7 -2
  12. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/planner/types.py +3 -4
  13. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/summarizer/summarizer.py +2 -1
  14. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/config.py +31 -16
  15. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/context.py +3 -4
  16. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/controllers/mobile_command_controller.py +36 -24
  17. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/controllers/platform_specific_commands_controller.py +3 -4
  18. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/graph/graph.py +1 -0
  19. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/graph/state.py +9 -9
  20. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/main.py +7 -8
  21. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/agent.py +25 -26
  22. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/builders/agent_config_builder.py +9 -10
  23. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/builders/task_request_builder.py +9 -9
  24. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/examples/smart_notification_assistant.py +1 -2
  25. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/types/agent.py +5 -5
  26. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/types/task.py +19 -18
  27. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/utils.py +4 -3
  28. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/servers/config.py +1 -2
  29. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/servers/device_hardware_bridge.py +3 -4
  30. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/servers/start_servers.py +4 -4
  31. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/servers/stop_servers.py +2 -3
  32. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/services/llm.py +24 -6
  33. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/index.py +26 -14
  34. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/back.py +1 -1
  35. minitap_mobile_use-2.2.0/minitap/mobile_use/tools/mobile/clear_text.py +277 -0
  36. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/copy_text_from.py +1 -1
  37. minitap_mobile_use-2.0.1/minitap/mobile_use/tools/mobile/swipe.py → minitap_mobile_use-2.2.0/minitap/mobile_use/tools/mobile/erase_one_char.py +18 -14
  38. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/find_packages.py +1 -1
  39. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/input_text.py +4 -80
  40. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/launch_app.py +1 -1
  41. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/long_press_on.py +2 -4
  42. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/open_link.py +1 -1
  43. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/paste_text.py +1 -1
  44. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/press_key.py +1 -1
  45. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/stop_app.py +2 -4
  46. minitap_mobile_use-2.2.0/minitap/mobile_use/tools/mobile/swipe.py +150 -0
  47. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/take_screenshot.py +1 -1
  48. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/tap.py +2 -4
  49. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +2 -4
  50. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/tools/tool_wrapper.py +6 -1
  51. minitap_mobile_use-2.2.0/minitap/mobile_use/tools/utils.py +86 -0
  52. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/cli_helpers.py +1 -2
  53. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/cli_selection.py +5 -6
  54. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/decorators.py +21 -20
  55. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/logger.py +3 -4
  56. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/media.py +1 -1
  57. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/recorder.py +2 -9
  58. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/ui_hierarchy.py +13 -5
  59. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/pyproject.toml +8 -8
  60. minitap_mobile_use-2.0.1/minitap/mobile_use/tools/mobile/erase_text.py +0 -122
  61. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/LICENSE +0 -0
  62. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/__init__.py +0 -0
  63. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/contextor/contextor.py +0 -0
  64. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/executor/utils.py +0 -0
  65. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/hopper/hopper.md +0 -0
  66. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/hopper/hopper.py +0 -0
  67. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/orchestrator/human.md +0 -0
  68. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/orchestrator/orchestrator.md +0 -0
  69. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/orchestrator/orchestrator.py +0 -0
  70. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/orchestrator/types.py +0 -0
  71. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/outputter/human.md +0 -0
  72. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/outputter/test_outputter.py +0 -0
  73. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/planner/human.md +0 -0
  74. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/agents/planner/utils.py +0 -0
  75. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/clients/device_hardware_client.py +0 -0
  76. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/clients/ios_client.py +0 -0
  77. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/clients/screen_api_client.py +0 -0
  78. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/constants.py +0 -0
  79. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/controllers/__init__.py +0 -0
  80. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/__init__.py +0 -0
  81. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/builders/__init__.py +0 -0
  82. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/builders/index.py +0 -0
  83. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/constants.py +0 -0
  84. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/examples/README.md +0 -0
  85. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/examples/__init__.py +0 -0
  86. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/examples/simple_photo_organizer.py +0 -0
  87. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/types/__init__.py +0 -0
  88. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/sdk/types/exceptions.py +0 -0
  89. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/servers/device_screen_api.py +0 -0
  90. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/servers/utils.py +0 -0
  91. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/services/accessibility.py +0 -0
  92. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/conversations.py +0 -0
  93. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/errors.py +0 -0
  94. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/file.py +0 -0
  95. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/requests_utils.py +0 -0
  96. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/shell_utils.py +0 -0
  97. {minitap_mobile_use-2.0.1 → minitap_mobile_use-2.2.0}/minitap/mobile_use/utils/time.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: minitap-mobile-use
3
- Version: 2.0.1
3
+ Version: 2.2.0
4
4
  Summary: AI-powered multi-agent system that automates real Android and iOS devices through low-level control using LangGraph.
5
5
  Author: Pierre-Louis Favreau, Jean-Pierre Lo, Nicolas Dehandschoewercker
6
6
  License: MIT License
@@ -24,11 +24,11 @@ License: MIT License
24
24
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
25
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
26
  SOFTWARE.
27
- Requires-Dist: langgraph==0.5.0
27
+ Requires-Dist: langgraph>=0.6.6
28
28
  Requires-Dist: adbutils==2.9.3
29
- Requires-Dist: langchain-google-genai==2.1.5
30
- Requires-Dist: langchain==0.3.26
31
- Requires-Dist: langchain-core==0.3.66
29
+ Requires-Dist: langchain-google-genai>=2.1.10
30
+ Requires-Dist: langchain>=0.3.27
31
+ Requires-Dist: langchain-core>=0.3.75
32
32
  Requires-Dist: jinja2==3.1.6
33
33
  Requires-Dist: python-dotenv==1.1.1
34
34
  Requires-Dist: pydantic-settings==2.10.1
@@ -42,6 +42,7 @@ Requires-Dist: fastapi==0.111.0
42
42
  Requires-Dist: uvicorn[standard]==0.30.1
43
43
  Requires-Dist: colorama>=0.4.6
44
44
  Requires-Dist: psutil>=5.9.0
45
+ Requires-Dist: langchain-google-vertexai>=2.0.28
45
46
  Requires-Dist: ruff==0.5.3 ; extra == 'dev'
46
47
  Requires-Dist: pytest==8.4.1 ; extra == 'dev'
47
48
  Requires-Dist: pytest-cov==5.0.0 ; extra == 'dev'
@@ -69,6 +70,10 @@ Description-Content-Type: text/markdown
69
70
  <a href="https://x.com/minitap_ai?t=iRWtI497UhRGLeCKYQekig&s=09"><b>Twitter / X</b></a>
70
71
  </p>
71
72
 
73
+ [![PyPI version](https://img.shields.io/pypi/v/minitap-mobile-use.svg?color=blue)](https://pypi.org/project/minitap-mobile-use/)
74
+ [![Python Version](https://img.shields.io/badge/python-3.12%2B-blue)](https://www.python.org/downloads/)
75
+ [![License](https://img.shields.io/badge/license-MIT-blue)](https://github.com/minitap-ai/mobile-use/blob/main/LICENSE)
76
+
72
77
  </div>
73
78
 
74
79
  Mobile-use is a powerful, open-source AI agent that controls your Android or iOS device using natural language. It understands your commands and interacts with the UI to perform tasks, from sending messages to navigating complex apps.
@@ -107,11 +112,26 @@ Ready to automate your mobile experience? Follow these steps to get mobile-use u
107
112
 
108
113
  2. **(Optional) Customize LLM Configuration:**
109
114
  To use different models or providers, create your own LLM configuration file.
115
+
110
116
  ```bash
111
117
  cp llm-config.override.template.jsonc llm-config.override.jsonc
112
118
  ```
119
+
113
120
  Then, edit `llm-config.override.jsonc` to fit your needs.
114
121
 
122
+ You can also use local LLMs or any other OpenAI-API-compatible providers:
123
+
124
+ 1. Set `OPENAI_BASE_URL` and `OPENAI_API_KEY` in your `.env`
125
+ 2. In your `llm-config.override.jsonc`, set `openai` as the provider for the agent nodes you want, and choose a model supported by your provider.
126
+
127
+ > [!NOTE]
128
+ > If you want to use Google Vertex AI, you must either:
129
+ >
130
+ > - Have credentials configured for your environment (gcloud, workload identity, etc…)
131
+ > - Store the path to a service account JSON file as the GOOGLE_APPLICATION_CREDENTIALS environment variable
132
+ >
133
+ > More information: - [Credential types](https://cloud.google.com/docs/authentication/application-default-credentials#GAC) - [google.auth API reference](https://googleapis.dev/python/google-auth/latest/reference/google.auth.html#module-google.auth)
134
+
115
135
  ### Quick Launch (Docker)
116
136
 
117
137
  > [!NOTE]
@@ -257,6 +277,16 @@ python ./src/mobile_use/main.py \
257
277
  > [!NOTE]
258
278
  > If you haven't configured a specific model, mobile-use will prompt you to choose one from the available options.
259
279
 
280
+ ## 🔎 Agentic System Overview
281
+
282
+ <div align="center">
283
+
284
+ ![Graph Visualization](doc/graph.png)
285
+
286
+ _This diagram is automatically updated from the codebase. This is our current agentic system architecture._
287
+
288
+ </div>
289
+
260
290
  ## ❤️ Contributing
261
291
 
262
292
  We love contributions! Whether you're fixing a bug, adding a feature, or improving documentation, your help is welcome. Please read our **[Contributing Guidelines](CONTRIBUTING.md)** to get started.
@@ -16,6 +16,10 @@
16
16
  <a href="https://x.com/minitap_ai?t=iRWtI497UhRGLeCKYQekig&s=09"><b>Twitter / X</b></a>
17
17
  </p>
18
18
 
19
+ [![PyPI version](https://img.shields.io/pypi/v/minitap-mobile-use.svg?color=blue)](https://pypi.org/project/minitap-mobile-use/)
20
+ [![Python Version](https://img.shields.io/badge/python-3.12%2B-blue)](https://www.python.org/downloads/)
21
+ [![License](https://img.shields.io/badge/license-MIT-blue)](https://github.com/minitap-ai/mobile-use/blob/main/LICENSE)
22
+
19
23
  </div>
20
24
 
21
25
  Mobile-use is a powerful, open-source AI agent that controls your Android or iOS device using natural language. It understands your commands and interacts with the UI to perform tasks, from sending messages to navigating complex apps.
@@ -54,11 +58,26 @@ Ready to automate your mobile experience? Follow these steps to get mobile-use u
54
58
 
55
59
  2. **(Optional) Customize LLM Configuration:**
56
60
  To use different models or providers, create your own LLM configuration file.
61
+
57
62
  ```bash
58
63
  cp llm-config.override.template.jsonc llm-config.override.jsonc
59
64
  ```
65
+
60
66
  Then, edit `llm-config.override.jsonc` to fit your needs.
61
67
 
68
+ You can also use local LLMs or any other OpenAI-API-compatible providers:
69
+
70
+ 1. Set `OPENAI_BASE_URL` and `OPENAI_API_KEY` in your `.env`
71
+ 2. In your `llm-config.override.jsonc`, set `openai` as the provider for the agent nodes you want, and choose a model supported by your provider.
72
+
73
+ > [!NOTE]
74
+ > If you want to use Google Vertex AI, you must either:
75
+ >
76
+ > - Have credentials configured for your environment (gcloud, workload identity, etc…)
77
+ > - Store the path to a service account JSON file as the GOOGLE_APPLICATION_CREDENTIALS environment variable
78
+ >
79
+ > More information: - [Credential types](https://cloud.google.com/docs/authentication/application-default-credentials#GAC) - [google.auth API reference](https://googleapis.dev/python/google-auth/latest/reference/google.auth.html#module-google.auth)
80
+
62
81
  ### Quick Launch (Docker)
63
82
 
64
83
  > [!NOTE]
@@ -204,6 +223,16 @@ python ./src/mobile_use/main.py \
204
223
  > [!NOTE]
205
224
  > If you haven't configured a specific model, mobile-use will prompt you to choose one from the available options.
206
225
 
226
+ ## 🔎 Agentic System Overview
227
+
228
+ <div align="center">
229
+
230
+ ![Graph Visualization](doc/graph.png)
231
+
232
+ _This diagram is automatically updated from the codebase. This is our current agentic system architecture._
233
+
234
+ </div>
235
+
207
236
  ## ❤️ Contributing
208
237
 
209
238
  We love contributions! Whether you're fixing a bug, adding a feature, or improving documentation, your help is welcome. Please read our **[Contributing Guidelines](CONTRIBUTING.md)** to get started.
@@ -35,17 +35,19 @@ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
35
35
  - Past agent thoughts
36
36
  - Recent tool effects
37
37
 
38
- 2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
38
+ 2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
39
39
 
40
- - These must be **concrete low-level actions**: back, tap, swipe, launch app, find packages, close app, input text, paste, erase text, copy, etc.
41
- - Your goal is to achieve subgoals **fast** - so you must put as much actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
42
- - When you need to open an app, use the `find_packages` low-level action to try and get its name.
40
+ - These must be **concrete low-level actions**.
41
+ - The executor has the following available tools: {{ executor_tools_list }}.
42
+ - Your goal is to achieve subgoals **fast** - so you must put as many actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
43
+ - To open URLs/links directly, use the `open_link` tool - it will automatically handle opening in the appropriate browser. It also handles deep links.
44
+ - When you need to open an app, use the `find_packages` low-level action to try and get its name. Then, simply use the `launch_app` low-level action to launch it.
43
45
  - If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `text: "Alice"`, `x: 100, y: 200`).
44
46
  - **The structure is up to you**, but it must be valid **JSON stringified output**. You will accompany this output with a **natural-language summary** of your reasoning and approach in your agent thought.
45
47
  - **Never use a sequence of `tap` + `input_text` to type into a field. Always use a single `input_text` action** with the correct `resource_id` (this already ensures the element is focused and the cursor is moved to the end).
46
48
  - When you want to launch/stop an app, prefer using its package name.
47
49
  - **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed**.
48
- - **For text clearing**: When you need to completely clear text from an input field, always use **LONG PRESS** first to select the text field, then erase. Do NOT use tap + erase as this only clears from cursor position.
50
+ - **For text clearing**: When you need to completely clear text from an input field, always call the `clear_text` tool with the correct resource_id. This tool automatically focuses the element, and ensures the field is emptied. If you notice this tool fails to clear the text, try to long press the input, select all, and call `erase_one_char`.
49
51
 
50
52
  ### Output
51
53
 
@@ -10,12 +10,14 @@ from langchain_core.messages import (
10
10
  ToolMessage,
11
11
  )
12
12
  from langgraph.graph.message import REMOVE_ALL_MESSAGES
13
+
13
14
  from minitap.mobile_use.agents.cortex.types import CortexOutput
14
15
  from minitap.mobile_use.agents.planner.utils import get_current_subgoal
15
16
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
16
17
  from minitap.mobile_use.context import MobileUseContext
17
18
  from minitap.mobile_use.graph.state import State
18
19
  from minitap.mobile_use.services.llm import get_llm, with_fallback
20
+ from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
19
21
  from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
20
22
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
21
23
  from minitap.mobile_use.utils.logger import get_logger
@@ -44,6 +46,7 @@ class CortexNode:
44
46
  current_subgoal=get_current_subgoal(state.subgoal_plan),
45
47
  agents_thoughts=state.agents_thoughts,
46
48
  executor_feedback=executor_feedback,
49
+ executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
47
50
  )
48
51
  messages = [
49
52
  SystemMessage(content=system_message),
@@ -83,7 +86,7 @@ class CortexNode:
83
86
  is_subgoal_completed = (
84
87
  response.complete_subgoals_by_ids is not None
85
88
  and len(response.complete_subgoals_by_ids) > 0
86
- and len(response.decisions) == 0
89
+ and (len(response.decisions) == 0 or response.decisions in ["{}", "[]", "null", ""])
87
90
  )
88
91
  if not is_subgoal_completed:
89
92
  response.complete_subgoals_by_ids = []
@@ -1,11 +1,9 @@
1
- from typing import Optional
2
-
3
1
  from pydantic import BaseModel, Field
4
2
 
5
3
 
6
4
  class CortexOutput(BaseModel):
7
5
  decisions: str = Field(..., description="The decisions to be made. A stringified JSON object")
8
6
  agent_thought: str = Field(..., description="The agent's thought")
9
- complete_subgoals_by_ids: Optional[list[str]] = Field(
7
+ complete_subgoals_by_ids: list[str] | None = Field(
10
8
  [], description="List of subgoal IDs to complete"
11
9
  )
@@ -64,14 +64,13 @@ When using the `input_text` tool:
64
64
 
65
65
  #### 🔄 Text Clearing Best Practice
66
66
 
67
- When you need to completely clear text from an input field, **DO NOT** simply use `erase_text` alone, as it only erases from the cursor position, backward. Instead:
67
+ When you need to completely clear text from an input field, always use the clear_text tool with the correct resource_id.
68
68
 
69
- 1. **Use `long_press_on` first** to select the text field and bring up selection options
70
- 2. **Then use `erase_text`** to clear the selected content
69
+ This tool automatically takes care of focusing the element (if needed), and ensuring the field is fully emptied.
71
70
 
72
- This approach ensures the **entire text content** is removed, not just the portion before the cursor position. The long press will typically select all text in the field, making the subsequent erase operation more effective.
71
+ If, and only if, the clear_text tool fails to clear the text, try to long press the input, select all, and call erase_one_char.
73
72
 
74
- ### 🔁 Final Notes
73
+ #### 🔁 Final Notes
75
74
 
76
75
  - **You do not need to reason or decide strategy** — that's the Cortex's job.
77
76
  - You simply interpret and execute — like hands following the brain.
@@ -3,6 +3,8 @@ from pathlib import Path
3
3
  from jinja2 import Template
4
4
  from langchain_core.messages import HumanMessage, SystemMessage
5
5
  from langchain_google_genai import ChatGoogleGenerativeAI
6
+ from langchain_google_vertexai.chat_models import ChatVertexAI
7
+
6
8
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
7
9
  from minitap.mobile_use.context import MobileUseContext
8
10
  from minitap.mobile_use.graph.state import State
@@ -56,7 +58,7 @@ class ExecutorNode:
56
58
  }
57
59
 
58
60
  # ChatGoogleGenerativeAI does not support the "parallel_tool_calls" keyword
59
- if not isinstance(llm, ChatGoogleGenerativeAI):
61
+ if not isinstance(llm, ChatGoogleGenerativeAI | ChatVertexAI):
60
62
  llm_bind_tools_kwargs["parallel_tool_calls"] = True
61
63
 
62
64
  llm = llm.bind_tools(**llm_bind_tools_kwargs)
@@ -1,8 +1,8 @@
1
1
  import asyncio
2
- from typing import Any, Optional
2
+ from typing import Any
3
3
  from langgraph.types import Command
4
4
  from pydantic import BaseModel
5
- from typing_extensions import override
5
+ from typing import override
6
6
  from langchain_core.runnables import RunnableConfig
7
7
  from langgraph.store.base import BaseStore
8
8
  from langchain_core.messages import AnyMessage, ToolCall, ToolMessage
@@ -21,7 +21,7 @@ class ExecutorToolNode(ToolNode):
21
21
  input: list[AnyMessage] | dict[str, Any] | BaseModel,
22
22
  config: RunnableConfig,
23
23
  *,
24
- store: Optional[BaseStore],
24
+ store: BaseStore | None,
25
25
  ):
26
26
  return await self.__func(is_async=True, input=input, config=config, store=store)
27
27
 
@@ -31,7 +31,7 @@ class ExecutorToolNode(ToolNode):
31
31
  input: list[AnyMessage] | dict[str, Any] | BaseModel,
32
32
  config: RunnableConfig,
33
33
  *,
34
- store: Optional[BaseStore],
34
+ store: BaseStore | None,
35
35
  ) -> Any:
36
36
  loop = asyncio.get_event_loop()
37
37
  return loop.run_until_complete(
@@ -44,7 +44,7 @@ class ExecutorToolNode(ToolNode):
44
44
  input: list[AnyMessage] | dict[str, Any] | BaseModel,
45
45
  config: RunnableConfig,
46
46
  *,
47
- store: Optional[BaseStore],
47
+ store: BaseStore | None,
48
48
  ) -> Any:
49
49
  tool_calls, input_type = self._parse_input(input, store)
50
50
  outputs: list[Command | ToolMessage] = []
@@ -74,7 +74,7 @@ class ExecutorToolNode(ToolNode):
74
74
  self,
75
75
  call: ToolCall,
76
76
  output: ToolMessage | Command,
77
- ) -> Optional[bool]:
77
+ ) -> bool | None:
78
78
  if isinstance(output, ToolMessage):
79
79
  return output.status == "error"
80
80
  if isinstance(output, Command):
@@ -1,6 +1,5 @@
1
1
  import json
2
2
  from pathlib import Path
3
- from typing import Dict, Type, Union
4
3
 
5
4
  from jinja2 import Template
6
5
  from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
@@ -49,7 +48,7 @@ async def outputter(
49
48
  structured_llm = llm
50
49
 
51
50
  if output_config.structured_output:
52
- schema: Union[Dict, Type[BaseModel], None] = None
51
+ schema: dict | type[BaseModel] | None = None
53
52
  so = output_config.structured_output
54
53
 
55
54
  if isinstance(so, dict):
@@ -12,7 +12,9 @@ You work like an agile tech lead: defining the key milestones without locking in
12
12
  - Subgoals should reflect real interactions with mobile UIs (e.g. "Open app", "Tap search bar", "Scroll to item", "Send message to Bob", etc).
13
13
  - Don't assume the full UI is visible yet. Plan based on how most mobile apps work, and keep flexibility.
14
14
  - List of agents thoughts is empty which is expected, since it is the first plan.
15
- - Don't use precise UI actions when formulating subgoals like "copy", "paste", "tap", "swipe", ... unless explicitly asked in the initial goal.
15
+ - Avoid overly granular subgoals based on individual UI actions (e.g. "tap", "swipe", "copy", "paste") unless explicitly required.
16
+ - The executor has the following available tools: {{ executor_tools_list }}.
17
+ When one of these tools offers a direct shortcut (e.g. `open_link` instead of manually launching a browser and typing a URL), prefer it over decomposed manual steps.
16
18
 
17
19
  2. **Replanning**
18
20
  If you're asked to **revise a previous plan**, you'll also receive:
@@ -47,12 +49,19 @@ If you're replaning and need to keep a previous subgoal, you **must keep the sam
47
49
  - Type the message "I’m running late" (ID: None)
48
50
  - Send the message (ID: None)
49
51
 
52
+ #### **Initial Goal**: "Go on https://tesla.com, and tell me what is the first car being displayed"
53
+
54
+ **Plan**:
55
+
56
+ - Open the link https://tesla.com (ID: None)
57
+ - Find the first car displayed on the home page (ID: None)
58
+
50
59
  #### **Replanning Example**
51
60
 
52
61
  **Original Plan**: same as above with IDs set
53
62
  **Agent Thoughts**:
54
63
 
55
- - Couldnt find Alice in recent chats
64
+ - Couldn't find Alice in recent chats
56
65
  - Search bar was present on top of the chat screen
57
66
  - Keyboard appeared after tapping search
58
67
 
@@ -1,13 +1,15 @@
1
- from pathlib import Path
2
1
  import uuid
2
+ from pathlib import Path
3
3
 
4
4
  from jinja2 import Template
5
5
  from langchain_core.messages import HumanMessage, SystemMessage
6
+
6
7
  from minitap.mobile_use.agents.planner.types import PlannerOutput, Subgoal, SubgoalStatus
7
8
  from minitap.mobile_use.agents.planner.utils import one_of_them_is_failure
8
9
  from minitap.mobile_use.context import MobileUseContext
9
10
  from minitap.mobile_use.graph.state import State
10
11
  from minitap.mobile_use.services.llm import get_llm
12
+ from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
11
13
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
12
14
  from minitap.mobile_use.utils.logger import get_logger
13
15
 
@@ -28,7 +30,10 @@ class PlannerNode:
28
30
 
29
31
  system_message = Template(
30
32
  Path(__file__).parent.joinpath("planner.md").read_text(encoding="utf-8")
31
- ).render(platform=self.ctx.device.mobile_platform.value)
33
+ ).render(
34
+ platform=self.ctx.device.mobile_platform.value,
35
+ executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
36
+ )
32
37
  human_message = Template(
33
38
  Path(__file__).parent.joinpath("human.md").read_text(encoding="utf-8")
34
39
  ).render(
@@ -1,12 +1,11 @@
1
1
  from enum import Enum
2
- from typing import Optional
3
2
 
4
3
  from pydantic import BaseModel
5
- from typing_extensions import Annotated
4
+ from typing import Annotated
6
5
 
7
6
 
8
7
  class PlannerSubgoalOutput(BaseModel):
9
- id: Annotated[Optional[str], "If not provided, it will be generated"] = None
8
+ id: Annotated[str | None, "If not provided, it will be generated"] = None
10
9
  description: str
11
10
 
12
11
 
@@ -25,7 +24,7 @@ class Subgoal(BaseModel):
25
24
  id: Annotated[str, "Unique identifier of the subgoal"]
26
25
  description: Annotated[str, "Description of the subgoal"]
27
26
  completion_reason: Annotated[
28
- Optional[str], "Reason why the subgoal was completed (failure or success)"
27
+ str | None, "Reason why the subgoal was completed (failure or success)"
29
28
  ] = None
30
29
  status: SubgoalStatus
31
30
 
@@ -3,6 +3,7 @@ from langchain_core.messages import (
3
3
  RemoveMessage,
4
4
  ToolMessage,
5
5
  )
6
+
6
7
  from minitap.mobile_use.constants import MAX_MESSAGES_IN_HISTORY
7
8
  from minitap.mobile_use.context import MobileUseContext
8
9
  from minitap.mobile_use.graph.state import State
@@ -22,7 +23,7 @@ class SummarizerNode:
22
23
  start_removal = False
23
24
 
24
25
  for msg in reversed(state.messages[:nb_removal_candidates]):
25
- if isinstance(msg, (ToolMessage, HumanMessage)):
26
+ if isinstance(msg, ToolMessage | HumanMessage):
26
27
  start_removal = True
27
28
  if start_removal and msg.id:
28
29
  remove_messages.append(RemoveMessage(id=msg.id))
@@ -1,9 +1,11 @@
1
1
  import json
2
2
  import os
3
3
  from pathlib import Path
4
- from typing import Annotated, Any, Literal, Optional, Union
4
+ from typing import Annotated, Any, Literal
5
5
 
6
+ import google.auth
6
7
  from dotenv import load_dotenv
8
+ from google.auth.exceptions import DefaultCredentialsError
7
9
  from pydantic import BaseModel, Field, SecretStr, ValidationError, model_validator
8
10
  from pydantic_settings import BaseSettings
9
11
 
@@ -17,17 +19,17 @@ logger = get_logger(__name__)
17
19
 
18
20
 
19
21
  class Settings(BaseSettings):
20
- OPENAI_API_KEY: Optional[SecretStr] = None
21
- GOOGLE_API_KEY: Optional[SecretStr] = None
22
- XAI_API_KEY: Optional[SecretStr] = None
23
- OPEN_ROUTER_API_KEY: Optional[SecretStr] = None
22
+ OPENAI_API_KEY: SecretStr | None = None
23
+ GOOGLE_API_KEY: SecretStr | None = None
24
+ XAI_API_KEY: SecretStr | None = None
25
+ OPEN_ROUTER_API_KEY: SecretStr | None = None
24
26
 
25
- OPENAI_BASE_URL: Optional[str] = None
27
+ OPENAI_BASE_URL: str | None = None
26
28
 
27
- DEVICE_SCREEN_API_BASE_URL: Optional[str] = None
28
- DEVICE_HARDWARE_BRIDGE_BASE_URL: Optional[str] = None
29
- ADB_HOST: Optional[str] = None
30
- ADB_PORT: Optional[int] = None
29
+ DEVICE_SCREEN_API_BASE_URL: str | None = None
30
+ DEVICE_HARDWARE_BRIDGE_BASE_URL: str | None = None
31
+ ADB_HOST: str | None = None
32
+ ADB_PORT: int | None = None
31
33
 
32
34
  model_config = {"env_file": ".env", "extra": "ignore"}
33
35
 
@@ -71,7 +73,7 @@ def prepare_output_files() -> tuple[str | None, str | None]:
71
73
  return validated_events_path, validated_results_path
72
74
 
73
75
 
74
- def record_events(output_path: Path | None, events: Union[list[str], BaseModel, Any]):
76
+ def record_events(output_path: Path | None, events: list[str] | BaseModel | Any):
75
77
  if not output_path:
76
78
  return
77
79
 
@@ -88,7 +90,7 @@ def record_events(output_path: Path | None, events: Union[list[str], BaseModel,
88
90
 
89
91
  ### LLM Configuration
90
92
 
91
- LLMProvider = Literal["openai", "google", "openrouter", "xai"]
93
+ LLMProvider = Literal["openai", "google", "openrouter", "xai", "vertexai"]
92
94
  LLMUtilsNode = Literal["outputter", "hopper"]
93
95
  AgentNode = Literal["planner", "orchestrator", "cortex", "executor"]
94
96
  AgentNodeWithFallback = Literal["cortex"]
@@ -98,6 +100,17 @@ DEFAULT_LLM_CONFIG_FILENAME = "llm-config.defaults.jsonc"
98
100
  OVERRIDE_LLM_CONFIG_FILENAME = "llm-config.override.jsonc"
99
101
 
100
102
 
103
def validate_vertex_ai_credentials():
    """Check that Google Application Default Credentials (ADC) are usable for VertexAI.

    Looks up the default credentials with ``google.auth.default()`` and requires
    that a project comes back alongside them.

    Raises:
        Exception: if the ADC lookup raises ``DefaultCredentialsError`` (the
            original error is attached as ``__cause__``), or if the lookup
            succeeds but yields no project.
    """
    # Keep the try body to the single call that can raise DefaultCredentialsError.
    try:
        _, project = google.auth.default()
    except DefaultCredentialsError as e:
        # Chain explicitly so the traceback presents the ADC failure as the direct
        # cause instead of an unrelated error raised during handling.
        raise Exception(
            f"VertexAI requires valid Google Application Default Credentials (ADC): {e}"
        ) from e

    # Credentials were found, but the code still needs a project to proceed.
    if not project:
        raise Exception("VertexAI requires a Google Cloud project to be set.")
112
+
113
+
101
114
  class LLM(BaseModel):
102
115
  provider: LLMProvider
103
116
  model: str
@@ -110,6 +123,8 @@ class LLM(BaseModel):
110
123
  case "google":
111
124
  if not settings.GOOGLE_API_KEY:
112
125
  raise Exception(f"{name} requires GOOGLE_API_KEY in .env")
126
+ case "vertexai":
127
+ validate_vertex_ai_credentials()
113
128
  case "openrouter":
114
129
  if not settings.OPEN_ROUTER_API_KEY:
115
130
  raise Exception(f"{name} requires OPEN_ROUTER_API_KEY in .env")
@@ -170,7 +185,7 @@ def get_default_llm_config() -> LLMConfig:
170
185
  try:
171
186
  if not os.path.exists(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME):
172
187
  raise Exception("Default llm config not found")
173
- with open(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME, "r") as f:
188
+ with open(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME) as f:
174
189
  default_config_dict = load_jsonc(f)
175
190
  return LLMConfig.model_validate(default_config_dict["default"])
176
191
  except Exception as e:
@@ -211,7 +226,7 @@ def parse_llm_config() -> LLMConfig:
211
226
  override_config_dict = {}
212
227
  if os.path.exists(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME):
213
228
  logger.info("Loading custom llm config...")
214
- with open(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME, "r") as f:
229
+ with open(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME) as f:
215
230
  override_config_dict = load_jsonc(f)
216
231
  else:
217
232
  logger.warning("Custom llm config not found, loading default config")
@@ -237,7 +252,7 @@ def initialize_llm_config() -> LLMConfig:
237
252
 
238
253
  class OutputConfig(BaseModel):
239
254
  structured_output: Annotated[
240
- Optional[Union[type[BaseModel], dict]],
255
+ type[BaseModel] | dict | None,
241
256
  Field(
242
257
  default=None,
243
258
  description=(
@@ -247,7 +262,7 @@ class OutputConfig(BaseModel):
247
262
  ),
248
263
  ]
249
264
  output_description: Annotated[
250
- Optional[str],
265
+ str | None,
251
266
  Field(
252
267
  default=None,
253
268
  description=(
@@ -6,12 +6,11 @@ Uses ContextVar to avoid prop drilling and maintain clean function signatures.
6
6
 
7
7
  from enum import Enum
8
8
  from pathlib import Path
9
- from typing import Optional
10
9
 
11
10
  from adbutils import AdbClient
12
11
  from openai import BaseModel
13
12
  from pydantic import ConfigDict
14
- from typing_extensions import Literal
13
+ from typing import Literal
15
14
 
16
15
  from minitap.mobile_use.clients.device_hardware_client import DeviceHardwareClient
17
16
  from minitap.mobile_use.clients.screen_api_client import ScreenApiClient
@@ -56,8 +55,8 @@ class MobileUseContext(BaseModel):
56
55
  hw_bridge_client: DeviceHardwareClient
57
56
  screen_api_client: ScreenApiClient
58
57
  llm_config: LLMConfig
59
- adb_client: Optional[AdbClient] = None
60
- execution_setup: Optional[ExecutionSetup] = None
58
+ adb_client: AdbClient | None = None
59
+ execution_setup: ExecutionSetup | None = None
61
60
 
62
61
  def get_adb_client(self) -> AdbClient:
63
62
  if self.adb_client is None: