minitap-mobile-use 2.1.0__tar.gz → 2.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of minitap-mobile-use might be problematic. Click here for more details.
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/PKG-INFO +28 -8
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/README.md +22 -3
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/contextor/contextor.py +4 -2
- minitap_mobile_use-2.3.0/minitap/mobile_use/agents/cortex/cortex.md +148 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/cortex/cortex.py +1 -2
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/executor/executor.md +6 -4
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/executor/executor.py +3 -1
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/executor/utils.py +2 -1
- minitap_mobile_use-2.3.0/minitap/mobile_use/agents/outputter/test_outputter.py +169 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/planner/planner.md +1 -1
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/planner/planner.py +4 -2
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/config.py +16 -1
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/controllers/mobile_command_controller.py +4 -4
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/main.py +2 -2
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/agent.py +17 -8
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/builders/agent_config_builder.py +2 -2
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/types/exceptions.py +30 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/utils.py +3 -2
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/servers/device_hardware_bridge.py +2 -1
- minitap_mobile_use-2.3.0/minitap/mobile_use/servers/utils.py +8 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/services/llm.py +23 -6
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/index.py +21 -15
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/clear_text.py +73 -25
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/copy_text_from.py +7 -5
- minitap_mobile_use-2.1.0/minitap/mobile_use/tools/mobile/take_screenshot.py → minitap_mobile_use-2.3.0/minitap/mobile_use/tools/mobile/glimpse_screen.py +15 -11
- minitap_mobile_use-2.3.0/minitap/mobile_use/tools/mobile/input_text.py +178 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/paste_text.py +34 -8
- minitap_mobile_use-2.3.0/minitap/mobile_use/tools/mobile/swipe.py +150 -0
- minitap_mobile_use-2.3.0/minitap/mobile_use/tools/test_utils.py +351 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/tool_wrapper.py +5 -0
- minitap_mobile_use-2.3.0/minitap/mobile_use/tools/utils.py +193 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/recorder.py +2 -9
- minitap_mobile_use-2.3.0/minitap/mobile_use/utils/test_ui_hierarchy.py +178 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/ui_hierarchy.py +2 -2
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/pyproject.toml +41 -6
- minitap_mobile_use-2.1.0/minitap/mobile_use/agents/cortex/cortex.md +0 -102
- minitap_mobile_use-2.1.0/minitap/mobile_use/agents/outputter/test_outputter.py +0 -107
- minitap_mobile_use-2.1.0/minitap/mobile_use/servers/utils.py +0 -11
- minitap_mobile_use-2.1.0/minitap/mobile_use/tools/mobile/input_text.py +0 -97
- minitap_mobile_use-2.1.0/minitap/mobile_use/tools/mobile/swipe.py +0 -52
- minitap_mobile_use-2.1.0/minitap/mobile_use/tools/utils.py +0 -86
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/LICENSE +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/__init__.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/cortex/types.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/executor/tool_node.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/hopper/hopper.md +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/hopper/hopper.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/orchestrator/human.md +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/orchestrator/orchestrator.md +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/orchestrator/orchestrator.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/orchestrator/types.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/outputter/human.md +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/outputter/outputter.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/planner/human.md +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/planner/types.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/planner/utils.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/summarizer/summarizer.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/clients/device_hardware_client.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/clients/ios_client.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/clients/screen_api_client.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/constants.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/context.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/controllers/__init__.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/controllers/platform_specific_commands_controller.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/graph/graph.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/graph/state.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/__init__.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/builders/__init__.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/builders/index.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/builders/task_request_builder.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/constants.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/examples/README.md +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/examples/__init__.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/examples/simple_photo_organizer.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/examples/smart_notification_assistant.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/types/__init__.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/types/agent.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/sdk/types/task.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/servers/config.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/servers/device_screen_api.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/servers/start_servers.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/servers/stop_servers.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/services/accessibility.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/back.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/erase_one_char.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/find_packages.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/launch_app.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/long_press_on.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/open_link.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/press_key.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/stop_app.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/tap.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/cli_helpers.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/cli_selection.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/conversations.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/decorators.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/errors.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/file.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/logger.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/media.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/requests_utils.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/shell_utils.py +0 -0
- {minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/utils/time.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: minitap-mobile-use
|
|
3
|
-
Version: 2.1.0
|
|
3
|
+
Version: 2.3.0
|
|
4
4
|
Summary: AI-powered multi-agent system that automates real Android and iOS devices through low-level control using LangGraph.
|
|
5
5
|
Author: Pierre-Louis Favreau, Jean-Pierre Lo, Nicolas Dehandschoewercker
|
|
6
6
|
License: MIT License
|
|
@@ -24,11 +24,11 @@ License: MIT License
|
|
|
24
24
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
25
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
26
|
SOFTWARE.
|
|
27
|
-
Requires-Dist: langgraph
|
|
27
|
+
Requires-Dist: langgraph>=0.6.6
|
|
28
28
|
Requires-Dist: adbutils==2.9.3
|
|
29
|
-
Requires-Dist: langchain-google-genai
|
|
30
|
-
Requires-Dist: langchain
|
|
31
|
-
Requires-Dist: langchain-core
|
|
29
|
+
Requires-Dist: langchain-google-genai>=2.1.10
|
|
30
|
+
Requires-Dist: langchain>=0.3.27
|
|
31
|
+
Requires-Dist: langchain-core>=0.3.75
|
|
32
32
|
Requires-Dist: jinja2==3.1.6
|
|
33
33
|
Requires-Dist: python-dotenv==1.1.1
|
|
34
34
|
Requires-Dist: pydantic-settings==2.10.1
|
|
@@ -42,6 +42,7 @@ Requires-Dist: fastapi==0.111.0
|
|
|
42
42
|
Requires-Dist: uvicorn[standard]==0.30.1
|
|
43
43
|
Requires-Dist: colorama>=0.4.6
|
|
44
44
|
Requires-Dist: psutil>=5.9.0
|
|
45
|
+
Requires-Dist: langchain-google-vertexai>=2.0.28
|
|
45
46
|
Requires-Dist: ruff==0.5.3 ; extra == 'dev'
|
|
46
47
|
Requires-Dist: pytest==8.4.1 ; extra == 'dev'
|
|
47
48
|
Requires-Dist: pytest-cov==5.0.0 ; extra == 'dev'
|
|
@@ -55,7 +56,7 @@ Description-Content-Type: text/markdown
|
|
|
55
56
|
|
|
56
57
|
<div align="center">
|
|
57
58
|
|
|
58
|
-

|
|
59
60
|
|
|
60
61
|
</div>
|
|
61
62
|
|
|
@@ -69,6 +70,10 @@ Description-Content-Type: text/markdown
|
|
|
69
70
|
<a href="https://x.com/minitap_ai?t=iRWtI497UhRGLeCKYQekig&s=09"><b>Twitter / X</b></a>
|
|
70
71
|
</p>
|
|
71
72
|
|
|
73
|
+
[](https://pypi.org/project/minitap-mobile-use/)
|
|
74
|
+
[](https://www.python.org/downloads/)
|
|
75
|
+
[](https://github.com/minitap-ai/mobile-use/blob/main/LICENSE)
|
|
76
|
+
|
|
72
77
|
</div>
|
|
73
78
|
|
|
74
79
|
Mobile-use is a powerful, open-source AI agent that controls your Android or iOS device using natural language. It understands your commands and interacts with the UI to perform tasks, from sending messages to navigating complex apps.
|
|
@@ -78,7 +83,7 @@ Mobile-use is a powerful, open-source AI agent that controls your Android or IOS
|
|
|
78
83
|
## ✨ Features
|
|
79
84
|
|
|
80
85
|
- 🗣️ **Natural Language Control**: Interact with your phone using your native language.
|
|
81
|
-
- 📱 **UI-Aware Automation**: Intelligently navigates through app interfaces.
|
|
86
|
+
- 📱 **UI-Aware Automation**: Intelligently navigates through app interfaces (note: currently has limited effectiveness with games as they don't provide accessibility tree data).
|
|
82
87
|
- 📊 **Data Scraping**: Extract information from any app and structure it into your desired format (e.g., JSON) using a natural language description.
|
|
83
88
|
- 🔧 **Extensible & Customizable**: Easily configure different LLMs to power the agents that power mobile-use.
|
|
84
89
|
|
|
@@ -107,11 +112,26 @@ Ready to automate your mobile experience? Follow these steps to get mobile-use u
|
|
|
107
112
|
|
|
108
113
|
2. **(Optional) Customize LLM Configuration:**
|
|
109
114
|
To use different models or providers, create your own LLM configuration file.
|
|
115
|
+
|
|
110
116
|
```bash
|
|
111
117
|
cp llm-config.override.template.jsonc llm-config.override.jsonc
|
|
112
118
|
```
|
|
119
|
+
|
|
113
120
|
Then, edit `llm-config.override.jsonc` to fit your needs.
|
|
114
121
|
|
|
122
|
+
You can also use local LLMs or any other OpenAI-API-compatible providers:
|
|
123
|
+
|
|
124
|
+
1. Set `OPENAI_BASE_URL` and `OPENAI_API_KEY` in your `.env`
|
|
125
|
+
2. In your `llm-config.override.jsonc`, set `openai` as the provider for the agent nodes you want, and choose a model supported by your provider.
|
|
126
|
+
|
|
127
|
+
> [!NOTE]
|
|
128
|
+
> If you want to use Google Vertex AI, you must either:
|
|
129
|
+
>
|
|
130
|
+
> - Have credentials configured for your environment (gcloud, workload identity, etc…)
|
|
131
|
+
> - Store the path to a service account JSON file as the GOOGLE_APPLICATION_CREDENTIALS environment variable
|
|
132
|
+
>
|
|
133
|
+
> More information: - [Credential types](https://cloud.google.com/docs/authentication/application-default-credentials#GAC) - [google.auth API reference](https://googleapis.dev/python/google-auth/latest/reference/google.auth.html#module-google.auth)
|
|
134
|
+
|
|
115
135
|
### Quick Launch (Docker)
|
|
116
136
|
|
|
117
137
|
> [!NOTE]
|
|
@@ -132,7 +152,7 @@ Then run in your terminal:
|
|
|
132
152
|
|
|
133
153
|
```bash
|
|
134
154
|
chmod +x mobile-use.sh
|
|
135
|
-
./mobile-use.sh \
|
|
155
|
+
bash ./mobile-use.sh \
|
|
136
156
|
"Open Gmail, find first 3 unread emails, and list their sender and subject line" \
|
|
137
157
|
--output-description "A JSON list of objects, each with 'sender' and 'subject' keys"
|
|
138
158
|
```
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
<div align="center">
|
|
4
4
|
|
|
5
|
-

|
|
6
6
|
|
|
7
7
|
</div>
|
|
8
8
|
|
|
@@ -16,6 +16,10 @@
|
|
|
16
16
|
<a href="https://x.com/minitap_ai?t=iRWtI497UhRGLeCKYQekig&s=09"><b>Twitter / X</b></a>
|
|
17
17
|
</p>
|
|
18
18
|
|
|
19
|
+
[](https://pypi.org/project/minitap-mobile-use/)
|
|
20
|
+
[](https://www.python.org/downloads/)
|
|
21
|
+
[](https://github.com/minitap-ai/mobile-use/blob/main/LICENSE)
|
|
22
|
+
|
|
19
23
|
</div>
|
|
20
24
|
|
|
21
25
|
Mobile-use is a powerful, open-source AI agent that controls your Android or iOS device using natural language. It understands your commands and interacts with the UI to perform tasks, from sending messages to navigating complex apps.
|
|
@@ -25,7 +29,7 @@ Mobile-use is a powerful, open-source AI agent that controls your Android or IOS
|
|
|
25
29
|
## ✨ Features
|
|
26
30
|
|
|
27
31
|
- 🗣️ **Natural Language Control**: Interact with your phone using your native language.
|
|
28
|
-
- 📱 **UI-Aware Automation**: Intelligently navigates through app interfaces.
|
|
32
|
+
- 📱 **UI-Aware Automation**: Intelligently navigates through app interfaces (note: currently has limited effectiveness with games as they don't provide accessibility tree data).
|
|
29
33
|
- 📊 **Data Scraping**: Extract information from any app and structure it into your desired format (e.g., JSON) using a natural language description.
|
|
30
34
|
- 🔧 **Extensible & Customizable**: Easily configure different LLMs to power the agents that power mobile-use.
|
|
31
35
|
|
|
@@ -54,11 +58,26 @@ Ready to automate your mobile experience? Follow these steps to get mobile-use u
|
|
|
54
58
|
|
|
55
59
|
2. **(Optional) Customize LLM Configuration:**
|
|
56
60
|
To use different models or providers, create your own LLM configuration file.
|
|
61
|
+
|
|
57
62
|
```bash
|
|
58
63
|
cp llm-config.override.template.jsonc llm-config.override.jsonc
|
|
59
64
|
```
|
|
65
|
+
|
|
60
66
|
Then, edit `llm-config.override.jsonc` to fit your needs.
|
|
61
67
|
|
|
68
|
+
You can also use local LLMs or any other OpenAI-API-compatible providers:
|
|
69
|
+
|
|
70
|
+
1. Set `OPENAI_BASE_URL` and `OPENAI_API_KEY` in your `.env`
|
|
71
|
+
2. In your `llm-config.override.jsonc`, set `openai` as the provider for the agent nodes you want, and choose a model supported by your provider.
|
|
72
|
+
|
|
73
|
+
> [!NOTE]
|
|
74
|
+
> If you want to use Google Vertex AI, you must either:
|
|
75
|
+
>
|
|
76
|
+
> - Have credentials configured for your environment (gcloud, workload identity, etc…)
|
|
77
|
+
> - Store the path to a service account JSON file as the GOOGLE_APPLICATION_CREDENTIALS environment variable
|
|
78
|
+
>
|
|
79
|
+
> More information: - [Credential types](https://cloud.google.com/docs/authentication/application-default-credentials#GAC) - [google.auth API reference](https://googleapis.dev/python/google-auth/latest/reference/google.auth.html#module-google.auth)
|
|
80
|
+
|
|
62
81
|
### Quick Launch (Docker)
|
|
63
82
|
|
|
64
83
|
> [!NOTE]
|
|
@@ -79,7 +98,7 @@ Then run in your terminal:
|
|
|
79
98
|
|
|
80
99
|
```bash
|
|
81
100
|
chmod +x mobile-use.sh
|
|
82
|
-
./mobile-use.sh \
|
|
101
|
+
bash ./mobile-use.sh \
|
|
83
102
|
"Open Gmail, find first 3 unread emails, and list their sender and subject line" \
|
|
84
103
|
--output-description "A JSON list of objects, each with 'sender' and 'subject' keys"
|
|
85
104
|
```
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from minitap.mobile_use.agents.executor.utils import is_last_tool_message_take_screenshot
|
|
2
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
2
3
|
from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data
|
|
3
4
|
from minitap.mobile_use.controllers.platform_specific_commands_controller import (
|
|
4
5
|
get_device_date,
|
|
@@ -7,7 +8,6 @@ from minitap.mobile_use.controllers.platform_specific_commands_controller import
|
|
|
7
8
|
from minitap.mobile_use.graph.state import State
|
|
8
9
|
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
9
10
|
from minitap.mobile_use.utils.logger import get_logger
|
|
10
|
-
from minitap.mobile_use.context import MobileUseContext
|
|
11
11
|
|
|
12
12
|
logger = get_logger(__name__)
|
|
13
13
|
|
|
@@ -26,7 +26,9 @@ class ContextorNode:
|
|
|
26
26
|
focused_app_info = get_focused_app_info(self.ctx)
|
|
27
27
|
device_date = get_device_date(self.ctx)
|
|
28
28
|
|
|
29
|
-
should_add_screenshot_context = is_last_tool_message_take_screenshot(
|
|
29
|
+
should_add_screenshot_context = is_last_tool_message_take_screenshot(
|
|
30
|
+
list(state.executor_messages)
|
|
31
|
+
)
|
|
30
32
|
|
|
31
33
|
return state.sanitize_update(
|
|
32
34
|
ctx=self.ctx,
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
## You are the **Cortex**
|
|
2
|
+
|
|
3
|
+
Your job is to **analyze the current {{ platform }} mobile device state** and produce **structured decisions** to achieve the current subgoal and more consecutive subgoals if possible.
|
|
4
|
+
|
|
5
|
+
You must act like a human brain, responsible for giving instructions to your hands (the **Executor** agent). Therefore, you must act with the same imprecision and uncertainty as a human when performing swipe actions: humans don't know where exactly they are swiping (always prefer percentages of width and height instead of absolute coordinates), they just know they are swiping up or down, left or right, and with how much force (usually amplified compared to what's truly needed - overshoot sliders, for instance).
|
|
6
|
+
|
|
7
|
+
### Core Principle: Break Unproductive Cycles
|
|
8
|
+
|
|
9
|
+
Your highest priority is to recognize when you are not making progress. You are in an unproductive cycle if a **sequence of actions brings you back to a previous state without achieving the subgoal.**
|
|
10
|
+
|
|
11
|
+
If you detect a cycle, you are **FORBIDDEN** from repeating it. You must pivot your strategy.
|
|
12
|
+
|
|
13
|
+
1. **Announce the Pivot:** In your `agent_thought`, you must briefly state which workflow is failing and what your new approach is.
|
|
14
|
+
|
|
15
|
+
2. **Find a Simpler Path:** Abandon the current workflow. Ask yourself: **"How would a human do this if this feature didn't exist?"** This usually means relying on fundamental actions like scrolling, swiping, or navigating through menus manually.
|
|
16
|
+
|
|
17
|
+
3. **Retreat as a Last Resort:** If no simpler path exists, declare the subgoal a failure to trigger a replan.
|
|
18
|
+
|
|
19
|
+
### How to Perceive the Screen: A Two-Sense Approach
|
|
20
|
+
|
|
21
|
+
To understand the device state, you have two senses, each with its purpose:
|
|
22
|
+
|
|
23
|
+
1. **UI Hierarchy (Your sense of "Touch"):**
|
|
24
|
+
* **What it is:** A structured list of all elements on the screen.
|
|
25
|
+
* **Use it for:** Finding elements by `resource-id`, checking for specific text, and understanding the layout structure.
|
|
26
|
+
* **Limitation:** It does NOT tell you what the screen *looks* like. It can be incomplete, and it contains no information about images, colors, or whether an element is visually obscured.
|
|
27
|
+
|
|
28
|
+
2. **`glimpse_screen` (Your sense of "Sight"):**
|
|
29
|
+
* **What it is:** A tool that provides a real, up-to-date image of the screen.
|
|
30
|
+
* **Use it for:** Confirming what is actually visible. This is your source of TRUTH for all visual information (icons, images, element positions, colors).
|
|
31
|
+
* **Golden Rule:** When the UI hierarchy is ambiguous, seems incomplete, or when you need to verify a visual detail before acting, **`glimpse_screen` is always the most effective and reliable action.** Never guess what the screen looks like; use your sight to be sure.
|
|
32
|
+
|
|
33
|
+
**CRITICAL NOTE ON SIGHT:** The visual information from `glimpse_screen` is **ephemeral**. It is available for **THIS decision turn ONLY**. You MUST extract all necessary information from it IMMEDIATELY, as it will be cleared before the next step.
|
|
34
|
+
### Context You Receive:
|
|
35
|
+
|
|
36
|
+
- 📱 **Device state**:
|
|
37
|
+
- Latest **UI hierarchy** and (if available) a **screenshot**.
|
|
38
|
+
- **CRITICAL NOTE ON SIGHT:** The visual information from `glimpse_screen` is **ephemeral**. It is available for **THIS decision turn ONLY**. You MUST extract all necessary information from it IMMEDIATELY, as it will be cleared before the next step.
|
|
39
|
+
|
|
40
|
+
- 🧭 **Task context**:
|
|
41
|
+
- The user's **initial goal**
|
|
42
|
+
- The **subgoal plan** with their statuses
|
|
43
|
+
- The **current subgoal** (the one in `PENDING` in the plan)
|
|
44
|
+
- A list of **agent thoughts** (previous reasoning, observations about the environment)
|
|
45
|
+
- **Executor agent feedback** on the latest UI decisions
|
|
46
|
+
|
|
47
|
+
### Your Mission:
|
|
48
|
+
|
|
49
|
+
Focus on the **current PENDING subgoal and the next subgoals not yet started**.
|
|
50
|
+
|
|
51
|
+
**CRITICAL: Before making any decision, you MUST thoroughly analyze the agent thoughts history to:**
|
|
52
|
+
- **Detect patterns of failure or repeated attempts** that suggest the current approach isn't working
|
|
53
|
+
- **Identify contradictions** between what was planned and what actually happened
|
|
54
|
+
- **Spot errors in previous reasoning** that need to be corrected
|
|
55
|
+
- **Learn from successful strategies** used in similar situations
|
|
56
|
+
- **Avoid repeating failed approaches** by recognizing when to change strategy
|
|
57
|
+
|
|
58
|
+
1. **Analyze the agent thoughts first** - Review all previous agent thoughts to understand:
|
|
59
|
+
- What strategies have been tried and their outcomes
|
|
60
|
+
- Any errors or misconceptions in previous reasoning
|
|
61
|
+
- Patterns that indicate success or failure
|
|
62
|
+
- Whether the current approach should be continued or modified
|
|
63
|
+
|
|
64
|
+
2. **Then analyze the UI** and environment to understand what action is required, but always in the context of what the agent thoughts reveal about the situation.
|
|
65
|
+
|
|
66
|
+
3. If some of the subgoals must be **completed** based on your observations, add them to `complete_subgoals_by_ids`. To justify your conclusion, you will fill in the `agent_thought` field based on:
|
|
67
|
+
|
|
68
|
+
- The current UI state
|
|
69
|
+
- **Critical analysis of past agent thoughts and their accuracy**
|
|
70
|
+
- Recent tool effects and whether they matched expectations from agent thoughts
|
|
71
|
+
- **Any corrections needed to previous reasoning or strategy**
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
### The Rule of Element Interaction
|
|
75
|
+
|
|
76
|
+
**You MUST follow it for every element interaction.**
|
|
77
|
+
|
|
78
|
+
When you target a UI element (for a `tap`, `input_text`, `clear_text`, etc.), you **MUST** provide a comprehensive target object containing every piece of information you can find about it.
|
|
79
|
+
|
|
80
|
+
* **1. `resource_id`**: Include this if it is present in the UI hierarchy.
|
|
81
|
+
* **2. `coordinates`**: Include the full bounds (`x`, `y`, `width`, `height`) if they are available.
|
|
82
|
+
* **3. `text`**: Include the *current text* content of the element (e.g., "Sign In", "Search...", "First Name").
|
|
83
|
+
|
|
84
|
+
**This is NOT optional.** Providing all three locators, whenever they are available, is the foundation of the system's reliability. It allows the next steps to use a fallback mechanism: if the ID fails, they try the coordinates, etc. Failing to provide this complete context will lead to action failures.
|
|
85
|
+
|
|
86
|
+
### Outputting Your Decisions
|
|
87
|
+
|
|
88
|
+
If you decide to act, output a **valid JSON stringified structured set of instructions** for the Executor.
|
|
89
|
+
|
|
90
|
+
- These must be **concrete low-level actions**.
|
|
91
|
+
- The executor has the following available tools: {{ executor_tools_list }}.
|
|
92
|
+
- Your goal is to achieve subgoals **fast** - so you must put as many actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
|
|
93
|
+
- To open URLs/links directly, use the `open_link` tool - it will automatically handle opening in the appropriate browser. It also handles deep links.
|
|
94
|
+
- When you need to open an app, use the `find_packages` low-level action to try and get its name. Then, simply use the `launch_app` low-level action to launch it.
|
|
95
|
+
- **Always use a single `input_text` action** to type in a field. This tool handles focusing the element and placing the cursor correctly. If the tool feedback indicates verification is needed or shows None/empty content, perform verification before proceeding.
|
|
96
|
+
- **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed**.
|
|
97
|
+
- **For text clearing**: When you need to completely clear text from an input field, always call the `clear_text` tool with the correct resource_id. This tool automatically focuses the element, and ensures the field is emptied. If you notice this tool fails to clear the text, try to long press the input, select all, and call `erase_one_char`.
|
|
98
|
+
|
|
99
|
+
### Output
|
|
100
|
+
|
|
101
|
+
- **complete_subgoals_by_ids** _(optional)_:
|
|
102
|
+
A list of subgoal IDs that should be marked as completed.
|
|
103
|
+
|
|
104
|
+
- **Structured Decisions** _(optional)_:
|
|
105
|
+
A **valid stringified JSON** describing what should be executed **right now** to advance through the subgoals as much as possible.
|
|
106
|
+
|
|
107
|
+
- **Agent Thought** _(2-4 sentences)_:
|
|
108
|
+
**MANDATORY: Start by analyzing previous agent thoughts** - Did previous reasoning contain errors? Are we repeating failed approaches? What worked before in similar situations?
|
|
109
|
+
|
|
110
|
+
Then explain your current decision based on this analysis. If there is any information you need to remember for later steps, you must include it here, because only the agent thoughts will be used to produce the final structured output.
|
|
111
|
+
|
|
112
|
+
This also helps other agents understand your decision and learn from future failures. **Explicitly mention if you're correcting a previous error or changing strategy based on agent thoughts analysis.**
|
|
113
|
+
You must also use this field to mention checkpoints when you perform actions without a definite ending: for instance "Swiping up to reveal more recipes - last seen recipe was <ID or NAME>, stop when no more".
|
|
114
|
+
|
|
115
|
+
**Important:** `complete_subgoals_by_ids` and the structured decisions are mutually exclusive: if you provide both, the structured decisions will be ignored. Therefore, you must always prioritize completing subgoals over providing structured decisions.
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
### Example
|
|
120
|
+
|
|
121
|
+
#### Current Subgoal:
|
|
122
|
+
|
|
123
|
+
> "Search for Alice in WhatsApp"
|
|
124
|
+
|
|
125
|
+
#### Structured Decisions:
|
|
126
|
+
|
|
127
|
+
```text
|
|
128
|
+
"{\"action\": \"tap\", \"target\": {\"text_input_resource_id\": \"com.whatsapp:id/menuitem_search\", \"text_input_coordinates\": {\"x\": 880, \"y\": 150, \"width\": 120, \"height\": 120}, \"text_input_text\": \"Search\"}}"
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
#### Agent Thought:
|
|
132
|
+
|
|
133
|
+
> Analysis: No previous attempts, this is a fresh approach. I will tap the search icon to begin searching. I am providing its resource_id, coordinates, and text content to ensure the Executor can find it reliably, following the element rule.
|
|
134
|
+
|
|
135
|
+
### Input
|
|
136
|
+
|
|
137
|
+
**Initial Goal:**
|
|
138
|
+
{{ initial_goal }}
|
|
139
|
+
|
|
140
|
+
**Subgoal Plan:**
|
|
141
|
+
{{ subgoal_plan }}
|
|
142
|
+
|
|
143
|
+
**Current Subgoal (what needs to be done right now):**
|
|
144
|
+
{{ current_subgoal }}
|
|
145
|
+
|
|
146
|
+
**Executor agent feedback on latest UI decisions:**
|
|
147
|
+
|
|
148
|
+
{{ executor_feedback }}
|
{minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/cortex/cortex.py
RENAMED
|
@@ -44,9 +44,8 @@ class CortexNode:
|
|
|
44
44
|
initial_goal=state.initial_goal,
|
|
45
45
|
subgoal_plan=state.subgoal_plan,
|
|
46
46
|
current_subgoal=get_current_subgoal(state.subgoal_plan),
|
|
47
|
-
agents_thoughts=state.agents_thoughts,
|
|
48
47
|
executor_feedback=executor_feedback,
|
|
49
|
-
executor_tools_list=format_tools_list(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
|
|
48
|
+
executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
|
|
50
49
|
)
|
|
51
50
|
messages = [
|
|
52
51
|
SystemMessage(content=system_message),
|
{minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/executor/executor.md
RENAMED
|
@@ -28,7 +28,6 @@ and your previous actions, you must:
|
|
|
28
28
|
{
|
|
29
29
|
"action": "tap",
|
|
30
30
|
"target": {
|
|
31
|
-
"text": "Alice",
|
|
32
31
|
"resource_id": "com.whatsapp:id/conversation_item"
|
|
33
32
|
}
|
|
34
33
|
}
|
|
@@ -39,7 +38,6 @@ and your previous actions, you must:
|
|
|
39
38
|
Call the `tap_on_element` tool with:
|
|
40
39
|
|
|
41
40
|
- `resource_id = "com.whatsapp:id/conversation_item"`
|
|
42
|
-
- `text = "Alice"`
|
|
43
41
|
- `agent_thought = "I'm tapping on the chat item labeled 'Alice' to open the conversation."`
|
|
44
42
|
|
|
45
43
|
---
|
|
@@ -55,10 +53,14 @@ Call the `tap_on_element` tool with:
|
|
|
55
53
|
|
|
56
54
|
When using the `input_text` tool:
|
|
57
55
|
|
|
58
|
-
- **
|
|
56
|
+
- **Provide all available information** from the following optional parameters to identify the text input element:
|
|
57
|
+
- `text_input_resource_id`: The resource ID of the text input element (when available)
|
|
58
|
+
- `text_input_coordinates`: The bounds (ElementBounds) of the text input element (when available)
|
|
59
|
+
- `text_input_text`: The current text content of the text input element (when available)
|
|
60
|
+
|
|
59
61
|
- The tool will automatically:
|
|
60
62
|
|
|
61
|
-
1. **Focus the element
|
|
63
|
+
1. **Focus the element** using the provided identification parameters
|
|
62
64
|
2. **Move the cursor to the end** of the existing text
|
|
63
65
|
3. **Then type the new text**
|
|
64
66
|
|
{minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/executor/executor.py
RENAMED
|
@@ -3,6 +3,8 @@ from pathlib import Path
|
|
|
3
3
|
from jinja2 import Template
|
|
4
4
|
from langchain_core.messages import HumanMessage, SystemMessage
|
|
5
5
|
from langchain_google_genai import ChatGoogleGenerativeAI
|
|
6
|
+
from langchain_google_vertexai.chat_models import ChatVertexAI
|
|
7
|
+
|
|
6
8
|
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
7
9
|
from minitap.mobile_use.context import MobileUseContext
|
|
8
10
|
from minitap.mobile_use.graph.state import State
|
|
@@ -56,7 +58,7 @@ class ExecutorNode:
|
|
|
56
58
|
}
|
|
57
59
|
|
|
58
60
|
# ChatGoogleGenerativeAI and ChatVertexAI do not support the "parallel_tool_calls" keyword
|
|
59
|
-
if not isinstance(llm, ChatGoogleGenerativeAI):
|
|
61
|
+
if not isinstance(llm, ChatGoogleGenerativeAI | ChatVertexAI):
|
|
60
62
|
llm_bind_tools_kwargs["parallel_tool_calls"] = True
|
|
61
63
|
|
|
62
64
|
llm = llm.bind_tools(**llm_bind_tools_kwargs)
|
{minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/executor/utils.py
RENAMED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from langchain_core.messages import BaseMessage
|
|
2
|
+
|
|
2
3
|
from minitap.mobile_use.utils.conversations import is_tool_message
|
|
3
4
|
|
|
4
5
|
|
|
@@ -7,5 +8,5 @@ def is_last_tool_message_take_screenshot(messages: list[BaseMessage]) -> bool:
|
|
|
7
8
|
return False
|
|
8
9
|
for msg in messages[::-1]:
|
|
9
10
|
if is_tool_message(msg):
|
|
10
|
-
return msg.name == "
|
|
11
|
+
return msg.name == "glimpse_screen"
|
|
11
12
|
return False
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from unittest.mock import AsyncMock, Mock, patch
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
sys.modules["langgraph.prebuilt.chat_agent_executor"] = Mock()
|
|
8
|
+
sys.modules["minitap.mobile_use.graph.state"] = Mock()
|
|
9
|
+
sys.modules["langchain_google_vertexai"] = Mock()
|
|
10
|
+
sys.modules["langchain_google_genai"] = Mock()
|
|
11
|
+
sys.modules["langchain_openai"] = Mock()
|
|
12
|
+
sys.modules["langchain_cerebras"] = Mock()
|
|
13
|
+
|
|
14
|
+
from minitap.mobile_use.agents.outputter.outputter import outputter # noqa: E402
|
|
15
|
+
from minitap.mobile_use.config import LLM, OutputConfig # noqa: E402
|
|
16
|
+
from minitap.mobile_use.context import MobileUseContext # noqa: E402
|
|
17
|
+
from minitap.mobile_use.utils.logger import get_logger # noqa: E402
|
|
18
|
+
|
|
19
|
+
logger = get_logger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class MockPydanticSchema(BaseModel):
    """Pydantic schema used as the structured-output target in these tests."""

    color: str
    price: float
    currency_symbol: str
    website_url: str
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Plain-dict value passed as `structured_output` in the dictionary-based test.
mock_dict = dict(
    color="green",
    price=20,
    currency_symbol="$",
    website_url="http://superwebsite.fr",
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DummyState:
    """Minimal state stand-in that carries only three attributes."""

    def __init__(self, messages, initial_goal, agents_thoughts):
        # Store the constructor arguments unchanged on the instance.
        self.messages, self.initial_goal, self.agents_thoughts = (
            messages,
            initial_goal,
            agents_thoughts,
        )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Module-level sample state (not referenced by the tests visible in this module).
mocked_state = DummyState(
    [],
    "Find a green product on my website",
    [
        "Going on http://superwebsite.fr",
        "Searching for products",
        "Filtering by color",
        "Color 'green' found for a 20 dollars product",
    ],
)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@pytest.fixture
def mock_context():
    """Build a ``MobileUseContext``-spec'd mock with LLM settings and stubbed clients."""
    agent_names = ("executor", "cortex", "planner", "orchestrator")
    context = Mock(spec=MobileUseContext)
    # Every agent gets its own LLM entry with the same provider and model.
    context.llm_config = {
        agent: LLM(provider="openai", model="gpt-5-nano") for agent in agent_names
    }
    # Each of these attributes is replaced by an independent bare Mock.
    for attribute in ("device", "hw_bridge_client", "screen_api_client"):
        setattr(context, attribute, Mock())
    return context
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@pytest.fixture
def mock_state():
    """Provide a fresh ``DummyState`` holding the sample goal and agent thoughts."""
    thoughts = [
        "Going on http://superwebsite.fr",
        "Searching for products",
        "Filtering by color",
        "Color 'green' found for a 20 dollars product",
    ]
    goal = "Find a green product on my website"
    return DummyState(messages=[], initial_goal=goal, agents_thoughts=thoughts)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@patch("minitap.mobile_use.agents.outputter.outputter.get_llm")
@pytest.mark.asyncio
async def test_outputter_with_pydantic_model(mock_get_llm, mock_context, mock_state):
    """Check that a Pydantic ``structured_output`` yields a dict with every schema field."""
    # AsyncMock so that `ainvoke` can be awaited; it returns a schema instance.
    mock_structured_llm = AsyncMock()
    mock_structured_llm.ainvoke.return_value = MockPydanticSchema(
        color="green", price=20, currency_symbol="$", website_url="http://superwebsite.fr"
    )

    # The base LLM hands out the structured LLM through `with_structured_output`.
    mock_llm = Mock()
    mock_llm.with_structured_output.return_value = mock_structured_llm
    mock_get_llm.return_value = mock_llm

    config = OutputConfig(
        structured_output=MockPydanticSchema,
        output_description=None,
    )

    result = await outputter(ctx=mock_context, output_config=config, graph_output=mock_state)

    # Check all four fields, as the sibling dict and natural-language tests do.
    assert isinstance(result, dict)
    assert result.get("color") == "green"
    assert result.get("price") == 20
    assert result.get("currency_symbol") == "$"
    assert result.get("website_url") == "http://superwebsite.fr"
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@patch("minitap.mobile_use.agents.outputter.outputter.get_llm")
@pytest.mark.asyncio
async def test_outputter_with_dict(mock_get_llm, mock_context, mock_state):
    """Verify the outputter result when ``structured_output`` is a plain dict."""
    # Reply handed back by the mocked structured LLM.
    llm_reply = {
        "color": "green",
        "price": 20,
        "currency_symbol": "$",
        "website_url": "http://superwebsite.fr",
    }
    structured_llm = AsyncMock()
    structured_llm.ainvoke.return_value = llm_reply

    # The base LLM exposes the structured LLM through `with_structured_output`.
    base_llm = Mock()
    base_llm.with_structured_output.return_value = structured_llm
    mock_get_llm.return_value = base_llm

    config = OutputConfig(structured_output=mock_dict, output_description=None)

    result = await outputter(ctx=mock_context, output_config=config, graph_output=mock_state)

    assert isinstance(result, dict)
    checks = (
        ("color", "green"),
        ("price", 20),
        ("currency_symbol", "$"),
        ("website_url", "http://superwebsite.fr"),
    )
    for key, expected in checks:
        assert result.get(key) == expected
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@patch("minitap.mobile_use.agents.outputter.outputter.get_llm")
@pytest.mark.asyncio
async def test_outputter_with_natural_language_output(mock_get_llm, mock_context, mock_state):
    """Verify the outputter result when only a free-text ``output_description`` is given."""
    # No structured output here: the LLM mock itself replies with JSON text in `.content`.
    raw_reply = '{"color": "green", "price": 20, "currency_symbol": "$", "website_url": "http://superwebsite.fr"}'
    llm = AsyncMock()
    llm.ainvoke.return_value = Mock(content=raw_reply)
    mock_get_llm.return_value = llm

    description = "A JSON object with a color, a price, a currency_symbol and a website_url key"
    config = OutputConfig(structured_output=None, output_description=description)

    result = await outputter(ctx=mock_context, output_config=config, graph_output=mock_state)

    assert isinstance(result, dict)
    checks = (
        ("color", "green"),
        ("price", 20),
        ("currency_symbol", "$"),
        ("website_url", "http://superwebsite.fr"),
    )
    for key, expected in checks:
        assert result.get(key) == expected
|
{minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/planner/planner.md
RENAMED
|
@@ -13,7 +13,7 @@ You work like an agile tech lead: defining the key milestones without locking in
|
|
|
13
13
|
- Don't assume the full UI is visible yet. Plan based on how most mobile apps work, and keep flexibility.
|
|
14
14
|
- The list of agent thoughts is empty, which is expected since this is the first plan.
|
|
15
15
|
- Avoid overly granular tasks based on individual UI actions (e.g. "tap", "swipe", "copy", "paste") unless explicitly required.
|
|
16
|
-
- The executor has the following available tools:
|
|
16
|
+
- The executor has the following available tools: {{ executor_tools_list }}.
|
|
17
17
|
When one of these tools offers a direct shortcut (e.g. `openLink` instead of manually launching a browser and typing a URL), prefer it over decomposed manual steps.
|
|
18
18
|
|
|
19
19
|
2. **Replanning**
|
{minitap_mobile_use-2.1.0 → minitap_mobile_use-2.3.0}/minitap/mobile_use/agents/planner/planner.py
RENAMED
|
@@ -30,7 +30,10 @@ class PlannerNode:
|
|
|
30
30
|
|
|
31
31
|
system_message = Template(
|
|
32
32
|
Path(__file__).parent.joinpath("planner.md").read_text(encoding="utf-8")
|
|
33
|
-
).render(
|
|
33
|
+
).render(
|
|
34
|
+
platform=self.ctx.device.mobile_platform.value,
|
|
35
|
+
executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
|
|
36
|
+
)
|
|
34
37
|
human_message = Template(
|
|
35
38
|
Path(__file__).parent.joinpath("human.md").read_text(encoding="utf-8")
|
|
36
39
|
).render(
|
|
@@ -38,7 +41,6 @@ class PlannerNode:
|
|
|
38
41
|
initial_goal=state.initial_goal,
|
|
39
42
|
previous_plan="\n".join(str(s) for s in state.subgoal_plan),
|
|
40
43
|
agent_thoughts="\n".join(state.agents_thoughts),
|
|
41
|
-
executor_tools_list=format_tools_list(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
|
|
42
44
|
)
|
|
43
45
|
messages = [
|
|
44
46
|
SystemMessage(content=system_message),
|
|
@@ -3,7 +3,9 @@ import os
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Annotated, Any, Literal
|
|
5
5
|
|
|
6
|
+
import google.auth
|
|
6
7
|
from dotenv import load_dotenv
|
|
8
|
+
from google.auth.exceptions import DefaultCredentialsError
|
|
7
9
|
from pydantic import BaseModel, Field, SecretStr, ValidationError, model_validator
|
|
8
10
|
from pydantic_settings import BaseSettings
|
|
9
11
|
|
|
@@ -88,7 +90,7 @@ def record_events(output_path: Path | None, events: list[str] | BaseModel | Any)
|
|
|
88
90
|
|
|
89
91
|
### LLM Configuration
|
|
90
92
|
|
|
91
|
-
LLMProvider = Literal["openai", "google", "openrouter", "xai"]
|
|
93
|
+
LLMProvider = Literal["openai", "google", "openrouter", "xai", "vertexai"]
|
|
92
94
|
LLMUtilsNode = Literal["outputter", "hopper"]
|
|
93
95
|
AgentNode = Literal["planner", "orchestrator", "cortex", "executor"]
|
|
94
96
|
AgentNodeWithFallback = Literal["cortex"]
|
|
@@ -98,6 +100,17 @@ DEFAULT_LLM_CONFIG_FILENAME = "llm-config.defaults.jsonc"
|
|
|
98
100
|
OVERRIDE_LLM_CONFIG_FILENAME = "llm-config.override.jsonc"
|
|
99
101
|
|
|
100
102
|
|
|
103
|
+
def validate_vertex_ai_credentials():
    """Check that VertexAI can resolve Google credentials and a Cloud project.

    Raises:
        Exception: If Application Default Credentials cannot be resolved, or
            if they resolve without an associated Google Cloud project.
    """
    try:
        # Only the project part of the returned pair is needed here.
        _, project = google.auth.default()
    except DefaultCredentialsError as e:
        # Chain the original error so the root cause stays in the traceback.
        raise Exception(
            f"VertexAI requires valid Google Application Default Credentials (ADC): {e}"
        ) from e
    if not project:
        raise Exception("VertexAI requires a Google Cloud project to be set.")
|
|
112
|
+
|
|
113
|
+
|
|
101
114
|
class LLM(BaseModel):
|
|
102
115
|
provider: LLMProvider
|
|
103
116
|
model: str
|
|
@@ -110,6 +123,8 @@ class LLM(BaseModel):
|
|
|
110
123
|
case "google":
|
|
111
124
|
if not settings.GOOGLE_API_KEY:
|
|
112
125
|
raise Exception(f"{name} requires GOOGLE_API_KEY in .env")
|
|
126
|
+
case "vertexai":
|
|
127
|
+
validate_vertex_ai_credentials()
|
|
113
128
|
case "openrouter":
|
|
114
129
|
if not settings.OPEN_ROUTER_API_KEY:
|
|
115
130
|
raise Exception(f"{name} requires OPEN_ROUTER_API_KEY in .env")
|