minitap-mobile-use 2.1.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of minitap-mobile-use might be problematic. Click here for more details.

Files changed (36)
  1. minitap/mobile_use/agents/contextor/contextor.py +4 -2
  2. minitap/mobile_use/agents/cortex/cortex.md +72 -26
  3. minitap/mobile_use/agents/cortex/cortex.py +1 -2
  4. minitap/mobile_use/agents/executor/executor.md +6 -4
  5. minitap/mobile_use/agents/executor/executor.py +3 -1
  6. minitap/mobile_use/agents/executor/utils.py +2 -1
  7. minitap/mobile_use/agents/outputter/test_outputter.py +104 -42
  8. minitap/mobile_use/agents/planner/planner.md +1 -1
  9. minitap/mobile_use/agents/planner/planner.py +4 -2
  10. minitap/mobile_use/config.py +16 -1
  11. minitap/mobile_use/controllers/mobile_command_controller.py +4 -4
  12. minitap/mobile_use/main.py +2 -2
  13. minitap/mobile_use/sdk/agent.py +17 -8
  14. minitap/mobile_use/sdk/builders/agent_config_builder.py +2 -2
  15. minitap/mobile_use/sdk/types/exceptions.py +30 -0
  16. minitap/mobile_use/sdk/utils.py +3 -2
  17. minitap/mobile_use/servers/device_hardware_bridge.py +2 -1
  18. minitap/mobile_use/servers/utils.py +6 -9
  19. minitap/mobile_use/services/llm.py +23 -6
  20. minitap/mobile_use/tools/index.py +21 -15
  21. minitap/mobile_use/tools/mobile/clear_text.py +73 -25
  22. minitap/mobile_use/tools/mobile/copy_text_from.py +7 -5
  23. minitap/mobile_use/tools/mobile/{take_screenshot.py → glimpse_screen.py} +15 -11
  24. minitap/mobile_use/tools/mobile/input_text.py +94 -13
  25. minitap/mobile_use/tools/mobile/paste_text.py +34 -8
  26. minitap/mobile_use/tools/mobile/swipe.py +107 -9
  27. minitap/mobile_use/tools/test_utils.py +351 -0
  28. minitap/mobile_use/tools/tool_wrapper.py +5 -0
  29. minitap/mobile_use/tools/utils.py +147 -40
  30. minitap/mobile_use/utils/recorder.py +2 -9
  31. minitap/mobile_use/utils/test_ui_hierarchy.py +178 -0
  32. minitap/mobile_use/utils/ui_hierarchy.py +2 -2
  33. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/METADATA +28 -8
  34. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/RECORD +36 -34
  35. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/WHEEL +0 -0
  36. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/entry_points.txt +0 -0
minitap/mobile_use/sdk/types/exceptions.py +30 -0
@@ -4,6 +4,8 @@ Exceptions for the Mobile-use SDK.
4
4
  This module defines the exception hierarchy used throughout the Mobile-use SDK.
5
5
  """
6
6
 
7
+ from typing import Literal
8
+
7
9
 
8
10
  class MobileUseError(Exception):
9
11
  """Base exception class for all Mobile-use SDK exceptions."""
@@ -72,3 +74,31 @@ class AgentProfileNotFoundError(AgentTaskRequestError):
72
74
 
73
75
  def __init__(self, profile_name: str):
74
76
  super().__init__(f"Agent profile {profile_name} not found")
77
+
78
+
79
+ EXECUTABLES = Literal["adb", "maestro", "xcrun", "cli_tools"]
80
+
81
+
82
+ class ExecutableNotFoundError(MobileUseError):
83
+ """Exception raised when a required executable is not found."""
84
+
85
+ def __init__(self, executable_name: EXECUTABLES):
86
+ install_instructions: dict[EXECUTABLES, str] = {
87
+ "adb": "https://developer.android.com/tools/adb",
88
+ "maestro": "https://docs.maestro.dev/getting-started/installing-maestro",
89
+ "xcrun": "Install with: xcode-select --install",
90
+ }
91
+ if executable_name == "cli_tools":
92
+ message = (
93
+ "ADB or Xcode Command Line Tools not found in PATH. "
94
+ "At least one of them is required to run mobile-use "
95
+ "depending on the device platform you wish to run (Android: adb, iOS: xcrun)."
96
+ "Refer to the following links for installation instructions :"
97
+ f"\n- ADB: {install_instructions['adb']}"
98
+ f"\n- Xcode Command Line Tools: {install_instructions['xcrun']}"
99
+ )
100
+ else:
101
+ message = f"Required executable '{executable_name}' not found in PATH."
102
+ if executable_name in install_instructions:
103
+ message += f"\nTo install it, please visit: {install_instructions[executable_name]}"
104
+ super().__init__(message)
minitap/mobile_use/sdk/utils.py +3 -2
@@ -2,11 +2,11 @@ import os
2
2
  from pathlib import Path
3
3
 
4
4
  from pydantic import ValidationError
5
+
5
6
  from minitap.mobile_use.config import LLMConfig, deep_merge_llm_config, get_default_llm_config
6
7
  from minitap.mobile_use.utils.file import load_jsonc
7
8
  from minitap.mobile_use.utils.logger import get_logger
8
9
 
9
-
10
10
  logger = get_logger(__name__)
11
11
 
12
12
 
@@ -24,5 +24,6 @@ def load_llm_config_override(path: Path) -> LLMConfig:
24
24
  try:
25
25
  return deep_merge_llm_config(default_config, override_config_dict)
26
26
  except ValidationError as e:
27
- logger.error(f"Invalid LLM config: {e}. Falling back to default config")
27
+ logger.error(f"Invalid LLM config: {e}")
28
+ logger.info("Falling back to default config")
28
29
  return default_config
minitap/mobile_use/servers/device_hardware_bridge.py +2 -1
@@ -6,6 +6,7 @@ import time
6
6
  from enum import Enum
7
7
 
8
8
  import requests
9
+
9
10
  from minitap.mobile_use.context import DevicePlatform
10
11
  from minitap.mobile_use.servers.utils import is_port_in_use
11
12
 
@@ -175,7 +176,7 @@ class DeviceHardwareBridge:
175
176
  ]
176
177
 
177
178
  def start(self):
178
- if is_port_in_use(DEVICE_HARDWARE_BRIDGE_PORT):
179
+ if is_port_in_use(port=DEVICE_HARDWARE_BRIDGE_PORT):
179
180
  print("Maestro port already in use - assuming Maestro is running.")
180
181
  self.status = BridgeStatus.RUNNING
181
182
  return True
minitap/mobile_use/servers/utils.py +6 -9
@@ -1,11 +1,8 @@
1
- import psutil
1
+ import contextlib
2
+ import socket
2
3
 
3
4
 
4
- def is_port_in_use(port: int):
5
- for conn in psutil.net_connections():
6
- if conn.status == psutil.CONN_LISTEN and conn.laddr:
7
- if hasattr(conn.laddr, "port") and conn.laddr.port == port:
8
- return True
9
- elif isinstance(conn.laddr, tuple) and len(conn.laddr) >= 2 and conn.laddr[1] == port:
10
- return True
11
- return False
5
+ def is_port_in_use(port: int, host: str = "127.0.0.1") -> bool:
6
+ with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
7
+ s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
8
+ return s.connect_ex((host, port)) == 0
minitap/mobile_use/services/llm.py +23 -6
@@ -1,10 +1,12 @@
1
1
  import logging
2
- from typing import Literal, TypeVar
3
2
  from collections.abc import Awaitable, Callable
4
- from typing import overload
3
+ from typing import Literal, TypeVar, overload
5
4
 
5
+ from langchain_core.language_models.chat_models import BaseChatModel
6
6
  from langchain_google_genai import ChatGoogleGenerativeAI
7
+ from langchain_google_vertexai import ChatVertexAI
7
8
  from langchain_openai import ChatOpenAI
9
+
8
10
  from minitap.mobile_use.config import (
9
11
  AgentNode,
10
12
  AgentNodeWithFallback,
@@ -32,6 +34,19 @@ def get_google_llm(
32
34
  return client
33
35
 
34
36
 
37
+ def get_vertex_llm(
38
+ model_name: str = "gemini-2.5-pro",
39
+ temperature: float = 0.7,
40
+ ) -> ChatVertexAI:
41
+ client = ChatVertexAI(
42
+ model_name=model_name,
43
+ max_tokens=None,
44
+ temperature=temperature,
45
+ max_retries=2,
46
+ )
47
+ return client
48
+
49
+
35
50
  def get_openai_llm(
36
51
  model_name: str = "o3",
37
52
  temperature: float = 1,
@@ -75,7 +90,7 @@ def get_llm(
75
90
  *,
76
91
  use_fallback: bool = False,
77
92
  temperature: float = 1,
78
- ): ...
93
+ ) -> BaseChatModel: ...
79
94
 
80
95
 
81
96
  @overload
@@ -84,7 +99,7 @@ def get_llm(
84
99
  name: AgentNode,
85
100
  *,
86
101
  temperature: float = 1,
87
- ): ...
102
+ ) -> BaseChatModel: ...
88
103
 
89
104
 
90
105
  @overload
@@ -94,7 +109,7 @@ def get_llm(
94
109
  *,
95
110
  is_utils: Literal[True],
96
111
  temperature: float = 1,
97
- ): ...
112
+ ) -> BaseChatModel: ...
98
113
 
99
114
 
100
115
  def get_llm(
@@ -103,7 +118,7 @@ def get_llm(
103
118
  is_utils: bool = False,
104
119
  use_fallback: bool = False,
105
120
  temperature: float = 1,
106
- ):
121
+ ) -> BaseChatModel:
107
122
  llm = (
108
123
  ctx.llm_config.get_utils(name) # type: ignore
109
124
  if is_utils
@@ -118,6 +133,8 @@ def get_llm(
118
133
  return get_openai_llm(llm.model, temperature)
119
134
  elif llm.provider == "google":
120
135
  return get_google_llm(llm.model, temperature)
136
+ elif llm.provider == "vertexai":
137
+ return get_vertex_llm(llm.model, temperature)
121
138
  elif llm.provider == "openrouter":
122
139
  return get_openrouter_llm(llm.model, temperature)
123
140
  elif llm.provider == "xai":
minitap/mobile_use/tools/index.py +21 -15
@@ -6,6 +6,7 @@ from minitap.mobile_use.tools.mobile.clear_text import clear_text_wrapper
6
6
  from minitap.mobile_use.tools.mobile.copy_text_from import copy_text_from_wrapper
7
7
  from minitap.mobile_use.tools.mobile.erase_one_char import erase_one_char_wrapper
8
8
  from minitap.mobile_use.tools.mobile.find_packages import find_packages_wrapper
9
+ from minitap.mobile_use.tools.mobile.glimpse_screen import glimpse_screen_wrapper
9
10
  from minitap.mobile_use.tools.mobile.input_text import input_text_wrapper
10
11
  from minitap.mobile_use.tools.mobile.launch_app import launch_app_wrapper
11
12
  from minitap.mobile_use.tools.mobile.long_press_on import long_press_on_wrapper
@@ -14,12 +15,11 @@ from minitap.mobile_use.tools.mobile.paste_text import paste_text_wrapper
14
15
  from minitap.mobile_use.tools.mobile.press_key import press_key_wrapper
15
16
  from minitap.mobile_use.tools.mobile.stop_app import stop_app_wrapper
16
17
  from minitap.mobile_use.tools.mobile.swipe import swipe_wrapper
17
- from minitap.mobile_use.tools.mobile.take_screenshot import take_screenshot_wrapper
18
18
  from minitap.mobile_use.tools.mobile.tap import tap_wrapper
19
19
  from minitap.mobile_use.tools.mobile.wait_for_animation_to_end import (
20
20
  wait_for_animation_to_end_wrapper,
21
21
  )
22
- from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
22
+ from minitap.mobile_use.tools.tool_wrapper import CompositeToolWrapper, ToolWrapper
23
23
 
24
24
  EXECUTOR_WRAPPERS_TOOLS = [
25
25
  back_wrapper,
@@ -27,7 +27,7 @@ EXECUTOR_WRAPPERS_TOOLS = [
27
27
  tap_wrapper,
28
28
  long_press_on_wrapper,
29
29
  swipe_wrapper,
30
- take_screenshot_wrapper,
30
+ glimpse_screen_wrapper,
31
31
  copy_text_from_wrapper,
32
32
  input_text_wrapper,
33
33
  erase_one_char_wrapper,
@@ -41,18 +41,24 @@ EXECUTOR_WRAPPERS_TOOLS = [
41
41
  ]
42
42
 
43
43
 
44
- def get_tools_from_wrappers(ctx: MobileUseContext, wrappers: list[ToolWrapper]) -> list[BaseTool]:
45
- """Get the tools from the wrappers."""
46
- return [wrapper.tool_fn_getter(ctx) for wrapper in wrappers]
44
+ def get_tools_from_wrappers(
45
+ ctx: "MobileUseContext",
46
+ wrappers: list[ToolWrapper],
47
+ ) -> list[BaseTool]:
48
+ tools: list[BaseTool] = []
49
+ for wrapper in wrappers:
50
+ if ctx.llm_config.get_agent("executor").provider == "vertexai":
51
+ # The main swipe tool argument structure is not supported by vertexai, we need to split
52
+ # this tool into multiple tools
53
+ if wrapper.tool_fn_getter == swipe_wrapper.tool_fn_getter and isinstance(
54
+ wrapper, CompositeToolWrapper
55
+ ):
56
+ tools.extend(wrapper.composite_tools_fn_getter(ctx))
57
+ continue
47
58
 
48
-
49
- def format_tools_list(ctx: MobileUseContext, wrappers: list[ToolWrapper]) -> str:
50
- return "\n".join([tool.name for tool in get_tools_from_wrappers(ctx, wrappers)])
59
+ tools.append(wrapper.tool_fn_getter(ctx))
60
+ return tools
51
61
 
52
62
 
53
- def get_tool_wrapper_from_name(name: str) -> ToolWrapper | None:
54
- """Get the tool wrapper from the name."""
55
- for wrapper in EXECUTOR_WRAPPERS_TOOLS:
56
- if wrapper.tool_fn_getter.__name__ == f"get_{name}_tool":
57
- return wrapper
58
- return None
63
+ def format_tools_list(ctx: MobileUseContext, wrappers: list[ToolWrapper]) -> str:
64
+ return ", ".join([tool.name for tool in get_tools_from_wrappers(ctx, wrappers)])
minitap/mobile_use/tools/mobile/clear_text.py +73 -25
@@ -23,6 +23,7 @@ from minitap.mobile_use.tools.utils import (
23
23
  )
24
24
  from minitap.mobile_use.utils.logger import get_logger
25
25
  from minitap.mobile_use.utils.ui_hierarchy import (
26
+ ElementBounds,
26
27
  find_element_by_resource_id,
27
28
  get_element_text,
28
29
  text_input_is_empty,
@@ -50,16 +51,20 @@ class TextClearer:
50
51
  screen_data = get_screen_data(screen_api_client=self.ctx.screen_api_client)
51
52
  self.state.latest_ui_hierarchy = screen_data.elements
52
53
 
53
- def _get_element_info(self, resource_id: str) -> tuple[object | None, str | None, str | None]:
54
+ def _get_element_info(
55
+ self, resource_id: str | None
56
+ ) -> tuple[object | None, str | None, str | None]:
54
57
  if not self.state.latest_ui_hierarchy:
55
58
  self._refresh_ui_hierarchy()
56
59
 
57
60
  if not self.state.latest_ui_hierarchy:
58
61
  return None, None, None
59
62
 
60
- element = find_element_by_resource_id(
61
- ui_hierarchy=self.state.latest_ui_hierarchy, resource_id=resource_id
62
- )
63
+ element = None
64
+ if resource_id:
65
+ element = find_element_by_resource_id(
66
+ ui_hierarchy=self.state.latest_ui_hierarchy, resource_id=resource_id
67
+ )
63
68
 
64
69
  if not element:
65
70
  return None, None, None
@@ -83,11 +88,27 @@ class TextClearer:
83
88
  def _should_clear_text(self, current_text: str | None, hint_text: str | None) -> bool:
84
89
  return current_text is not None and current_text != "" and current_text != hint_text
85
90
 
86
- def _prepare_element_for_clearing(self, resource_id: str) -> bool:
87
- if not focus_element_if_needed(ctx=self.ctx, resource_id=resource_id):
91
+ def _prepare_element_for_clearing(
92
+ self,
93
+ text_input_resource_id: str | None,
94
+ text_input_coordinates: ElementBounds | None,
95
+ text_input_text: str | None,
96
+ ) -> bool:
97
+ if not focus_element_if_needed(
98
+ ctx=self.ctx,
99
+ input_resource_id=text_input_resource_id,
100
+ input_coordinates=text_input_coordinates,
101
+ input_text=text_input_text,
102
+ ):
88
103
  return False
89
104
 
90
- move_cursor_to_end_if_bounds(ctx=self.ctx, state=self.state, resource_id=resource_id)
105
+ move_cursor_to_end_if_bounds(
106
+ ctx=self.ctx,
107
+ state=self.state,
108
+ text_input_resource_id=text_input_resource_id,
109
+ text_input_coordinates=text_input_coordinates,
110
+ text_input_text=text_input_text,
111
+ )
91
112
  return True
92
113
 
93
114
  def _erase_text_attempt(self, text_length: int) -> str | None:
@@ -102,7 +123,12 @@ class TextClearer:
102
123
  return None
103
124
 
104
125
  def _clear_with_retries(
105
- self, resource_id: str, initial_text: str, hint_text: str | None
126
+ self,
127
+ text_input_resource_id: str | None,
128
+ text_input_coordinates: ElementBounds | None,
129
+ text_input_text: str | None,
130
+ initial_text: str,
131
+ hint_text: str | None,
106
132
  ) -> tuple[bool, str | None, int]:
107
133
  current_text = initial_text
108
134
  erased_chars = 0
@@ -118,18 +144,25 @@ class TextClearer:
118
144
  erased_chars += chars_to_erase
119
145
 
120
146
  self._refresh_ui_hierarchy()
121
- elt = find_element_by_resource_id(
122
- ui_hierarchy=self.state.latest_ui_hierarchy or [],
123
- resource_id=resource_id,
124
- )
125
- if elt:
126
- current_text = get_element_text(elt)
127
- logger.info(f"Current text: {current_text}")
128
- if text_input_is_empty(text=current_text, hint_text=hint_text):
129
- break
147
+ elt = None
148
+ if text_input_resource_id:
149
+ elt = find_element_by_resource_id(
150
+ ui_hierarchy=self.state.latest_ui_hierarchy or [],
151
+ resource_id=text_input_resource_id,
152
+ )
153
+ if elt:
154
+ current_text = get_element_text(elt)
155
+ logger.info(f"Current text: {current_text}")
156
+ if text_input_is_empty(text=current_text, hint_text=hint_text):
157
+ break
130
158
 
131
159
  move_cursor_to_end_if_bounds(
132
- ctx=self.ctx, state=self.state, resource_id=resource_id, elt=elt
160
+ ctx=self.ctx,
161
+ state=self.state,
162
+ text_input_resource_id=text_input_resource_id,
163
+ text_input_coordinates=text_input_coordinates,
164
+ text_input_text=text_input_text,
165
+ elt=elt,
133
166
  )
134
167
 
135
168
  return True, current_text, erased_chars
@@ -162,7 +195,9 @@ class TextClearer:
162
195
  hint_text=hint_text,
163
196
  )
164
197
 
165
- def _handle_element_not_found(self, resource_id: str, hint_text: str | None) -> ClearTextResult:
198
+ def _handle_element_not_found(
199
+ self, resource_id: str | None, hint_text: str | None
200
+ ) -> ClearTextResult:
166
201
  error = erase_text_controller(ctx=self.ctx)
167
202
  self._refresh_ui_hierarchy()
168
203
 
@@ -176,16 +211,23 @@ class TextClearer:
176
211
  hint_text=hint_text,
177
212
  )
178
213
 
179
- def clear_text_by_resource_id(self, resource_id: str) -> ClearTextResult:
180
- element, current_text, hint_text = self._get_element_info(resource_id)
214
+ def clear_input_text(
215
+ self,
216
+ text_input_resource_id: str | None,
217
+ text_input_coordinates: ElementBounds | None,
218
+ text_input_text: str | None,
219
+ ) -> ClearTextResult:
220
+ element, current_text, hint_text = self._get_element_info(text_input_resource_id)
181
221
 
182
222
  if not element:
183
- return self._handle_element_not_found(resource_id, hint_text)
223
+ return self._handle_element_not_found(text_input_resource_id, hint_text)
184
224
 
185
225
  if not self._should_clear_text(current_text, hint_text):
186
226
  return self._handle_no_clearing_needed(current_text, hint_text)
187
227
 
188
- if not self._prepare_element_for_clearing(resource_id):
228
+ if not self._prepare_element_for_clearing(
229
+ text_input_resource_id, text_input_coordinates, text_input_text
230
+ ):
189
231
  return self._create_result(
190
232
  success=False,
191
233
  error_message="Failed to focus element",
@@ -195,7 +237,9 @@ class TextClearer:
195
237
  )
196
238
 
197
239
  success, final_text, chars_erased = self._clear_with_retries(
198
- resource_id=resource_id,
240
+ text_input_resource_id=text_input_resource_id,
241
+ text_input_coordinates=text_input_coordinates,
242
+ text_input_text=text_input_text,
199
243
  initial_text=current_text or "",
200
244
  hint_text=hint_text,
201
245
  )
@@ -218,12 +262,16 @@ def get_clear_text_tool(ctx: MobileUseContext):
218
262
  state: Annotated[State, InjectedState],
219
263
  agent_thought: str,
220
264
  text_input_resource_id: str,
265
+ text_input_coordinates: ElementBounds | None,
266
+ text_input_text: str | None,
221
267
  ):
222
268
  """
223
269
  Clears all the text from the text field, by focusing it if needed.
224
270
  """
225
271
  clearer = TextClearer(ctx, state)
226
- result = clearer.clear_text_by_resource_id(text_input_resource_id)
272
+ result = clearer.clear_input_text(
273
+ text_input_resource_id, text_input_coordinates, text_input_text
274
+ )
227
275
 
228
276
  content = (
229
277
  clear_text_wrapper.on_failure_fn(result.error_message)
minitap/mobile_use/tools/mobile/copy_text_from.py +7 -5
@@ -1,18 +1,20 @@
1
+ from typing import Annotated
2
+
1
3
  from langchain_core.messages import ToolMessage
2
4
  from langchain_core.tools import tool
3
5
  from langchain_core.tools.base import InjectedToolCallId
6
+ from langgraph.prebuilt import InjectedState
4
7
  from langgraph.types import Command
8
+ from pydantic import Field
9
+
5
10
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
11
+ from minitap.mobile_use.context import MobileUseContext
6
12
  from minitap.mobile_use.controllers.mobile_command_controller import SelectorRequest
7
13
  from minitap.mobile_use.controllers.mobile_command_controller import (
8
14
  copy_text_from as copy_text_from_controller,
9
15
  )
10
- from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
11
- from pydantic import Field
12
- from typing import Annotated
13
- from minitap.mobile_use.context import MobileUseContext
14
16
  from minitap.mobile_use.graph.state import State
15
- from langgraph.prebuilt import InjectedState
17
+ from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
16
18
 
17
19
 
18
20
  def get_copy_text_from_tool(ctx: MobileUseContext):
minitap/mobile_use/tools/mobile/{take_screenshot.py → glimpse_screen.py} +15 -11
@@ -1,8 +1,11 @@
1
+ from typing import Annotated
2
+
1
3
  from langchain_core.messages import ToolMessage
2
4
  from langchain_core.tools import tool
3
5
  from langchain_core.tools.base import InjectedToolCallId
4
6
  from langgraph.prebuilt import InjectedState
5
7
  from langgraph.types import Command
8
+
6
9
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
7
10
  from minitap.mobile_use.context import MobileUseContext
8
11
  from minitap.mobile_use.controllers.mobile_command_controller import (
@@ -11,18 +14,18 @@ from minitap.mobile_use.controllers.mobile_command_controller import (
11
14
  from minitap.mobile_use.graph.state import State
12
15
  from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
13
16
  from minitap.mobile_use.utils.media import compress_base64_jpeg
14
- from typing import Annotated
15
17
 
16
18
 
17
- def get_take_screenshot_tool(ctx: MobileUseContext):
19
+ def get_glimpse_screen_tool(ctx: MobileUseContext):
18
20
  @tool
19
- def take_screenshot(
21
+ def glimpse_screen(
20
22
  tool_call_id: Annotated[str, InjectedToolCallId],
21
23
  state: Annotated[State, InjectedState],
22
24
  agent_thought: str,
23
25
  ):
24
26
  """
25
- Take a screenshot of the device.
27
+ Captures the current screen as an image.
28
+ The resulting screenshot is added to the context for the next reasoning step.
26
29
  """
27
30
  compressed_image_base64 = None
28
31
  has_failed = False
@@ -36,9 +39,9 @@ def get_take_screenshot_tool(ctx: MobileUseContext):
36
39
 
37
40
  tool_message = ToolMessage(
38
41
  tool_call_id=tool_call_id,
39
- content=take_screenshot_wrapper.on_failure_fn()
42
+ content=glimpse_screen_wrapper.on_failure_fn()
40
43
  if has_failed
41
- else take_screenshot_wrapper.on_success_fn(),
44
+ else glimpse_screen_wrapper.on_success_fn(),
42
45
  additional_kwargs={"error": output} if has_failed else {},
43
46
  status="error" if has_failed else "success",
44
47
  )
@@ -56,11 +59,12 @@ def get_take_screenshot_tool(ctx: MobileUseContext):
56
59
  ),
57
60
  )
58
61
 
59
- return take_screenshot
62
+ return glimpse_screen
60
63
 
61
64
 
62
- take_screenshot_wrapper = ToolWrapper(
63
- tool_fn_getter=get_take_screenshot_tool,
64
- on_success_fn=lambda: "Screenshot taken successfully.",
65
- on_failure_fn=lambda: "Failed to take screenshot.",
65
+ glimpse_screen_wrapper = ToolWrapper(
66
+ tool_fn_getter=get_glimpse_screen_tool,
67
+ on_success_fn=lambda: "Visual context captured successfully."
68
+ + "It is now available for immediate analysis.",
69
+ on_failure_fn=lambda: "Failed to capture visual context.",
66
70
  )