minitap-mobile-use 2.3.0__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of minitap-mobile-use might be problematic. Click here for more details.

Files changed (56)
  1. minitap/mobile_use/agents/contextor/contextor.py +2 -2
  2. minitap/mobile_use/agents/cortex/cortex.md +49 -8
  3. minitap/mobile_use/agents/cortex/cortex.py +8 -4
  4. minitap/mobile_use/agents/executor/executor.md +14 -11
  5. minitap/mobile_use/agents/executor/executor.py +6 -5
  6. minitap/mobile_use/agents/hopper/hopper.py +6 -3
  7. minitap/mobile_use/agents/orchestrator/orchestrator.py +26 -11
  8. minitap/mobile_use/agents/outputter/outputter.py +6 -3
  9. minitap/mobile_use/agents/planner/planner.md +20 -22
  10. minitap/mobile_use/agents/planner/planner.py +10 -7
  11. minitap/mobile_use/agents/planner/types.py +4 -2
  12. minitap/mobile_use/agents/planner/utils.py +14 -0
  13. minitap/mobile_use/agents/summarizer/summarizer.py +2 -2
  14. minitap/mobile_use/config.py +6 -1
  15. minitap/mobile_use/context.py +13 -3
  16. minitap/mobile_use/controllers/mobile_command_controller.py +1 -14
  17. minitap/mobile_use/graph/state.py +7 -3
  18. minitap/mobile_use/sdk/agent.py +188 -23
  19. minitap/mobile_use/sdk/examples/README.md +19 -1
  20. minitap/mobile_use/sdk/examples/platform_manual_task_example.py +65 -0
  21. minitap/mobile_use/sdk/examples/platform_minimal_example.py +46 -0
  22. minitap/mobile_use/sdk/services/platform.py +307 -0
  23. minitap/mobile_use/sdk/types/__init__.py +16 -14
  24. minitap/mobile_use/sdk/types/exceptions.py +27 -0
  25. minitap/mobile_use/sdk/types/platform.py +127 -0
  26. minitap/mobile_use/sdk/types/task.py +78 -17
  27. minitap/mobile_use/servers/device_hardware_bridge.py +1 -1
  28. minitap/mobile_use/servers/stop_servers.py +11 -12
  29. minitap/mobile_use/services/llm.py +89 -5
  30. minitap/mobile_use/tools/index.py +0 -6
  31. minitap/mobile_use/tools/mobile/back.py +3 -3
  32. minitap/mobile_use/tools/mobile/clear_text.py +24 -43
  33. minitap/mobile_use/tools/mobile/erase_one_char.py +5 -4
  34. minitap/mobile_use/tools/mobile/glimpse_screen.py +11 -7
  35. minitap/mobile_use/tools/mobile/input_text.py +21 -51
  36. minitap/mobile_use/tools/mobile/launch_app.py +54 -22
  37. minitap/mobile_use/tools/mobile/long_press_on.py +15 -8
  38. minitap/mobile_use/tools/mobile/open_link.py +15 -8
  39. minitap/mobile_use/tools/mobile/press_key.py +15 -8
  40. minitap/mobile_use/tools/mobile/stop_app.py +14 -8
  41. minitap/mobile_use/tools/mobile/swipe.py +11 -5
  42. minitap/mobile_use/tools/mobile/tap.py +103 -21
  43. minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +3 -3
  44. minitap/mobile_use/tools/test_utils.py +104 -78
  45. minitap/mobile_use/tools/types.py +35 -0
  46. minitap/mobile_use/tools/utils.py +51 -48
  47. minitap/mobile_use/utils/recorder.py +1 -1
  48. minitap/mobile_use/utils/ui_hierarchy.py +9 -2
  49. {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.5.0.dist-info}/METADATA +3 -1
  50. minitap_mobile_use-2.5.0.dist-info/RECORD +100 -0
  51. minitap/mobile_use/tools/mobile/copy_text_from.py +0 -75
  52. minitap/mobile_use/tools/mobile/find_packages.py +0 -69
  53. minitap/mobile_use/tools/mobile/paste_text.py +0 -88
  54. minitap_mobile_use-2.3.0.dist-info/RECORD +0 -98
  55. {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.5.0.dist-info}/WHEEL +0 -0
  56. {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.5.0.dist-info}/entry_points.txt +0 -0
@@ -70,18 +70,17 @@ def stop_process_gracefully(process: psutil.Process, timeout: int = 5) -> bool:
70
70
  return False
71
71
 
72
72
 
73
- def check_service_health(port: int, service_name: str) -> bool:
73
+ def check_service_running(port: int, service_name: str) -> bool:
74
74
  try:
75
75
  if port == server_settings.DEVICE_SCREEN_API_PORT:
76
- response = requests.get(f"http://localhost:{port}/health", timeout=2)
76
+ requests.get(f"http://localhost:{port}/health", timeout=2)
77
77
  elif port == DEVICE_HARDWARE_BRIDGE_PORT:
78
- response = requests.get(f"http://localhost:{port}/api/banner-message", timeout=2)
78
+ requests.get(f"http://localhost:{port}/api/banner-message", timeout=2)
79
79
  else:
80
80
  return False
81
81
 
82
- if response.status_code == 200:
83
- logger.debug(f"{service_name} is still responding on port {port}")
84
- return True
82
+ logger.debug(f"{service_name} is still responding on port {port}")
83
+ return True
85
84
  except requests.exceptions.RequestException:
86
85
  pass
87
86
 
@@ -92,7 +91,7 @@ def stop_device_screen_api() -> bool:
92
91
  logger.info("Stopping Device Screen API...")
93
92
  api_port = server_settings.DEVICE_SCREEN_API_PORT
94
93
 
95
- if not check_service_health(api_port, "Device Screen API"):
94
+ if not check_service_running(api_port, "Device Screen API"):
96
95
  logger.success("Device Screen API is not running")
97
96
  return True
98
97
 
@@ -109,7 +108,7 @@ def stop_device_screen_api() -> bool:
109
108
  logger.warning("No Device Screen API processes found, but service is still responding")
110
109
  # Still try to verify if service actually stops
111
110
  time.sleep(1)
112
- if not check_service_health(api_port, "Device Screen API"):
111
+ if not check_service_running(api_port, "Device Screen API"):
113
112
  logger.success("Device Screen API stopped successfully (was orphaned)")
114
113
  return True
115
114
  return False
@@ -120,7 +119,7 @@ def stop_device_screen_api() -> bool:
120
119
 
121
120
  # Verify service is stopped
122
121
  time.sleep(1)
123
- if check_service_health(api_port, "Device Screen API"):
122
+ if check_service_running(api_port, "Device Screen API"):
124
123
  logger.error("Device Screen API is still running after stop attempt")
125
124
  return False
126
125
 
@@ -131,7 +130,7 @@ def stop_device_screen_api() -> bool:
131
130
  def stop_device_hardware_bridge() -> bool:
132
131
  logger.info("Stopping Device Hardware Bridge...")
133
132
 
134
- if not check_service_health(DEVICE_HARDWARE_BRIDGE_PORT, "Maestro Studio"):
133
+ if not check_service_running(DEVICE_HARDWARE_BRIDGE_PORT, "Maestro Studio"):
135
134
  logger.success("Device Hardware Bridge is not running")
136
135
  return True
137
136
 
@@ -145,7 +144,7 @@ def stop_device_hardware_bridge() -> bool:
145
144
  logger.warning("No Device Hardware Bridge processes found, but service is still responding")
146
145
  # Still try to verify if service actually stops
147
146
  time.sleep(1)
148
- if not check_service_health(DEVICE_HARDWARE_BRIDGE_PORT, "Maestro Studio"):
147
+ if not check_service_running(DEVICE_HARDWARE_BRIDGE_PORT, "Maestro Studio"):
149
148
  logger.success("Device Hardware Bridge stopped successfully (was orphaned)")
150
149
  return True
151
150
  return False
@@ -154,7 +153,7 @@ def stop_device_hardware_bridge() -> bool:
154
153
  stop_process_gracefully(proc)
155
154
 
156
155
  time.sleep(1)
157
- if check_service_health(DEVICE_HARDWARE_BRIDGE_PORT, "Maestro Studio"):
156
+ if check_service_running(DEVICE_HARDWARE_BRIDGE_PORT, "Maestro Studio"):
158
157
  logger.error("Device Hardware Bridge is still running after stop attempt")
159
158
  return False
160
159
 
@@ -1,11 +1,13 @@
1
+ import asyncio
1
2
  import logging
2
- from collections.abc import Awaitable, Callable
3
- from typing import Literal, TypeVar, overload
3
+ from collections.abc import Awaitable, Callable, Coroutine
4
+ from typing import Any, Literal, TypeVar, overload
4
5
 
5
6
  from langchain_core.language_models.chat_models import BaseChatModel
6
7
  from langchain_google_genai import ChatGoogleGenerativeAI
7
8
  from langchain_google_vertexai import ChatVertexAI
8
9
  from langchain_openai import ChatOpenAI
10
+ from pydantic import SecretStr
9
11
 
10
12
  from minitap.mobile_use.config import (
11
13
  AgentNode,
@@ -15,8 +17,79 @@ from minitap.mobile_use.config import (
15
17
  settings,
16
18
  )
17
19
  from minitap.mobile_use.context import MobileUseContext
20
+ from minitap.mobile_use.utils.logger import get_logger
18
21
 
19
- logger = logging.getLogger(__name__)
22
+ # Logger for internal messages (ex: fallback)
23
+ llm_logger = logging.getLogger(__name__)
24
+ # Logger for user messages
25
+ user_messages_logger = get_logger(__name__)
26
+
27
+
28
+ async def invoke_llm_with_timeout_message[T](
29
+ llm_call: Coroutine[Any, Any, T],
30
+ agent_name: str,
31
+ timeout_seconds: int = 10,
32
+ ) -> T:
33
+ """
34
+ Send a LLM call and display a timeout message if it takes too long.
35
+
36
+ Args:
37
+ llm_call: The coroutine of the LLM call to execute.
38
+ agent_name: The name of the agent making the call (for the message).
39
+ timeout_seconds: The delay in seconds before displaying the message.
40
+
41
+ Returns:
42
+ The result of the LLM call.
43
+ """
44
+ llm_task = asyncio.create_task(llm_call)
45
+ waiter_task = asyncio.create_task(asyncio.sleep(timeout_seconds))
46
+
47
+ done, _ = await asyncio.wait({llm_task, waiter_task}, return_when=asyncio.FIRST_COMPLETED)
48
+
49
+ if llm_task in done:
50
+ # The LLM call has finished before the timeout, cancel the timer
51
+ waiter_task.cancel()
52
+ return llm_task.result()
53
+ else:
54
+ # The timeout has been reached, display the message and wait for the call to finish
55
+ user_messages_logger.info("Waiting for LLM call response...")
56
+ return await llm_task
57
+
58
+
59
+ def get_minitap_llm(
60
+ trace_id: str,
61
+ remote_tracing: bool = False,
62
+ model: str = "google/gemini-2.5-pro",
63
+ temperature: float | None = None,
64
+ max_retries: int | None = None,
65
+ api_key: str | None = None,
66
+ ) -> ChatOpenAI:
67
+ if api_key:
68
+ effective_api_key = SecretStr(api_key)
69
+ elif settings.MINITAP_API_KEY:
70
+ effective_api_key = settings.MINITAP_API_KEY
71
+ else:
72
+ raise ValueError("MINITAP_API_KEY must be provided or set in environment")
73
+
74
+ if settings.MINITAP_API_BASE_URL is None:
75
+ raise ValueError("MINITAP_API_BASE_URL must be set in environment")
76
+
77
+ llm_base_url = f"{settings.MINITAP_API_BASE_URL}/api/v1"
78
+
79
+ if max_retries is None and model.startswith("google/"):
80
+ max_retries = 2
81
+ client = ChatOpenAI(
82
+ model=model,
83
+ temperature=temperature,
84
+ max_retries=max_retries,
85
+ api_key=effective_api_key,
86
+ base_url=llm_base_url,
87
+ default_query={
88
+ "sessionId": trace_id,
89
+ "traceOnlyUsage": remote_tracing,
90
+ },
91
+ )
92
+ return client
20
93
 
21
94
 
22
95
  def get_google_llm(
@@ -139,6 +212,17 @@ def get_llm(
139
212
  return get_openrouter_llm(llm.model, temperature)
140
213
  elif llm.provider == "xai":
141
214
  return get_grok_llm(llm.model, temperature)
215
+ elif llm.provider == "minitap":
216
+ remote_tracing = False
217
+ if ctx.execution_setup:
218
+ remote_tracing = ctx.execution_setup.enable_remote_tracing
219
+ return get_minitap_llm(
220
+ trace_id=ctx.trace_id,
221
+ remote_tracing=remote_tracing,
222
+ model=llm.model,
223
+ temperature=temperature,
224
+ api_key=ctx.minitap_api_key,
225
+ )
142
226
  else:
143
227
  raise ValueError(f"Unsupported provider: {llm.provider}")
144
228
 
@@ -154,9 +238,9 @@ async def with_fallback(
154
238
  try:
155
239
  result = await main_call()
156
240
  if result is None and none_should_fallback:
157
- logger.warning("Main LLM inference returned None. Falling back...")
241
+ llm_logger.warning("Main LLM inference returned None. Falling back...")
158
242
  return await fallback_call()
159
243
  return result
160
244
  except Exception as e:
161
- logger.warning(f"❗ Main LLM inference failed: {e}. Falling back...")
245
+ llm_logger.warning(f"❗ Main LLM inference failed: {e}. Falling back...")
162
246
  return await fallback_call()
@@ -3,15 +3,12 @@ from langchain_core.tools import BaseTool
3
3
  from minitap.mobile_use.context import MobileUseContext
4
4
  from minitap.mobile_use.tools.mobile.back import back_wrapper
5
5
  from minitap.mobile_use.tools.mobile.clear_text import clear_text_wrapper
6
- from minitap.mobile_use.tools.mobile.copy_text_from import copy_text_from_wrapper
7
6
  from minitap.mobile_use.tools.mobile.erase_one_char import erase_one_char_wrapper
8
- from minitap.mobile_use.tools.mobile.find_packages import find_packages_wrapper
9
7
  from minitap.mobile_use.tools.mobile.glimpse_screen import glimpse_screen_wrapper
10
8
  from minitap.mobile_use.tools.mobile.input_text import input_text_wrapper
11
9
  from minitap.mobile_use.tools.mobile.launch_app import launch_app_wrapper
12
10
  from minitap.mobile_use.tools.mobile.long_press_on import long_press_on_wrapper
13
11
  from minitap.mobile_use.tools.mobile.open_link import open_link_wrapper
14
- from minitap.mobile_use.tools.mobile.paste_text import paste_text_wrapper
15
12
  from minitap.mobile_use.tools.mobile.press_key import press_key_wrapper
16
13
  from minitap.mobile_use.tools.mobile.stop_app import stop_app_wrapper
17
14
  from minitap.mobile_use.tools.mobile.swipe import swipe_wrapper
@@ -28,13 +25,10 @@ EXECUTOR_WRAPPERS_TOOLS = [
28
25
  long_press_on_wrapper,
29
26
  swipe_wrapper,
30
27
  glimpse_screen_wrapper,
31
- copy_text_from_wrapper,
32
28
  input_text_wrapper,
33
29
  erase_one_char_wrapper,
34
- find_packages_wrapper,
35
30
  launch_app_wrapper,
36
31
  stop_app_wrapper,
37
- paste_text_wrapper,
38
32
  clear_text_wrapper,
39
33
  press_key_wrapper,
40
34
  wait_for_animation_to_end_wrapper,
@@ -13,11 +13,11 @@ from langgraph.prebuilt import InjectedState
13
13
 
14
14
  def get_back_tool(ctx: MobileUseContext):
15
15
  @tool
16
- def back(
16
+ async def back(
17
17
  tool_call_id: Annotated[str, InjectedToolCallId],
18
18
  state: Annotated[State, InjectedState],
19
19
  agent_thought: str,
20
- ):
20
+ ) -> Command:
21
21
  """Navigates to the previous screen. (Only works on Android for the moment)"""
22
22
  output = back_controller(ctx=ctx)
23
23
  has_failed = output is not None
@@ -28,7 +28,7 @@ def get_back_tool(ctx: MobileUseContext):
28
28
  status="error" if has_failed else "success",
29
29
  )
30
30
  return Command(
31
- update=state.sanitize_update(
31
+ update=await state.asanitize_update(
32
32
  ctx=ctx,
33
33
  update={
34
34
  "agents_thoughts": [agent_thought],
@@ -12,18 +12,13 @@ from minitap.mobile_use.context import MobileUseContext
12
12
  from minitap.mobile_use.controllers.mobile_command_controller import (
13
13
  erase_text as erase_text_controller,
14
14
  )
15
- from minitap.mobile_use.controllers.mobile_command_controller import (
16
- get_screen_data,
17
- )
15
+ from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data
18
16
  from minitap.mobile_use.graph.state import State
19
17
  from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
20
- from minitap.mobile_use.tools.utils import (
21
- focus_element_if_needed,
22
- move_cursor_to_end_if_bounds,
23
- )
18
+ from minitap.mobile_use.tools.types import Target
19
+ from minitap.mobile_use.tools.utils import focus_element_if_needed, move_cursor_to_end_if_bounds
24
20
  from minitap.mobile_use.utils.logger import get_logger
25
21
  from minitap.mobile_use.utils.ui_hierarchy import (
26
- ElementBounds,
27
22
  find_element_by_resource_id,
28
23
  get_element_text,
29
24
  text_input_is_empty,
@@ -90,24 +85,18 @@ class TextClearer:
90
85
 
91
86
  def _prepare_element_for_clearing(
92
87
  self,
93
- text_input_resource_id: str | None,
94
- text_input_coordinates: ElementBounds | None,
95
- text_input_text: str | None,
88
+ target: Target,
96
89
  ) -> bool:
97
90
  if not focus_element_if_needed(
98
91
  ctx=self.ctx,
99
- input_resource_id=text_input_resource_id,
100
- input_coordinates=text_input_coordinates,
101
- input_text=text_input_text,
92
+ target=target,
102
93
  ):
103
94
  return False
104
95
 
105
96
  move_cursor_to_end_if_bounds(
106
97
  ctx=self.ctx,
107
98
  state=self.state,
108
- text_input_resource_id=text_input_resource_id,
109
- text_input_coordinates=text_input_coordinates,
110
- text_input_text=text_input_text,
99
+ target=target,
111
100
  )
112
101
  return True
113
102
 
@@ -124,9 +113,7 @@ class TextClearer:
124
113
 
125
114
  def _clear_with_retries(
126
115
  self,
127
- text_input_resource_id: str | None,
128
- text_input_coordinates: ElementBounds | None,
129
- text_input_text: str | None,
116
+ target: Target,
130
117
  initial_text: str,
131
118
  hint_text: str | None,
132
119
  ) -> tuple[bool, str | None, int]:
@@ -145,10 +132,10 @@ class TextClearer:
145
132
 
146
133
  self._refresh_ui_hierarchy()
147
134
  elt = None
148
- if text_input_resource_id:
135
+ if target.resource_id:
149
136
  elt = find_element_by_resource_id(
150
137
  ui_hierarchy=self.state.latest_ui_hierarchy or [],
151
- resource_id=text_input_resource_id,
138
+ resource_id=target.resource_id,
152
139
  )
153
140
  if elt:
154
141
  current_text = get_element_text(elt)
@@ -159,9 +146,7 @@ class TextClearer:
159
146
  move_cursor_to_end_if_bounds(
160
147
  ctx=self.ctx,
161
148
  state=self.state,
162
- text_input_resource_id=text_input_resource_id,
163
- text_input_coordinates=text_input_coordinates,
164
- text_input_text=text_input_text,
149
+ target=target,
165
150
  elt=elt,
166
151
  )
167
152
 
@@ -213,20 +198,20 @@ class TextClearer:
213
198
 
214
199
  def clear_input_text(
215
200
  self,
216
- text_input_resource_id: str | None,
217
- text_input_coordinates: ElementBounds | None,
218
- text_input_text: str | None,
201
+ target: Target,
219
202
  ) -> ClearTextResult:
220
- element, current_text, hint_text = self._get_element_info(text_input_resource_id)
203
+ element, current_text, hint_text = self._get_element_info(
204
+ resource_id=target.resource_id,
205
+ )
221
206
 
222
207
  if not element:
223
- return self._handle_element_not_found(text_input_resource_id, hint_text)
208
+ return self._handle_element_not_found(target.resource_id, hint_text)
224
209
 
225
210
  if not self._should_clear_text(current_text, hint_text):
226
211
  return self._handle_no_clearing_needed(current_text, hint_text)
227
212
 
228
213
  if not self._prepare_element_for_clearing(
229
- text_input_resource_id, text_input_coordinates, text_input_text
214
+ target=target,
230
215
  ):
231
216
  return self._create_result(
232
217
  success=False,
@@ -237,9 +222,7 @@ class TextClearer:
237
222
  )
238
223
 
239
224
  success, final_text, chars_erased = self._clear_with_retries(
240
- text_input_resource_id=text_input_resource_id,
241
- text_input_coordinates=text_input_coordinates,
242
- text_input_text=text_input_text,
225
+ target=target,
243
226
  initial_text=current_text or "",
244
227
  hint_text=hint_text,
245
228
  )
@@ -257,23 +240,21 @@ class TextClearer:
257
240
 
258
241
  def get_clear_text_tool(ctx: MobileUseContext):
259
242
  @tool
260
- def clear_text(
243
+ async def clear_text(
261
244
  tool_call_id: Annotated[str, InjectedToolCallId],
262
245
  state: Annotated[State, InjectedState],
263
246
  agent_thought: str,
264
- text_input_resource_id: str,
265
- text_input_coordinates: ElementBounds | None,
266
- text_input_text: str | None,
247
+ target: Target,
267
248
  ):
268
249
  """
269
250
  Clears all the text from the text field, by focusing it if needed.
270
251
  """
271
252
  clearer = TextClearer(ctx, state)
272
253
  result = clearer.clear_input_text(
273
- text_input_resource_id, text_input_coordinates, text_input_text
254
+ target=target,
274
255
  )
275
256
 
276
- content = (
257
+ agent_outcome = (
277
258
  clear_text_wrapper.on_failure_fn(result.error_message)
278
259
  if not result.success
279
260
  else clear_text_wrapper.on_success_fn(
@@ -283,16 +264,16 @@ def get_clear_text_tool(ctx: MobileUseContext):
283
264
 
284
265
  tool_message = ToolMessage(
285
266
  tool_call_id=tool_call_id,
286
- content=content,
267
+ content=agent_outcome,
287
268
  additional_kwargs={"error": result.error_message} if not result.success else {},
288
269
  status="error" if not result.success else "success",
289
270
  )
290
271
 
291
272
  return Command(
292
- update=state.sanitize_update(
273
+ update=await state.asanitize_update(
293
274
  ctx=ctx,
294
275
  update={
295
- "agents_thoughts": [agent_thought],
276
+ "agents_thoughts": [agent_thought, agent_outcome],
296
277
  EXECUTOR_MESSAGES_KEY: [tool_message],
297
278
  },
298
279
  agent="executor",
@@ -1,9 +1,10 @@
1
+ from typing import Annotated
2
+
1
3
  from langchain_core.messages import ToolMessage
2
4
  from langchain_core.tools import tool
3
5
  from langchain_core.tools.base import InjectedToolCallId
4
6
  from langgraph.prebuilt import InjectedState
5
7
  from langgraph.types import Command
6
- from typing import Annotated
7
8
 
8
9
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
9
10
  from minitap.mobile_use.context import MobileUseContext
@@ -16,11 +17,11 @@ from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
16
17
 
17
18
  def get_erase_one_char_tool(ctx: MobileUseContext):
18
19
  @tool
19
- def erase_one_char(
20
+ async def erase_one_char(
20
21
  tool_call_id: Annotated[str, InjectedToolCallId],
21
22
  state: Annotated[State, InjectedState],
22
23
  agent_thought: str,
23
- ):
24
+ ) -> Command:
24
25
  """
25
26
  Erase one character from a text area.
26
27
  It acts the same as pressing backspace a single time.
@@ -36,7 +37,7 @@ def get_erase_one_char_tool(ctx: MobileUseContext):
36
37
  status="error" if has_failed else "success",
37
38
  )
38
39
  return Command(
39
- update=state.sanitize_update(
40
+ update=await state.asanitize_update(
40
41
  ctx=ctx,
41
42
  update={
42
43
  "agents_thoughts": [agent_thought],
@@ -18,11 +18,11 @@ from minitap.mobile_use.utils.media import compress_base64_jpeg
18
18
 
19
19
  def get_glimpse_screen_tool(ctx: MobileUseContext):
20
20
  @tool
21
- def glimpse_screen(
21
+ async def glimpse_screen(
22
22
  tool_call_id: Annotated[str, InjectedToolCallId],
23
23
  state: Annotated[State, InjectedState],
24
24
  agent_thought: str,
25
- ):
25
+ ) -> Command:
26
26
  """
27
27
  Captures the current screen as an image.
28
28
  The resulting screenshot is added to the context for the next reasoning step.
@@ -37,22 +37,26 @@ def get_glimpse_screen_tool(ctx: MobileUseContext):
37
37
  output = str(e)
38
38
  has_failed = True
39
39
 
40
+ agent_outcome = (
41
+ glimpse_screen_wrapper.on_failure_fn()
42
+ if has_failed
43
+ else glimpse_screen_wrapper.on_success_fn()
44
+ )
45
+
40
46
  tool_message = ToolMessage(
41
47
  tool_call_id=tool_call_id,
42
- content=glimpse_screen_wrapper.on_failure_fn()
43
- if has_failed
44
- else glimpse_screen_wrapper.on_success_fn(),
48
+ content=agent_outcome,
45
49
  additional_kwargs={"error": output} if has_failed else {},
46
50
  status="error" if has_failed else "success",
47
51
  )
48
52
  updates = {
49
- "agents_thoughts": [agent_thought],
53
+ "agents_thoughts": [agent_thought, agent_outcome],
50
54
  EXECUTOR_MESSAGES_KEY: [tool_message],
51
55
  }
52
56
  if compressed_image_base64:
53
57
  updates["latest_screenshot_base64"] = compressed_image_base64
54
58
  return Command(
55
- update=state.sanitize_update(
59
+ update=await state.asanitize_update(
56
60
  ctx=ctx,
57
61
  update=updates,
58
62
  agent="executor",
@@ -11,21 +11,16 @@ from pydantic import BaseModel
11
11
 
12
12
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
13
13
  from minitap.mobile_use.context import MobileUseContext
14
- from minitap.mobile_use.controllers.mobile_command_controller import (
15
- get_screen_data,
16
- )
14
+ from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data
17
15
  from minitap.mobile_use.controllers.mobile_command_controller import (
18
16
  input_text as input_text_controller,
19
17
  )
20
18
  from minitap.mobile_use.graph.state import State
21
19
  from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
20
+ from minitap.mobile_use.tools.types import Target
22
21
  from minitap.mobile_use.tools.utils import focus_element_if_needed, move_cursor_to_end_if_bounds
23
22
  from minitap.mobile_use.utils.logger import get_logger
24
- from minitap.mobile_use.utils.ui_hierarchy import (
25
- ElementBounds,
26
- find_element_by_resource_id,
27
- get_element_text,
28
- )
23
+ from minitap.mobile_use.utils.ui_hierarchy import find_element_by_resource_id, get_element_text
29
24
 
30
25
  logger = get_logger(__name__)
31
26
 
@@ -49,14 +44,12 @@ def _controller_input_text(ctx: MobileUseContext, text: str) -> InputResult:
49
44
 
50
45
  def get_input_text_tool(ctx: MobileUseContext):
51
46
  @tool
52
- def input_text(
47
+ async def input_text(
53
48
  tool_call_id: Annotated[str, InjectedToolCallId],
54
49
  state: Annotated[State, InjectedState],
55
50
  agent_thought: str,
56
51
  text: str,
57
- text_input_resource_id: str | None,
58
- text_input_coordinates: ElementBounds | None,
59
- text_input_text: str | None,
52
+ target: Target,
60
53
  ):
61
54
  """
62
55
  Focus a text field and type text into it.
@@ -70,17 +63,9 @@ def get_input_text_tool(ctx: MobileUseContext):
70
63
  state: The state of the agent.
71
64
  agent_thought: The thought of the agent.
72
65
  text: The text to type.
73
- text_input_resource_id: The resource ID of the text input (if available).
74
- text_input_coordinates: The bounds (ElementBounds) of the text input (if available).
75
- text_input_text: The current text content of the text input (if available).
66
+ target: The target of the text input (if available).
76
67
  """
77
-
78
- focused = focus_element_if_needed(
79
- ctx=ctx,
80
- input_resource_id=text_input_resource_id,
81
- input_coordinates=text_input_coordinates,
82
- input_text=text_input_text,
83
- )
68
+ focused = focus_element_if_needed(ctx=ctx, target=target)
84
69
  if not focused:
85
70
  error_message = "Failed to focus the text input element before typing."
86
71
  tool_message = ToolMessage(
@@ -90,7 +75,7 @@ def get_input_text_tool(ctx: MobileUseContext):
90
75
  status="error",
91
76
  )
92
77
  return Command(
93
- update=state.sanitize_update(
78
+ update=await state.asanitize_update(
94
79
  ctx=ctx,
95
80
  update={
96
81
  "agents_thoughts": [agent_thought, error_message],
@@ -100,40 +85,25 @@ def get_input_text_tool(ctx: MobileUseContext):
100
85
  ),
101
86
  )
102
87
 
103
- move_cursor_to_end_if_bounds(
104
- ctx=ctx,
105
- state=state,
106
- text_input_resource_id=text_input_resource_id,
107
- text_input_coordinates=text_input_coordinates,
108
- text_input_text=text_input_text,
109
- )
88
+ move_cursor_to_end_if_bounds(ctx=ctx, state=state, target=target)
110
89
 
111
90
  result = _controller_input_text(ctx=ctx, text=text)
112
-
113
91
  status: Literal["success", "error"] = "success" if result.ok else "error"
114
92
 
115
93
  text_input_content = ""
116
- if status == "success":
117
- if text_input_resource_id is not None:
118
- # Verification phase for elements with resource_id
119
- screen_data = get_screen_data(screen_api_client=ctx.screen_api_client)
120
- state.latest_ui_hierarchy = screen_data.elements
121
-
122
- element = find_element_by_resource_id(
123
- ui_hierarchy=state.latest_ui_hierarchy, resource_id=text_input_resource_id
124
- )
125
-
126
- if not element:
127
- result = InputResult(ok=False, error="Element not found")
128
-
129
- if element:
130
- text_input_content = get_element_text(element)
131
- else:
132
- # For elements without resource_id, skip verification and use direct message
133
- pass
94
+ if status == "success" and target.resource_id:
95
+ screen_data = get_screen_data(screen_api_client=ctx.screen_api_client)
96
+ state.latest_ui_hierarchy = screen_data.elements
97
+ element = find_element_by_resource_id(
98
+ ui_hierarchy=state.latest_ui_hierarchy,
99
+ resource_id=target.resource_id,
100
+ index=target.resource_id_index,
101
+ )
102
+ if element:
103
+ text_input_content = get_element_text(element)
134
104
 
135
105
  agent_outcome = (
136
- input_text_wrapper.on_success_fn(text, text_input_content, text_input_resource_id)
106
+ input_text_wrapper.on_success_fn(text, text_input_content, target.resource_id)
137
107
  if result.ok
138
108
  else input_text_wrapper.on_failure_fn(text, result.error)
139
109
  )
@@ -146,7 +116,7 @@ def get_input_text_tool(ctx: MobileUseContext):
146
116
  )
147
117
 
148
118
  return Command(
149
- update=state.sanitize_update(
119
+ update=await state.asanitize_update(
150
120
  ctx=ctx,
151
121
  update={
152
122
  "agents_thoughts": [agent_thought, agent_outcome],