minitap-mobile-use 2.5.3__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of minitap-mobile-use might be problematic. Click here for more details.

Files changed (40)
  1. minitap/mobile_use/agents/contextor/contextor.py +0 -8
  2. minitap/mobile_use/agents/cortex/cortex.md +122 -36
  3. minitap/mobile_use/agents/cortex/cortex.py +32 -17
  4. minitap/mobile_use/agents/cortex/types.py +18 -4
  5. minitap/mobile_use/agents/executor/executor.md +3 -3
  6. minitap/mobile_use/agents/executor/executor.py +10 -3
  7. minitap/mobile_use/agents/hopper/hopper.md +30 -2
  8. minitap/mobile_use/agents/hopper/hopper.py +19 -15
  9. minitap/mobile_use/agents/orchestrator/orchestrator.py +14 -5
  10. minitap/mobile_use/agents/outputter/outputter.py +13 -3
  11. minitap/mobile_use/agents/planner/planner.md +20 -9
  12. minitap/mobile_use/agents/planner/planner.py +12 -5
  13. minitap/mobile_use/agents/screen_analyzer/human.md +16 -0
  14. minitap/mobile_use/agents/screen_analyzer/screen_analyzer.py +111 -0
  15. minitap/mobile_use/clients/ios_client.py +7 -3
  16. minitap/mobile_use/config.py +87 -24
  17. minitap/mobile_use/controllers/mobile_command_controller.py +354 -88
  18. minitap/mobile_use/controllers/platform_specific_commands_controller.py +41 -27
  19. minitap/mobile_use/controllers/types.py +95 -0
  20. minitap/mobile_use/graph/graph.py +55 -11
  21. minitap/mobile_use/graph/state.py +10 -3
  22. minitap/mobile_use/main.py +12 -4
  23. minitap/mobile_use/sdk/agent.py +109 -72
  24. minitap/mobile_use/sdk/examples/smart_notification_assistant.py +59 -10
  25. minitap/mobile_use/servers/device_hardware_bridge.py +13 -6
  26. minitap/mobile_use/services/llm.py +5 -2
  27. minitap/mobile_use/tools/index.py +7 -9
  28. minitap/mobile_use/tools/mobile/{clear_text.py → focus_and_clear_text.py} +7 -7
  29. minitap/mobile_use/tools/mobile/{input_text.py → focus_and_input_text.py} +8 -8
  30. minitap/mobile_use/tools/mobile/long_press_on.py +130 -15
  31. minitap/mobile_use/tools/mobile/swipe.py +3 -26
  32. minitap/mobile_use/tools/mobile/tap.py +41 -28
  33. minitap/mobile_use/tools/mobile/wait_for_delay.py +84 -0
  34. minitap/mobile_use/utils/cli_helpers.py +10 -6
  35. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.6.0.dist-info}/METADATA +1 -1
  36. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.6.0.dist-info}/RECORD +38 -36
  37. minitap/mobile_use/tools/mobile/glimpse_screen.py +0 -74
  38. minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +0 -64
  39. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.6.0.dist-info}/WHEEL +0 -0
  40. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.6.0.dist-info}/entry_points.txt +0 -0
@@ -2,10 +2,9 @@ from langchain_core.tools import BaseTool
2
2
 
3
3
  from minitap.mobile_use.context import MobileUseContext
4
4
  from minitap.mobile_use.tools.mobile.back import back_wrapper
5
- from minitap.mobile_use.tools.mobile.clear_text import clear_text_wrapper
6
5
  from minitap.mobile_use.tools.mobile.erase_one_char import erase_one_char_wrapper
7
- from minitap.mobile_use.tools.mobile.glimpse_screen import glimpse_screen_wrapper
8
- from minitap.mobile_use.tools.mobile.input_text import input_text_wrapper
6
+ from minitap.mobile_use.tools.mobile.focus_and_clear_text import focus_and_clear_text_wrapper
7
+ from minitap.mobile_use.tools.mobile.focus_and_input_text import focus_and_input_text_wrapper
9
8
  from minitap.mobile_use.tools.mobile.launch_app import launch_app_wrapper
10
9
  from minitap.mobile_use.tools.mobile.long_press_on import long_press_on_wrapper
11
10
  from minitap.mobile_use.tools.mobile.open_link import open_link_wrapper
@@ -13,8 +12,8 @@ from minitap.mobile_use.tools.mobile.press_key import press_key_wrapper
13
12
  from minitap.mobile_use.tools.mobile.stop_app import stop_app_wrapper
14
13
  from minitap.mobile_use.tools.mobile.swipe import swipe_wrapper
15
14
  from minitap.mobile_use.tools.mobile.tap import tap_wrapper
16
- from minitap.mobile_use.tools.mobile.wait_for_animation_to_end import (
17
- wait_for_animation_to_end_wrapper,
15
+ from minitap.mobile_use.tools.mobile.wait_for_delay import (
16
+ wait_for_delay_wrapper,
18
17
  )
19
18
  from minitap.mobile_use.tools.tool_wrapper import CompositeToolWrapper, ToolWrapper
20
19
 
@@ -24,14 +23,13 @@ EXECUTOR_WRAPPERS_TOOLS = [
24
23
  tap_wrapper,
25
24
  long_press_on_wrapper,
26
25
  swipe_wrapper,
27
- glimpse_screen_wrapper,
28
- input_text_wrapper,
26
+ focus_and_input_text_wrapper,
29
27
  erase_one_char_wrapper,
30
28
  launch_app_wrapper,
31
29
  stop_app_wrapper,
32
- clear_text_wrapper,
30
+ focus_and_clear_text_wrapper,
33
31
  press_key_wrapper,
34
- wait_for_animation_to_end_wrapper,
32
+ wait_for_delay_wrapper,
35
33
  ]
36
34
 
37
35
 
@@ -238,9 +238,9 @@ class TextClearer:
238
238
  )
239
239
 
240
240
 
241
- def get_clear_text_tool(ctx: MobileUseContext):
241
+ def get_focus_and_clear_text_tool(ctx: MobileUseContext):
242
242
  @tool
243
- async def clear_text(
243
+ async def focus_and_clear_text(
244
244
  tool_call_id: Annotated[str, InjectedToolCallId],
245
245
  state: Annotated[State, InjectedState],
246
246
  agent_thought: str,
@@ -255,9 +255,9 @@ def get_clear_text_tool(ctx: MobileUseContext):
255
255
  )
256
256
 
257
257
  agent_outcome = (
258
- clear_text_wrapper.on_failure_fn(result.error_message)
258
+ focus_and_clear_text_wrapper.on_failure_fn(result.error_message)
259
259
  if not result.success
260
- else clear_text_wrapper.on_success_fn(
260
+ else focus_and_clear_text_wrapper.on_success_fn(
261
261
  nb_char_erased=result.chars_erased, new_text_value=result.final_text
262
262
  )
263
263
  )
@@ -280,7 +280,7 @@ def get_clear_text_tool(ctx: MobileUseContext):
280
280
  ),
281
281
  )
282
282
 
283
- return clear_text
283
+ return focus_and_clear_text
284
284
 
285
285
 
286
286
  def _format_success_message(nb_char_erased: int, new_text_value: str | None) -> str:
@@ -299,8 +299,8 @@ def _format_failure_message(output: str | None) -> str:
299
299
  return "Failed to erase text. " + (str(output) if output else "")
300
300
 
301
301
 
302
- clear_text_wrapper = ToolWrapper(
303
- tool_fn_getter=get_clear_text_tool,
302
+ focus_and_clear_text_wrapper = ToolWrapper(
303
+ tool_fn_getter=get_focus_and_clear_text_tool,
304
304
  on_success_fn=_format_success_message,
305
305
  on_failure_fn=_format_failure_message,
306
306
  )
@@ -42,9 +42,9 @@ def _controller_input_text(ctx: MobileUseContext, text: str) -> InputResult:
42
42
  return InputResult(ok=False, error=str(controller_out))
43
43
 
44
44
 
45
- def get_input_text_tool(ctx: MobileUseContext):
45
+ def get_focus_and_input_text_tool(ctx: MobileUseContext):
46
46
  @tool
47
- async def input_text(
47
+ async def focus_and_input_text(
48
48
  tool_call_id: Annotated[str, InjectedToolCallId],
49
49
  state: Annotated[State, InjectedState],
50
50
  agent_thought: str,
@@ -70,7 +70,7 @@ def get_input_text_tool(ctx: MobileUseContext):
70
70
  error_message = "Failed to focus the text input element before typing."
71
71
  tool_message = ToolMessage(
72
72
  tool_call_id=tool_call_id,
73
- content=input_text_wrapper.on_failure_fn(text, error_message),
73
+ content=focus_and_input_text_wrapper.on_failure_fn(text, error_message),
74
74
  additional_kwargs={"error": error_message},
75
75
  status="error",
76
76
  )
@@ -103,9 +103,9 @@ def get_input_text_tool(ctx: MobileUseContext):
103
103
  text_input_content = get_element_text(element)
104
104
 
105
105
  agent_outcome = (
106
- input_text_wrapper.on_success_fn(text, text_input_content, target.resource_id)
106
+ focus_and_input_text_wrapper.on_success_fn(text, text_input_content, target.resource_id)
107
107
  if result.ok
108
- else input_text_wrapper.on_failure_fn(text, result.error)
108
+ else focus_and_input_text_wrapper.on_failure_fn(text, result.error)
109
109
  )
110
110
 
111
111
  tool_message = ToolMessage(
@@ -126,7 +126,7 @@ def get_input_text_tool(ctx: MobileUseContext):
126
126
  ),
127
127
  )
128
128
 
129
- return input_text
129
+ return focus_and_input_text
130
130
 
131
131
 
132
132
  def _on_input_success(text, text_input_content, text_input_resource_id):
@@ -141,8 +141,8 @@ def _on_input_success(text, text_input_content, text_input_resource_id):
141
141
  return "Typed text, should now verify before moving forward"
142
142
 
143
143
 
144
- input_text_wrapper = ToolWrapper(
145
- tool_fn_getter=get_input_text_tool,
144
+ focus_and_input_text_wrapper = ToolWrapper(
145
+ tool_fn_getter=get_focus_and_input_text_tool,
146
146
  on_success_fn=_on_input_success,
147
147
  on_failure_fn=lambda text, error: f"Failed to input text {repr(text)}. Reason: {error}",
148
148
  )
@@ -2,46 +2,158 @@ from typing import Annotated
2
2
 
3
3
  from langchain_core.messages import ToolMessage
4
4
  from langchain_core.tools import tool
5
- from langchain_core.tools.base import InjectedToolCallId
5
+ from langchain_core.tools.base import BaseTool, InjectedToolCallId
6
6
  from langgraph.prebuilt import InjectedState
7
7
  from langgraph.types import Command
8
8
 
9
9
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
10
10
  from minitap.mobile_use.context import MobileUseContext
11
- from minitap.mobile_use.controllers.mobile_command_controller import SelectorRequest
11
+ from minitap.mobile_use.controllers.mobile_command_controller import (
12
+ CoordinatesSelectorRequest,
13
+ IdSelectorRequest,
14
+ SelectorRequestWithCoordinates,
15
+ TextSelectorRequest,
16
+ )
12
17
  from minitap.mobile_use.controllers.mobile_command_controller import (
13
18
  long_press_on as long_press_on_controller,
14
19
  )
15
20
  from minitap.mobile_use.graph.state import State
16
21
  from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
22
+ from minitap.mobile_use.tools.types import Target
23
+ from minitap.mobile_use.utils.logger import get_logger
17
24
 
25
+ logger = get_logger(__name__)
18
26
 
19
- def get_long_press_on_tool(ctx: MobileUseContext):
27
+
28
+ def get_long_press_on_tool(ctx: MobileUseContext) -> BaseTool:
20
29
  @tool
21
30
  async def long_press_on(
22
31
  tool_call_id: Annotated[str, InjectedToolCallId],
23
32
  state: Annotated[State, InjectedState],
24
33
  agent_thought: str,
25
- selector_request: SelectorRequest,
26
- index: int | None = None,
27
- ) -> Command:
34
+ target: Target,
35
+ duration_ms: int = 1000,
36
+ ):
28
37
  """
29
- Long press on a UI element identified by the given selector.
30
- An index can be specified to select a specific element if multiple are found.
38
+ Long presses on a UI element identified by the 'target' object.
39
+
40
+ The 'target' object allows specifying an element by its resource_id
41
+ (with an optional index), its coordinates, or its text content (with an optional index).
42
+ The tool uses a fallback strategy, trying the locators in that order.
43
+
44
+ Args:
45
+ target: The UI element to long press on (coordinates, resource_id, or text).
46
+ duration_ms: Duration of the long press in milliseconds. Choose based on interaction:
47
+ - 500-800ms: Quick long press (e.g., selecting text, haptic feedback)
48
+ - 1000ms (default): Standard long press (most common use case)
49
+ - 1500-2000ms: Extended long press (e.g., context menus, special actions)
50
+ - 2500ms+: Very long press (e.g., accessibility, advanced gestures)
31
51
  """
32
- output = long_press_on_controller(ctx=ctx, selector_request=selector_request, index=index)
33
- has_failed = output is not None
52
+ error_obj: dict | None = {
53
+ "error": "No valid selector provided or all selectors failed."
54
+ } # Default to failure
55
+ latest_selector_info: str | None = None
56
+
57
+ # 1. Try with COORDINATES FIRST (visual approach)
58
+ if target.coordinates:
59
+ try:
60
+ center_point = target.coordinates.get_center()
61
+ selector = SelectorRequestWithCoordinates(
62
+ coordinates=CoordinatesSelectorRequest(x=center_point.x, y=center_point.y)
63
+ )
64
+ logger.info(
65
+ f"Attempting to long press using coordinates: {center_point.x},{center_point.y}"
66
+ )
67
+ latest_selector_info = f"coordinates='{target.coordinates}'"
68
+ result = long_press_on_controller(
69
+ ctx=ctx,
70
+ selector_request=selector,
71
+ ui_hierarchy=state.latest_ui_hierarchy,
72
+ long_press_duration=duration_ms,
73
+ )
74
+ if result is None: # Success
75
+ error_obj = None
76
+ else:
77
+ logger.warning(
78
+ f"Long press with coordinates '{target.coordinates}' failed. "
79
+ f"Error: {result}"
80
+ )
81
+ error_obj = {"error": result} if isinstance(result, str) else result
82
+ except Exception as e:
83
+ logger.warning(
84
+ f"Exception during long press with coordinates '{target.coordinates}': {e}"
85
+ )
86
+ error_obj = {"error": str(e)}
87
+
88
+ # 2. If coordinates failed or weren't provided, try with resource_id
89
+ if error_obj is not None and target.resource_id:
90
+ try:
91
+ selector = IdSelectorRequest(id=target.resource_id)
92
+ logger.info(
93
+ f"Attempting to long press using resource_id: '{target.resource_id}' "
94
+ f"at index {target.resource_id_index}"
95
+ )
96
+ latest_selector_info = (
97
+ f"resource_id='{target.resource_id}' (index={target.resource_id_index})"
98
+ )
99
+ result = long_press_on_controller(
100
+ ctx=ctx,
101
+ selector_request=selector,
102
+ index=target.resource_id_index,
103
+ ui_hierarchy=state.latest_ui_hierarchy,
104
+ long_press_duration=duration_ms,
105
+ )
106
+ if result is None: # Success
107
+ error_obj = None
108
+ else:
109
+ logger.warning(
110
+ f"Long press with resource_id '{target.resource_id}' failed. "
111
+ f"Error: {result}"
112
+ )
113
+ error_obj = {"error": result} if isinstance(result, str) else result
114
+ except Exception as e:
115
+ logger.warning(
116
+ f"Exception during long press with resource_id '{target.resource_id}': {e}"
117
+ )
118
+ error_obj = {"error": str(e)}
119
+
120
+ # 3. If resource_id failed or wasn't provided, try with text (last resort)
121
+ if error_obj is not None and target.text:
122
+ try:
123
+ selector = TextSelectorRequest(text=target.text)
124
+ logger.info(
125
+ f"Attempting to long press using text: '{target.text}' "
126
+ f"at index {target.text_index}"
127
+ )
128
+ latest_selector_info = f"text='{target.text}' (index={target.text_index})"
129
+ result = long_press_on_controller(
130
+ ctx=ctx,
131
+ selector_request=selector,
132
+ index=target.text_index,
133
+ ui_hierarchy=state.latest_ui_hierarchy,
134
+ long_press_duration=duration_ms,
135
+ )
136
+ if result is None: # Success
137
+ error_obj = None
138
+ else:
139
+ logger.warning(f"Long press with text '{target.text}' failed. Error: {result}")
140
+ error_obj = {"error": result} if isinstance(result, str) else result
141
+ except Exception as e:
142
+ logger.warning(f"Exception during long press with text '{target.text}': {e}")
143
+ error_obj = {"error": str(e)}
34
144
 
145
+ has_failed = error_obj is not None
146
+ final_selector_info = latest_selector_info if latest_selector_info else "N/A"
35
147
  agent_outcome = (
36
- long_press_on_wrapper.on_failure_fn()
148
+ long_press_on_wrapper.on_failure_fn(final_selector_info)
37
149
  if has_failed
38
- else long_press_on_wrapper.on_success_fn()
150
+ else long_press_on_wrapper.on_success_fn(final_selector_info)
39
151
  )
40
152
 
41
153
  tool_message = ToolMessage(
42
154
  tool_call_id=tool_call_id,
43
155
  content=agent_outcome,
44
- additional_kwargs={"error": output} if has_failed else {},
156
+ additional_kwargs=error_obj if has_failed else {},
45
157
  status="error" if has_failed else "success",
46
158
  )
47
159
  return Command(
@@ -60,6 +172,9 @@ def get_long_press_on_tool(ctx: MobileUseContext):
60
172
 
61
173
  long_press_on_wrapper = ToolWrapper(
62
174
  tool_fn_getter=get_long_press_on_tool,
63
- on_success_fn=lambda: "Long press on is successful.",
64
- on_failure_fn=lambda: "Failed to long press on.",
175
+ on_success_fn=lambda selector_info: (
176
+ f"Long press on element with {selector_info} was successful."
177
+ ),
178
+ on_failure_fn=lambda selector_info: "Failed to long press on element. "
179
+ + f"Last attempt was with {selector_info}.",
65
180
  )
@@ -9,15 +9,14 @@ from pydantic import Field
9
9
 
10
10
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
11
11
  from minitap.mobile_use.context import MobileUseContext
12
- from minitap.mobile_use.controllers.mobile_command_controller import (
12
+ from minitap.mobile_use.controllers.mobile_command_controller import swipe as swipe_controller
13
+ from minitap.mobile_use.controllers.types import (
13
14
  CoordinatesSelectorRequest,
14
15
  PercentagesSelectorRequest,
15
- SwipeDirection,
16
16
  SwipeRequest,
17
17
  SwipeStartEndCoordinatesRequest,
18
18
  SwipeStartEndPercentagesRequest,
19
19
  )
20
- from minitap.mobile_use.controllers.mobile_command_controller import swipe as swipe_controller
21
20
  from minitap.mobile_use.graph.state import State
22
21
  from minitap.mobile_use.tools.tool_wrapper import CompositeToolWrapper
23
22
 
@@ -123,29 +122,7 @@ def get_composite_swipe_tools(ctx: MobileUseContext) -> list[BaseTool]:
123
122
  }
124
123
  )
125
124
 
126
- @tool
127
- def swipe_direction(
128
- agent_thought: str,
129
- tool_call_id: Annotated[str, InjectedToolCallId],
130
- state: Annotated[State, InjectedState],
131
- direction: SwipeDirection,
132
- duration: int = Field(description="Duration in ms", ge=1, le=10000, default=400),
133
- ):
134
- """Swipe in a specific direction across the screen."""
135
- swipe_request = SwipeRequest(
136
- swipe_mode=direction,
137
- duration=duration,
138
- )
139
- return get_swipe_tool(ctx=ctx).invoke(
140
- input={
141
- "tool_call_id": tool_call_id,
142
- "state": state,
143
- "agent_thought": agent_thought,
144
- "swipe_request": swipe_request,
145
- }
146
- )
147
-
148
- return [swipe_coordinates, swipe_percentages, swipe_direction]
125
+ return [swipe_coordinates, swipe_percentages]
149
126
 
150
127
 
151
128
  swipe_wrapper = CompositeToolWrapper(
@@ -2,7 +2,7 @@ from typing import Annotated
2
2
 
3
3
  from langchain_core.messages import ToolMessage
4
4
  from langchain_core.tools import tool
5
- from langchain_core.tools.base import InjectedToolCallId
5
+ from langchain_core.tools.base import BaseTool, InjectedToolCallId
6
6
  from langgraph.prebuilt import InjectedState
7
7
  from langgraph.types import Command
8
8
 
@@ -23,7 +23,7 @@ from minitap.mobile_use.utils.logger import get_logger
23
23
  logger = get_logger(__name__)
24
24
 
25
25
 
26
- def get_tap_tool(ctx: MobileUseContext):
26
+ def get_tap_tool(ctx: MobileUseContext) -> BaseTool:
27
27
  @tool
28
28
  async def tap(
29
29
  tool_call_id: Annotated[str, InjectedToolCallId],
@@ -41,67 +41,79 @@ def get_tap_tool(ctx: MobileUseContext):
41
41
  output = {
42
42
  "error": "No valid selector provided or all selectors failed."
43
43
  } # Default to failure
44
- final_selector_info = "N/A"
44
+ latest_selector_info: str | None = None
45
45
 
46
- # 1. Try with resource_id
47
- if target.resource_id:
46
+ # 1. Try with COORDINATES FIRST (visual approach)
47
+ if target.coordinates:
48
48
  try:
49
- selector = IdSelectorRequest(id=target.resource_id)
49
+ center_point = target.coordinates.get_center()
50
+ selector = SelectorRequestWithCoordinates(
51
+ coordinates=CoordinatesSelectorRequest(x=center_point.x, y=center_point.y)
52
+ )
50
53
  logger.info(
51
- f"Attempting to tap using resource_id: '{target.resource_id}' "
52
- f"at index {target.resource_id_index}"
54
+ f"Attempting to tap using coordinates: {center_point.x},{center_point.y}"
53
55
  )
56
+ latest_selector_info = f"coordinates='{target.coordinates}'"
54
57
  result = tap_controller(
55
- ctx=ctx, selector_request=selector, index=target.resource_id_index
58
+ ctx=ctx,
59
+ selector_request=selector,
60
+ ui_hierarchy=state.latest_ui_hierarchy,
56
61
  )
57
62
  if result is None: # Success
58
63
  output = None
59
- final_selector_info = (
60
- f"resource_id='{target.resource_id}' (index={target.resource_id_index})"
61
- )
62
64
  else:
63
65
  logger.warning(
64
- f"Tap with resource_id '{target.resource_id}' failed. Error: {result}"
66
+ f"Tap with coordinates '{target.coordinates}' failed. Error: {result}"
65
67
  )
66
68
  output = result
67
69
  except Exception as e:
68
- logger.warning(f"Exception during tap with resource_id '{target.resource_id}': {e}")
70
+ logger.warning(f"Exception during tap with coordinates '{target.coordinates}': {e}")
69
71
  output = {"error": str(e)}
70
72
 
71
- # 2. If resource_id failed or wasn't provided, try with coordinates
72
- if output is not None and target.coordinates:
73
+ # 2. If coordinates failed or weren't provided, try with resource_id
74
+ if output is not None and target.resource_id:
73
75
  try:
74
- center_point = target.coordinates.get_center()
75
- selector = SelectorRequestWithCoordinates(
76
- coordinates=CoordinatesSelectorRequest(x=center_point.x, y=center_point.y)
77
- )
76
+ selector = IdSelectorRequest(id=target.resource_id)
78
77
  logger.info(
79
- f"Attempting to tap using coordinates: {center_point.x},{center_point.y}"
78
+ f"Attempting to tap using resource_id: '{target.resource_id}' "
79
+ f"at index {target.resource_id_index}"
80
+ )
81
+ latest_selector_info = (
82
+ f"resource_id='{target.resource_id}' (index={target.resource_id_index})"
83
+ )
84
+ result = tap_controller(
85
+ ctx=ctx,
86
+ selector_request=selector,
87
+ index=target.resource_id_index,
88
+ ui_hierarchy=state.latest_ui_hierarchy,
80
89
  )
81
- result = tap_controller(ctx=ctx, selector_request=selector)
82
90
  if result is None: # Success
83
91
  output = None
84
- final_selector_info = f"coordinates='{target.coordinates}'"
85
92
  else:
86
93
  logger.warning(
87
- f"Tap with coordinates '{target.coordinates}' failed. Error: {result}"
94
+ f"Tap with resource_id '{target.resource_id}' failed. Error: {result}"
88
95
  )
89
96
  output = result
90
97
  except Exception as e:
91
- logger.warning(f"Exception during tap with coordinates '{target.coordinates}': {e}")
98
+ logger.warning(f"Exception during tap with resource_id '{target.resource_id}': {e}")
92
99
  output = {"error": str(e)}
93
100
 
94
- # 3. If coordinates failed or weren't provided, try with text
101
+ # 3. If resource_id failed or wasn't provided, try with text (last resort)
95
102
  if output is not None and target.text:
96
103
  try:
97
104
  selector = TextSelectorRequest(text=target.text)
98
105
  logger.info(
99
106
  f"Attempting to tap using text: '{target.text}' at index {target.text_index}"
100
107
  )
101
- result = tap_controller(ctx=ctx, selector_request=selector, index=target.text_index)
108
+ latest_selector_info = f"text='{target.text}' (index={target.text_index})"
109
+ result = tap_controller(
110
+ ctx=ctx,
111
+ selector_request=selector,
112
+ index=target.text_index,
113
+ ui_hierarchy=state.latest_ui_hierarchy,
114
+ )
102
115
  if result is None: # Success
103
116
  output = None
104
- final_selector_info = f"text='{target.text}' (index={target.text_index})"
105
117
  else:
106
118
  logger.warning(f"Tap with text '{target.text}' failed. Error: {result}")
107
119
  output = result
@@ -110,6 +122,7 @@ def get_tap_tool(ctx: MobileUseContext):
110
122
  output = {"error": str(e)}
111
123
 
112
124
  has_failed = output is not None
125
+ final_selector_info = latest_selector_info if latest_selector_info else "N/A"
113
126
  agent_outcome = (
114
127
  tap_wrapper.on_failure_fn(final_selector_info)
115
128
  if has_failed
@@ -0,0 +1,84 @@
1
+ import asyncio
2
+ from typing import Annotated
3
+
4
+ from langchain_core.messages import ToolMessage
5
+ from langchain_core.tools import tool
6
+ from langchain_core.tools.base import InjectedToolCallId
7
+ from langgraph.prebuilt import InjectedState
8
+ from langgraph.types import Command
9
+
10
+ from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
11
+ from minitap.mobile_use.context import MobileUseContext
12
+ from minitap.mobile_use.controllers.mobile_command_controller import (
13
+ wait_for_delay as wait_for_delay_controller,
14
+ )
15
+ from minitap.mobile_use.graph.state import State
16
+ from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
17
+
18
+ MAX_DELAY_MS = 60000
19
+
20
+
21
+ def get_wait_for_delay_tool(ctx: MobileUseContext):
22
+ @tool
23
+ async def wait_for_delay(
24
+ tool_call_id: Annotated[str, InjectedToolCallId],
25
+ state: Annotated[State, InjectedState],
26
+ agent_thought: str,
27
+ time_in_ms: int,
28
+ ) -> Command:
29
+ """
30
+ Wait for a delay in milliseconds.
31
+
32
+ This tool pauses execution for a specified number of milliseconds.
33
+ Use this when you need to introduce a controlled delay to allow the UI
34
+ to update after an action, regardless of whether an animation is playing.
35
+
36
+ Args:
37
+ time_in_ms: The number of milliseconds to wait. (capped at 60 seconds)
38
+
39
+
40
+ Example:
41
+ - wait_for_delay with time_in_ms=1000 (waits 1 second)
42
+ - wait_for_delay with time_in_ms=500 (waits 0.5 seconds)
43
+ """
44
+ if time_in_ms < 0:
45
+ time_in_ms = 1000
46
+ if time_in_ms > MAX_DELAY_MS:
47
+ time_in_ms = MAX_DELAY_MS
48
+ try:
49
+ await asyncio.to_thread(wait_for_delay_controller, time_in_ms)
50
+ output = None
51
+ has_failed = False
52
+ except Exception as e:
53
+ output = str(e)
54
+ has_failed = True
55
+ agent_outcome = (
56
+ wait_for_delay_wrapper.on_failure_fn()
57
+ if has_failed
58
+ else wait_for_delay_wrapper.on_success_fn(time_in_ms)
59
+ )
60
+ tool_message = ToolMessage(
61
+ tool_call_id=tool_call_id,
62
+ content=agent_outcome,
63
+ additional_kwargs={"error": output} if has_failed else {},
64
+ status="error" if has_failed else "success",
65
+ )
66
+ return Command(
67
+ update=await state.asanitize_update(
68
+ ctx=ctx,
69
+ update={
70
+ "agents_thoughts": [agent_thought, agent_outcome],
71
+ EXECUTOR_MESSAGES_KEY: [tool_message],
72
+ },
73
+ agent="executor",
74
+ ),
75
+ )
76
+
77
+ return wait_for_delay
78
+
79
+
80
+ wait_for_delay_wrapper = ToolWrapper(
81
+ tool_fn_getter=get_wait_for_delay_tool,
82
+ on_success_fn=lambda delay: f"Successfully waited for {delay} milliseconds.",
83
+ on_failure_fn=lambda: "Failed to wait for delay.",
84
+ )
@@ -1,9 +1,10 @@
1
1
  import sys
2
2
 
3
- from minitap.mobile_use.clients.ios_client import get_ios_devices
4
3
  from adbutils import AdbClient
5
4
  from rich.console import Console
6
5
 
6
+ from minitap.mobile_use.clients.ios_client import get_ios_devices
7
+
7
8
 
8
9
  def display_device_status(console: Console, adb_client: AdbClient | None = None):
9
10
  """Checks for connected devices and displays the status."""
@@ -17,21 +18,24 @@ def display_device_status(console: Console, adb_client: AdbClient | None = None)
17
18
  console.print(f" - {device.serial}")
18
19
  else:
19
20
  console.print("❌ [bold red]No Android device found.[/bold red]")
20
- console.print("Please make sure your emulator is running or a device is connected via USB.")
21
21
  command = "emulator -avd <avd_name>"
22
22
  if sys.platform not in ["win32", "darwin"]:
23
23
  command = f"./{command}"
24
- console.print(f"You can start an emulator using a command like: [bold]'{command}'[/bold]")
25
- console.print("[italic]iOS detection coming soon...[/italic]")
24
+ console.print(
25
+ f"You can start an emulator using a command like: [bold]'{command}'[/bold]"
26
+ )
26
27
 
27
28
  xcrun_available, ios_devices, error_message = get_ios_devices()
28
29
  if xcrun_available:
29
30
  if ios_devices:
30
31
  console.print("✅ [bold green]iOS device(s) connected:[/bold green]")
31
32
  for device in ios_devices:
32
- console.print(f" - {device}")
33
+ console.print(f" - [green]{device}[/green]")
33
34
  else:
34
- console.print("❌ [bold red]No iOS device found.[/bold red]")
35
+ console.print(
36
+ "❌ [bold red]No iOS device found. We only support iOS simulators for now."
37
+ "[/bold red]"
38
+ )
35
39
  console.print(
36
40
  "[iOS] Please make sure your emulator is running or a device is connected via USB."
37
41
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: minitap-mobile-use
3
- Version: 2.5.3
3
+ Version: 2.6.0
4
4
  Summary: AI-powered multi-agent system that automates real Android and iOS devices through low-level control using LangGraph.
5
5
  Author: Pierre-Louis Favreau, Jean-Pierre Lo, Nicolas Dehandschoewercker
6
6
  License: MIT License