minitap-mobile-use 2.5.3__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of minitap-mobile-use might be problematic. Click here for more details.
- minitap/mobile_use/agents/contextor/contextor.py +0 -8
- minitap/mobile_use/agents/cortex/cortex.md +122 -36
- minitap/mobile_use/agents/cortex/cortex.py +32 -17
- minitap/mobile_use/agents/cortex/types.py +18 -4
- minitap/mobile_use/agents/executor/executor.md +3 -3
- minitap/mobile_use/agents/executor/executor.py +10 -3
- minitap/mobile_use/agents/hopper/hopper.md +30 -2
- minitap/mobile_use/agents/hopper/hopper.py +19 -15
- minitap/mobile_use/agents/orchestrator/orchestrator.py +14 -5
- minitap/mobile_use/agents/outputter/outputter.py +13 -3
- minitap/mobile_use/agents/planner/planner.md +20 -9
- minitap/mobile_use/agents/planner/planner.py +12 -5
- minitap/mobile_use/agents/screen_analyzer/human.md +16 -0
- minitap/mobile_use/agents/screen_analyzer/screen_analyzer.py +111 -0
- minitap/mobile_use/clients/ios_client.py +7 -3
- minitap/mobile_use/config.py +87 -24
- minitap/mobile_use/controllers/mobile_command_controller.py +354 -88
- minitap/mobile_use/controllers/platform_specific_commands_controller.py +41 -27
- minitap/mobile_use/controllers/types.py +95 -0
- minitap/mobile_use/graph/graph.py +55 -11
- minitap/mobile_use/graph/state.py +10 -3
- minitap/mobile_use/main.py +12 -4
- minitap/mobile_use/sdk/agent.py +109 -72
- minitap/mobile_use/sdk/examples/smart_notification_assistant.py +59 -10
- minitap/mobile_use/servers/device_hardware_bridge.py +13 -6
- minitap/mobile_use/services/llm.py +5 -2
- minitap/mobile_use/tools/index.py +7 -9
- minitap/mobile_use/tools/mobile/{clear_text.py → focus_and_clear_text.py} +7 -7
- minitap/mobile_use/tools/mobile/{input_text.py → focus_and_input_text.py} +8 -8
- minitap/mobile_use/tools/mobile/long_press_on.py +130 -15
- minitap/mobile_use/tools/mobile/swipe.py +3 -26
- minitap/mobile_use/tools/mobile/tap.py +41 -28
- minitap/mobile_use/tools/mobile/wait_for_delay.py +84 -0
- minitap/mobile_use/utils/cli_helpers.py +10 -6
- {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.6.0.dist-info}/METADATA +1 -1
- {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.6.0.dist-info}/RECORD +38 -36
- minitap/mobile_use/tools/mobile/glimpse_screen.py +0 -74
- minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +0 -64
- {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.6.0.dist-info}/WHEEL +0 -0
- {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.6.0.dist-info}/entry_points.txt +0 -0
|
@@ -2,10 +2,9 @@ from langchain_core.tools import BaseTool
|
|
|
2
2
|
|
|
3
3
|
from minitap.mobile_use.context import MobileUseContext
|
|
4
4
|
from minitap.mobile_use.tools.mobile.back import back_wrapper
|
|
5
|
-
from minitap.mobile_use.tools.mobile.clear_text import clear_text_wrapper
|
|
6
5
|
from minitap.mobile_use.tools.mobile.erase_one_char import erase_one_char_wrapper
|
|
7
|
-
from minitap.mobile_use.tools.mobile.
|
|
8
|
-
from minitap.mobile_use.tools.mobile.
|
|
6
|
+
from minitap.mobile_use.tools.mobile.focus_and_clear_text import focus_and_clear_text_wrapper
|
|
7
|
+
from minitap.mobile_use.tools.mobile.focus_and_input_text import focus_and_input_text_wrapper
|
|
9
8
|
from minitap.mobile_use.tools.mobile.launch_app import launch_app_wrapper
|
|
10
9
|
from minitap.mobile_use.tools.mobile.long_press_on import long_press_on_wrapper
|
|
11
10
|
from minitap.mobile_use.tools.mobile.open_link import open_link_wrapper
|
|
@@ -13,8 +12,8 @@ from minitap.mobile_use.tools.mobile.press_key import press_key_wrapper
|
|
|
13
12
|
from minitap.mobile_use.tools.mobile.stop_app import stop_app_wrapper
|
|
14
13
|
from minitap.mobile_use.tools.mobile.swipe import swipe_wrapper
|
|
15
14
|
from minitap.mobile_use.tools.mobile.tap import tap_wrapper
|
|
16
|
-
from minitap.mobile_use.tools.mobile.
|
|
17
|
-
|
|
15
|
+
from minitap.mobile_use.tools.mobile.wait_for_delay import (
|
|
16
|
+
wait_for_delay_wrapper,
|
|
18
17
|
)
|
|
19
18
|
from minitap.mobile_use.tools.tool_wrapper import CompositeToolWrapper, ToolWrapper
|
|
20
19
|
|
|
@@ -24,14 +23,13 @@ EXECUTOR_WRAPPERS_TOOLS = [
|
|
|
24
23
|
tap_wrapper,
|
|
25
24
|
long_press_on_wrapper,
|
|
26
25
|
swipe_wrapper,
|
|
27
|
-
|
|
28
|
-
input_text_wrapper,
|
|
26
|
+
focus_and_input_text_wrapper,
|
|
29
27
|
erase_one_char_wrapper,
|
|
30
28
|
launch_app_wrapper,
|
|
31
29
|
stop_app_wrapper,
|
|
32
|
-
|
|
30
|
+
focus_and_clear_text_wrapper,
|
|
33
31
|
press_key_wrapper,
|
|
34
|
-
|
|
32
|
+
wait_for_delay_wrapper,
|
|
35
33
|
]
|
|
36
34
|
|
|
37
35
|
|
|
@@ -238,9 +238,9 @@ class TextClearer:
|
|
|
238
238
|
)
|
|
239
239
|
|
|
240
240
|
|
|
241
|
-
def
|
|
241
|
+
def get_focus_and_clear_text_tool(ctx: MobileUseContext):
|
|
242
242
|
@tool
|
|
243
|
-
async def
|
|
243
|
+
async def focus_and_clear_text(
|
|
244
244
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
245
245
|
state: Annotated[State, InjectedState],
|
|
246
246
|
agent_thought: str,
|
|
@@ -255,9 +255,9 @@ def get_clear_text_tool(ctx: MobileUseContext):
|
|
|
255
255
|
)
|
|
256
256
|
|
|
257
257
|
agent_outcome = (
|
|
258
|
-
|
|
258
|
+
focus_and_clear_text_wrapper.on_failure_fn(result.error_message)
|
|
259
259
|
if not result.success
|
|
260
|
-
else
|
|
260
|
+
else focus_and_clear_text_wrapper.on_success_fn(
|
|
261
261
|
nb_char_erased=result.chars_erased, new_text_value=result.final_text
|
|
262
262
|
)
|
|
263
263
|
)
|
|
@@ -280,7 +280,7 @@ def get_clear_text_tool(ctx: MobileUseContext):
|
|
|
280
280
|
),
|
|
281
281
|
)
|
|
282
282
|
|
|
283
|
-
return
|
|
283
|
+
return focus_and_clear_text
|
|
284
284
|
|
|
285
285
|
|
|
286
286
|
def _format_success_message(nb_char_erased: int, new_text_value: str | None) -> str:
|
|
@@ -299,8 +299,8 @@ def _format_failure_message(output: str | None) -> str:
|
|
|
299
299
|
return "Failed to erase text. " + (str(output) if output else "")
|
|
300
300
|
|
|
301
301
|
|
|
302
|
-
|
|
303
|
-
tool_fn_getter=
|
|
302
|
+
focus_and_clear_text_wrapper = ToolWrapper(
|
|
303
|
+
tool_fn_getter=get_focus_and_clear_text_tool,
|
|
304
304
|
on_success_fn=_format_success_message,
|
|
305
305
|
on_failure_fn=_format_failure_message,
|
|
306
306
|
)
|
|
@@ -42,9 +42,9 @@ def _controller_input_text(ctx: MobileUseContext, text: str) -> InputResult:
|
|
|
42
42
|
return InputResult(ok=False, error=str(controller_out))
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
def
|
|
45
|
+
def get_focus_and_input_text_tool(ctx: MobileUseContext):
|
|
46
46
|
@tool
|
|
47
|
-
async def
|
|
47
|
+
async def focus_and_input_text(
|
|
48
48
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
49
49
|
state: Annotated[State, InjectedState],
|
|
50
50
|
agent_thought: str,
|
|
@@ -70,7 +70,7 @@ def get_input_text_tool(ctx: MobileUseContext):
|
|
|
70
70
|
error_message = "Failed to focus the text input element before typing."
|
|
71
71
|
tool_message = ToolMessage(
|
|
72
72
|
tool_call_id=tool_call_id,
|
|
73
|
-
content=
|
|
73
|
+
content=focus_and_input_text_wrapper.on_failure_fn(text, error_message),
|
|
74
74
|
additional_kwargs={"error": error_message},
|
|
75
75
|
status="error",
|
|
76
76
|
)
|
|
@@ -103,9 +103,9 @@ def get_input_text_tool(ctx: MobileUseContext):
|
|
|
103
103
|
text_input_content = get_element_text(element)
|
|
104
104
|
|
|
105
105
|
agent_outcome = (
|
|
106
|
-
|
|
106
|
+
focus_and_input_text_wrapper.on_success_fn(text, text_input_content, target.resource_id)
|
|
107
107
|
if result.ok
|
|
108
|
-
else
|
|
108
|
+
else focus_and_input_text_wrapper.on_failure_fn(text, result.error)
|
|
109
109
|
)
|
|
110
110
|
|
|
111
111
|
tool_message = ToolMessage(
|
|
@@ -126,7 +126,7 @@ def get_input_text_tool(ctx: MobileUseContext):
|
|
|
126
126
|
),
|
|
127
127
|
)
|
|
128
128
|
|
|
129
|
-
return
|
|
129
|
+
return focus_and_input_text
|
|
130
130
|
|
|
131
131
|
|
|
132
132
|
def _on_input_success(text, text_input_content, text_input_resource_id):
|
|
@@ -141,8 +141,8 @@ def _on_input_success(text, text_input_content, text_input_resource_id):
|
|
|
141
141
|
return "Typed text, should now verify before moving forward"
|
|
142
142
|
|
|
143
143
|
|
|
144
|
-
|
|
145
|
-
tool_fn_getter=
|
|
144
|
+
focus_and_input_text_wrapper = ToolWrapper(
|
|
145
|
+
tool_fn_getter=get_focus_and_input_text_tool,
|
|
146
146
|
on_success_fn=_on_input_success,
|
|
147
147
|
on_failure_fn=lambda text, error: f"Failed to input text {repr(text)}. Reason: {error}",
|
|
148
148
|
)
|
|
@@ -2,46 +2,158 @@ from typing import Annotated
|
|
|
2
2
|
|
|
3
3
|
from langchain_core.messages import ToolMessage
|
|
4
4
|
from langchain_core.tools import tool
|
|
5
|
-
from langchain_core.tools.base import InjectedToolCallId
|
|
5
|
+
from langchain_core.tools.base import BaseTool, InjectedToolCallId
|
|
6
6
|
from langgraph.prebuilt import InjectedState
|
|
7
7
|
from langgraph.types import Command
|
|
8
8
|
|
|
9
9
|
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
10
10
|
from minitap.mobile_use.context import MobileUseContext
|
|
11
|
-
from minitap.mobile_use.controllers.mobile_command_controller import
|
|
11
|
+
from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
12
|
+
CoordinatesSelectorRequest,
|
|
13
|
+
IdSelectorRequest,
|
|
14
|
+
SelectorRequestWithCoordinates,
|
|
15
|
+
TextSelectorRequest,
|
|
16
|
+
)
|
|
12
17
|
from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
13
18
|
long_press_on as long_press_on_controller,
|
|
14
19
|
)
|
|
15
20
|
from minitap.mobile_use.graph.state import State
|
|
16
21
|
from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
|
|
22
|
+
from minitap.mobile_use.tools.types import Target
|
|
23
|
+
from minitap.mobile_use.utils.logger import get_logger
|
|
17
24
|
|
|
25
|
+
logger = get_logger(__name__)
|
|
18
26
|
|
|
19
|
-
|
|
27
|
+
|
|
28
|
+
def get_long_press_on_tool(ctx: MobileUseContext) -> BaseTool:
|
|
20
29
|
@tool
|
|
21
30
|
async def long_press_on(
|
|
22
31
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
23
32
|
state: Annotated[State, InjectedState],
|
|
24
33
|
agent_thought: str,
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
)
|
|
34
|
+
target: Target,
|
|
35
|
+
duration_ms: int = 1000,
|
|
36
|
+
):
|
|
28
37
|
"""
|
|
29
|
-
Long
|
|
30
|
-
|
|
38
|
+
Long presses on a UI element identified by the 'target' object.
|
|
39
|
+
|
|
40
|
+
The 'target' object allows specifying an element by its resource_id
|
|
41
|
+
(with an optional index), its coordinates, or its text content (with an optional index).
|
|
42
|
+
The tool uses a fallback strategy, trying the locators in that order.
|
|
43
|
+
|
|
44
|
+
Args:
|
|
45
|
+
target: The UI element to long press on (coordinates, resource_id, or text).
|
|
46
|
+
duration_ms: Duration of the long press in milliseconds. Choose based on interaction:
|
|
47
|
+
- 500-800ms: Quick long press (e.g., selecting text, haptic feedback)
|
|
48
|
+
- 1000ms (default): Standard long press (most common use case)
|
|
49
|
+
- 1500-2000ms: Extended long press (e.g., context menus, special actions)
|
|
50
|
+
- 2500ms+: Very long press (e.g., accessibility, advanced gestures)
|
|
31
51
|
"""
|
|
32
|
-
|
|
33
|
-
|
|
52
|
+
error_obj: dict | None = {
|
|
53
|
+
"error": "No valid selector provided or all selectors failed."
|
|
54
|
+
} # Default to failure
|
|
55
|
+
latest_selector_info: str | None = None
|
|
56
|
+
|
|
57
|
+
# 1. Try with COORDINATES FIRST (visual approach)
|
|
58
|
+
if target.coordinates:
|
|
59
|
+
try:
|
|
60
|
+
center_point = target.coordinates.get_center()
|
|
61
|
+
selector = SelectorRequestWithCoordinates(
|
|
62
|
+
coordinates=CoordinatesSelectorRequest(x=center_point.x, y=center_point.y)
|
|
63
|
+
)
|
|
64
|
+
logger.info(
|
|
65
|
+
f"Attempting to long press using coordinates: {center_point.x},{center_point.y}"
|
|
66
|
+
)
|
|
67
|
+
latest_selector_info = f"coordinates='{target.coordinates}'"
|
|
68
|
+
result = long_press_on_controller(
|
|
69
|
+
ctx=ctx,
|
|
70
|
+
selector_request=selector,
|
|
71
|
+
ui_hierarchy=state.latest_ui_hierarchy,
|
|
72
|
+
long_press_duration=duration_ms,
|
|
73
|
+
)
|
|
74
|
+
if result is None: # Success
|
|
75
|
+
error_obj = None
|
|
76
|
+
else:
|
|
77
|
+
logger.warning(
|
|
78
|
+
f"Long press with coordinates '{target.coordinates}' failed. "
|
|
79
|
+
f"Error: {result}"
|
|
80
|
+
)
|
|
81
|
+
error_obj = {"error": result} if isinstance(result, str) else result
|
|
82
|
+
except Exception as e:
|
|
83
|
+
logger.warning(
|
|
84
|
+
f"Exception during long press with coordinates '{target.coordinates}': {e}"
|
|
85
|
+
)
|
|
86
|
+
error_obj = {"error": str(e)}
|
|
87
|
+
|
|
88
|
+
# 2. If coordinates failed or weren't provided, try with resource_id
|
|
89
|
+
if error_obj is not None and target.resource_id:
|
|
90
|
+
try:
|
|
91
|
+
selector = IdSelectorRequest(id=target.resource_id)
|
|
92
|
+
logger.info(
|
|
93
|
+
f"Attempting to long press using resource_id: '{target.resource_id}' "
|
|
94
|
+
f"at index {target.resource_id_index}"
|
|
95
|
+
)
|
|
96
|
+
latest_selector_info = (
|
|
97
|
+
f"resource_id='{target.resource_id}' (index={target.resource_id_index})"
|
|
98
|
+
)
|
|
99
|
+
result = long_press_on_controller(
|
|
100
|
+
ctx=ctx,
|
|
101
|
+
selector_request=selector,
|
|
102
|
+
index=target.resource_id_index,
|
|
103
|
+
ui_hierarchy=state.latest_ui_hierarchy,
|
|
104
|
+
long_press_duration=duration_ms,
|
|
105
|
+
)
|
|
106
|
+
if result is None: # Success
|
|
107
|
+
error_obj = None
|
|
108
|
+
else:
|
|
109
|
+
logger.warning(
|
|
110
|
+
f"Long press with resource_id '{target.resource_id}' failed. "
|
|
111
|
+
f"Error: {result}"
|
|
112
|
+
)
|
|
113
|
+
error_obj = {"error": result} if isinstance(result, str) else result
|
|
114
|
+
except Exception as e:
|
|
115
|
+
logger.warning(
|
|
116
|
+
f"Exception during long press with resource_id '{target.resource_id}': {e}"
|
|
117
|
+
)
|
|
118
|
+
error_obj = {"error": str(e)}
|
|
119
|
+
|
|
120
|
+
# 3. If resource_id failed or wasn't provided, try with text (last resort)
|
|
121
|
+
if error_obj is not None and target.text:
|
|
122
|
+
try:
|
|
123
|
+
selector = TextSelectorRequest(text=target.text)
|
|
124
|
+
logger.info(
|
|
125
|
+
f"Attempting to long press using text: '{target.text}' "
|
|
126
|
+
f"at index {target.text_index}"
|
|
127
|
+
)
|
|
128
|
+
latest_selector_info = f"text='{target.text}' (index={target.text_index})"
|
|
129
|
+
result = long_press_on_controller(
|
|
130
|
+
ctx=ctx,
|
|
131
|
+
selector_request=selector,
|
|
132
|
+
index=target.text_index,
|
|
133
|
+
ui_hierarchy=state.latest_ui_hierarchy,
|
|
134
|
+
long_press_duration=duration_ms,
|
|
135
|
+
)
|
|
136
|
+
if result is None: # Success
|
|
137
|
+
error_obj = None
|
|
138
|
+
else:
|
|
139
|
+
logger.warning(f"Long press with text '{target.text}' failed. Error: {result}")
|
|
140
|
+
error_obj = {"error": result} if isinstance(result, str) else result
|
|
141
|
+
except Exception as e:
|
|
142
|
+
logger.warning(f"Exception during long press with text '{target.text}': {e}")
|
|
143
|
+
error_obj = {"error": str(e)}
|
|
34
144
|
|
|
145
|
+
has_failed = error_obj is not None
|
|
146
|
+
final_selector_info = latest_selector_info if latest_selector_info else "N/A"
|
|
35
147
|
agent_outcome = (
|
|
36
|
-
long_press_on_wrapper.on_failure_fn()
|
|
148
|
+
long_press_on_wrapper.on_failure_fn(final_selector_info)
|
|
37
149
|
if has_failed
|
|
38
|
-
else long_press_on_wrapper.on_success_fn()
|
|
150
|
+
else long_press_on_wrapper.on_success_fn(final_selector_info)
|
|
39
151
|
)
|
|
40
152
|
|
|
41
153
|
tool_message = ToolMessage(
|
|
42
154
|
tool_call_id=tool_call_id,
|
|
43
155
|
content=agent_outcome,
|
|
44
|
-
additional_kwargs=
|
|
156
|
+
additional_kwargs=error_obj if has_failed else {},
|
|
45
157
|
status="error" if has_failed else "success",
|
|
46
158
|
)
|
|
47
159
|
return Command(
|
|
@@ -60,6 +172,9 @@ def get_long_press_on_tool(ctx: MobileUseContext):
|
|
|
60
172
|
|
|
61
173
|
long_press_on_wrapper = ToolWrapper(
|
|
62
174
|
tool_fn_getter=get_long_press_on_tool,
|
|
63
|
-
on_success_fn=lambda:
|
|
64
|
-
|
|
175
|
+
on_success_fn=lambda selector_info: (
|
|
176
|
+
f"Long press on element with {selector_info} was successful."
|
|
177
|
+
),
|
|
178
|
+
on_failure_fn=lambda selector_info: "Failed to long press on element. "
|
|
179
|
+
+ f"Last attempt was with {selector_info}.",
|
|
65
180
|
)
|
|
@@ -9,15 +9,14 @@ from pydantic import Field
|
|
|
9
9
|
|
|
10
10
|
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
11
11
|
from minitap.mobile_use.context import MobileUseContext
|
|
12
|
-
from minitap.mobile_use.controllers.mobile_command_controller import
|
|
12
|
+
from minitap.mobile_use.controllers.mobile_command_controller import swipe as swipe_controller
|
|
13
|
+
from minitap.mobile_use.controllers.types import (
|
|
13
14
|
CoordinatesSelectorRequest,
|
|
14
15
|
PercentagesSelectorRequest,
|
|
15
|
-
SwipeDirection,
|
|
16
16
|
SwipeRequest,
|
|
17
17
|
SwipeStartEndCoordinatesRequest,
|
|
18
18
|
SwipeStartEndPercentagesRequest,
|
|
19
19
|
)
|
|
20
|
-
from minitap.mobile_use.controllers.mobile_command_controller import swipe as swipe_controller
|
|
21
20
|
from minitap.mobile_use.graph.state import State
|
|
22
21
|
from minitap.mobile_use.tools.tool_wrapper import CompositeToolWrapper
|
|
23
22
|
|
|
@@ -123,29 +122,7 @@ def get_composite_swipe_tools(ctx: MobileUseContext) -> list[BaseTool]:
|
|
|
123
122
|
}
|
|
124
123
|
)
|
|
125
124
|
|
|
126
|
-
|
|
127
|
-
def swipe_direction(
|
|
128
|
-
agent_thought: str,
|
|
129
|
-
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
130
|
-
state: Annotated[State, InjectedState],
|
|
131
|
-
direction: SwipeDirection,
|
|
132
|
-
duration: int = Field(description="Duration in ms", ge=1, le=10000, default=400),
|
|
133
|
-
):
|
|
134
|
-
"""Swipe in a specific direction across the screen."""
|
|
135
|
-
swipe_request = SwipeRequest(
|
|
136
|
-
swipe_mode=direction,
|
|
137
|
-
duration=duration,
|
|
138
|
-
)
|
|
139
|
-
return get_swipe_tool(ctx=ctx).invoke(
|
|
140
|
-
input={
|
|
141
|
-
"tool_call_id": tool_call_id,
|
|
142
|
-
"state": state,
|
|
143
|
-
"agent_thought": agent_thought,
|
|
144
|
-
"swipe_request": swipe_request,
|
|
145
|
-
}
|
|
146
|
-
)
|
|
147
|
-
|
|
148
|
-
return [swipe_coordinates, swipe_percentages, swipe_direction]
|
|
125
|
+
return [swipe_coordinates, swipe_percentages]
|
|
149
126
|
|
|
150
127
|
|
|
151
128
|
swipe_wrapper = CompositeToolWrapper(
|
|
@@ -2,7 +2,7 @@ from typing import Annotated
|
|
|
2
2
|
|
|
3
3
|
from langchain_core.messages import ToolMessage
|
|
4
4
|
from langchain_core.tools import tool
|
|
5
|
-
from langchain_core.tools.base import InjectedToolCallId
|
|
5
|
+
from langchain_core.tools.base import BaseTool, InjectedToolCallId
|
|
6
6
|
from langgraph.prebuilt import InjectedState
|
|
7
7
|
from langgraph.types import Command
|
|
8
8
|
|
|
@@ -23,7 +23,7 @@ from minitap.mobile_use.utils.logger import get_logger
|
|
|
23
23
|
logger = get_logger(__name__)
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
def get_tap_tool(ctx: MobileUseContext):
|
|
26
|
+
def get_tap_tool(ctx: MobileUseContext) -> BaseTool:
|
|
27
27
|
@tool
|
|
28
28
|
async def tap(
|
|
29
29
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
@@ -41,67 +41,79 @@ def get_tap_tool(ctx: MobileUseContext):
|
|
|
41
41
|
output = {
|
|
42
42
|
"error": "No valid selector provided or all selectors failed."
|
|
43
43
|
} # Default to failure
|
|
44
|
-
|
|
44
|
+
latest_selector_info: str | None = None
|
|
45
45
|
|
|
46
|
-
# 1. Try with
|
|
47
|
-
if target.
|
|
46
|
+
# 1. Try with COORDINATES FIRST (visual approach)
|
|
47
|
+
if target.coordinates:
|
|
48
48
|
try:
|
|
49
|
-
|
|
49
|
+
center_point = target.coordinates.get_center()
|
|
50
|
+
selector = SelectorRequestWithCoordinates(
|
|
51
|
+
coordinates=CoordinatesSelectorRequest(x=center_point.x, y=center_point.y)
|
|
52
|
+
)
|
|
50
53
|
logger.info(
|
|
51
|
-
f"Attempting to tap using
|
|
52
|
-
f"at index {target.resource_id_index}"
|
|
54
|
+
f"Attempting to tap using coordinates: {center_point.x},{center_point.y}"
|
|
53
55
|
)
|
|
56
|
+
latest_selector_info = f"coordinates='{target.coordinates}'"
|
|
54
57
|
result = tap_controller(
|
|
55
|
-
ctx=ctx,
|
|
58
|
+
ctx=ctx,
|
|
59
|
+
selector_request=selector,
|
|
60
|
+
ui_hierarchy=state.latest_ui_hierarchy,
|
|
56
61
|
)
|
|
57
62
|
if result is None: # Success
|
|
58
63
|
output = None
|
|
59
|
-
final_selector_info = (
|
|
60
|
-
f"resource_id='{target.resource_id}' (index={target.resource_id_index})"
|
|
61
|
-
)
|
|
62
64
|
else:
|
|
63
65
|
logger.warning(
|
|
64
|
-
f"Tap with
|
|
66
|
+
f"Tap with coordinates '{target.coordinates}' failed. Error: {result}"
|
|
65
67
|
)
|
|
66
68
|
output = result
|
|
67
69
|
except Exception as e:
|
|
68
|
-
logger.warning(f"Exception during tap with
|
|
70
|
+
logger.warning(f"Exception during tap with coordinates '{target.coordinates}': {e}")
|
|
69
71
|
output = {"error": str(e)}
|
|
70
72
|
|
|
71
|
-
# 2. If
|
|
72
|
-
if output is not None and target.
|
|
73
|
+
# 2. If coordinates failed or weren't provided, try with resource_id
|
|
74
|
+
if output is not None and target.resource_id:
|
|
73
75
|
try:
|
|
74
|
-
|
|
75
|
-
selector = SelectorRequestWithCoordinates(
|
|
76
|
-
coordinates=CoordinatesSelectorRequest(x=center_point.x, y=center_point.y)
|
|
77
|
-
)
|
|
76
|
+
selector = IdSelectorRequest(id=target.resource_id)
|
|
78
77
|
logger.info(
|
|
79
|
-
f"Attempting to tap using
|
|
78
|
+
f"Attempting to tap using resource_id: '{target.resource_id}' "
|
|
79
|
+
f"at index {target.resource_id_index}"
|
|
80
|
+
)
|
|
81
|
+
latest_selector_info = (
|
|
82
|
+
f"resource_id='{target.resource_id}' (index={target.resource_id_index})"
|
|
83
|
+
)
|
|
84
|
+
result = tap_controller(
|
|
85
|
+
ctx=ctx,
|
|
86
|
+
selector_request=selector,
|
|
87
|
+
index=target.resource_id_index,
|
|
88
|
+
ui_hierarchy=state.latest_ui_hierarchy,
|
|
80
89
|
)
|
|
81
|
-
result = tap_controller(ctx=ctx, selector_request=selector)
|
|
82
90
|
if result is None: # Success
|
|
83
91
|
output = None
|
|
84
|
-
final_selector_info = f"coordinates='{target.coordinates}'"
|
|
85
92
|
else:
|
|
86
93
|
logger.warning(
|
|
87
|
-
f"Tap with
|
|
94
|
+
f"Tap with resource_id '{target.resource_id}' failed. Error: {result}"
|
|
88
95
|
)
|
|
89
96
|
output = result
|
|
90
97
|
except Exception as e:
|
|
91
|
-
logger.warning(f"Exception during tap with
|
|
98
|
+
logger.warning(f"Exception during tap with resource_id '{target.resource_id}': {e}")
|
|
92
99
|
output = {"error": str(e)}
|
|
93
100
|
|
|
94
|
-
# 3. If
|
|
101
|
+
# 3. If resource_id failed or wasn't provided, try with text (last resort)
|
|
95
102
|
if output is not None and target.text:
|
|
96
103
|
try:
|
|
97
104
|
selector = TextSelectorRequest(text=target.text)
|
|
98
105
|
logger.info(
|
|
99
106
|
f"Attempting to tap using text: '{target.text}' at index {target.text_index}"
|
|
100
107
|
)
|
|
101
|
-
|
|
108
|
+
latest_selector_info = f"text='{target.text}' (index={target.text_index})"
|
|
109
|
+
result = tap_controller(
|
|
110
|
+
ctx=ctx,
|
|
111
|
+
selector_request=selector,
|
|
112
|
+
index=target.text_index,
|
|
113
|
+
ui_hierarchy=state.latest_ui_hierarchy,
|
|
114
|
+
)
|
|
102
115
|
if result is None: # Success
|
|
103
116
|
output = None
|
|
104
|
-
final_selector_info = f"text='{target.text}' (index={target.text_index})"
|
|
105
117
|
else:
|
|
106
118
|
logger.warning(f"Tap with text '{target.text}' failed. Error: {result}")
|
|
107
119
|
output = result
|
|
@@ -110,6 +122,7 @@ def get_tap_tool(ctx: MobileUseContext):
|
|
|
110
122
|
output = {"error": str(e)}
|
|
111
123
|
|
|
112
124
|
has_failed = output is not None
|
|
125
|
+
final_selector_info = latest_selector_info if latest_selector_info else "N/A"
|
|
113
126
|
agent_outcome = (
|
|
114
127
|
tap_wrapper.on_failure_fn(final_selector_info)
|
|
115
128
|
if has_failed
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import Annotated
|
|
3
|
+
|
|
4
|
+
from langchain_core.messages import ToolMessage
|
|
5
|
+
from langchain_core.tools import tool
|
|
6
|
+
from langchain_core.tools.base import InjectedToolCallId
|
|
7
|
+
from langgraph.prebuilt import InjectedState
|
|
8
|
+
from langgraph.types import Command
|
|
9
|
+
|
|
10
|
+
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
11
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
12
|
+
from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
13
|
+
wait_for_delay as wait_for_delay_controller,
|
|
14
|
+
)
|
|
15
|
+
from minitap.mobile_use.graph.state import State
|
|
16
|
+
from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
|
|
17
|
+
|
|
18
|
+
MAX_DELAY_MS = 60000
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_wait_for_delay_tool(ctx: MobileUseContext):
|
|
22
|
+
@tool
|
|
23
|
+
async def wait_for_delay(
|
|
24
|
+
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
25
|
+
state: Annotated[State, InjectedState],
|
|
26
|
+
agent_thought: str,
|
|
27
|
+
time_in_ms: int,
|
|
28
|
+
) -> Command:
|
|
29
|
+
"""
|
|
30
|
+
Wait for a delay in milliseconds.
|
|
31
|
+
|
|
32
|
+
This tool pauses execution for a specified number of milliseconds.
|
|
33
|
+
Use this when you need to introduce a controlled delay to allow the UI
|
|
34
|
+
to update after an action, regardless of whether an animation is playing.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
time_in_ms: The number of milliseconds to wait. (capped at 60 seconds)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
Example:
|
|
41
|
+
- wait_for_delay with time_in_ms=1000 (waits 1 second)
|
|
42
|
+
- wait_for_delay with time_in_ms=500 (waits 0.5 seconds)
|
|
43
|
+
"""
|
|
44
|
+
if time_in_ms < 0:
|
|
45
|
+
time_in_ms = 1000
|
|
46
|
+
if time_in_ms > MAX_DELAY_MS:
|
|
47
|
+
time_in_ms = MAX_DELAY_MS
|
|
48
|
+
try:
|
|
49
|
+
await asyncio.to_thread(wait_for_delay_controller, time_in_ms)
|
|
50
|
+
output = None
|
|
51
|
+
has_failed = False
|
|
52
|
+
except Exception as e:
|
|
53
|
+
output = str(e)
|
|
54
|
+
has_failed = True
|
|
55
|
+
agent_outcome = (
|
|
56
|
+
wait_for_delay_wrapper.on_failure_fn()
|
|
57
|
+
if has_failed
|
|
58
|
+
else wait_for_delay_wrapper.on_success_fn(time_in_ms)
|
|
59
|
+
)
|
|
60
|
+
tool_message = ToolMessage(
|
|
61
|
+
tool_call_id=tool_call_id,
|
|
62
|
+
content=agent_outcome,
|
|
63
|
+
additional_kwargs={"error": output} if has_failed else {},
|
|
64
|
+
status="error" if has_failed else "success",
|
|
65
|
+
)
|
|
66
|
+
return Command(
|
|
67
|
+
update=await state.asanitize_update(
|
|
68
|
+
ctx=ctx,
|
|
69
|
+
update={
|
|
70
|
+
"agents_thoughts": [agent_thought, agent_outcome],
|
|
71
|
+
EXECUTOR_MESSAGES_KEY: [tool_message],
|
|
72
|
+
},
|
|
73
|
+
agent="executor",
|
|
74
|
+
),
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
return wait_for_delay
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
wait_for_delay_wrapper = ToolWrapper(
|
|
81
|
+
tool_fn_getter=get_wait_for_delay_tool,
|
|
82
|
+
on_success_fn=lambda delay: f"Successfully waited for {delay} milliseconds.",
|
|
83
|
+
on_failure_fn=lambda: "Failed to wait for delay.",
|
|
84
|
+
)
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import sys
|
|
2
2
|
|
|
3
|
-
from minitap.mobile_use.clients.ios_client import get_ios_devices
|
|
4
3
|
from adbutils import AdbClient
|
|
5
4
|
from rich.console import Console
|
|
6
5
|
|
|
6
|
+
from minitap.mobile_use.clients.ios_client import get_ios_devices
|
|
7
|
+
|
|
7
8
|
|
|
8
9
|
def display_device_status(console: Console, adb_client: AdbClient | None = None):
|
|
9
10
|
"""Checks for connected devices and displays the status."""
|
|
@@ -17,21 +18,24 @@ def display_device_status(console: Console, adb_client: AdbClient | None = None)
|
|
|
17
18
|
console.print(f" - {device.serial}")
|
|
18
19
|
else:
|
|
19
20
|
console.print("❌ [bold red]No Android device found.[/bold red]")
|
|
20
|
-
console.print("Please make sure your emulator is running or a device is connected via USB.")
|
|
21
21
|
command = "emulator -avd <avd_name>"
|
|
22
22
|
if sys.platform not in ["win32", "darwin"]:
|
|
23
23
|
command = f"./{command}"
|
|
24
|
-
|
|
25
|
-
|
|
24
|
+
console.print(
|
|
25
|
+
f"You can start an emulator using a command like: [bold]'{command}'[/bold]"
|
|
26
|
+
)
|
|
26
27
|
|
|
27
28
|
xcrun_available, ios_devices, error_message = get_ios_devices()
|
|
28
29
|
if xcrun_available:
|
|
29
30
|
if ios_devices:
|
|
30
31
|
console.print("✅ [bold green]iOS device(s) connected:[/bold green]")
|
|
31
32
|
for device in ios_devices:
|
|
32
|
-
console.print(f" - {device}")
|
|
33
|
+
console.print(f" - [green]{device}[/green]")
|
|
33
34
|
else:
|
|
34
|
-
console.print(
|
|
35
|
+
console.print(
|
|
36
|
+
"❌ [bold red]No iOS device found. We only support iOS simulators for now."
|
|
37
|
+
"[/bold red]"
|
|
38
|
+
)
|
|
35
39
|
console.print(
|
|
36
40
|
"[iOS] Please make sure your emulator is running or a device is connected via USB."
|
|
37
41
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: minitap-mobile-use
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.6.0
|
|
4
4
|
Summary: AI-powered multi-agent system that automates real Android and iOS devices through low-level control using LangGraph.
|
|
5
5
|
Author: Pierre-Louis Favreau, Jean-Pierre Lo, Nicolas Dehandschoewercker
|
|
6
6
|
License: MIT License
|