minitap-mobile-use 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of minitap-mobile-use might be problematic. Click here for more details.
- minitap/mobile_use/agents/contextor/contextor.py +4 -2
- minitap/mobile_use/agents/cortex/cortex.md +71 -25
- minitap/mobile_use/agents/cortex/cortex.py +0 -1
- minitap/mobile_use/agents/executor/executor.md +6 -4
- minitap/mobile_use/agents/executor/utils.py +2 -1
- minitap/mobile_use/agents/outputter/test_outputter.py +104 -42
- minitap/mobile_use/sdk/agent.py +16 -6
- minitap/mobile_use/sdk/types/exceptions.py +30 -0
- minitap/mobile_use/servers/device_hardware_bridge.py +2 -1
- minitap/mobile_use/servers/utils.py +6 -9
- minitap/mobile_use/tools/index.py +2 -2
- minitap/mobile_use/tools/mobile/clear_text.py +73 -25
- minitap/mobile_use/tools/mobile/copy_text_from.py +7 -5
- minitap/mobile_use/tools/mobile/{take_screenshot.py → glimpse_screen.py} +15 -11
- minitap/mobile_use/tools/mobile/input_text.py +94 -13
- minitap/mobile_use/tools/mobile/paste_text.py +34 -8
- minitap/mobile_use/tools/test_utils.py +351 -0
- minitap/mobile_use/tools/utils.py +147 -40
- minitap/mobile_use/utils/test_ui_hierarchy.py +178 -0
- minitap/mobile_use/utils/ui_hierarchy.py +2 -2
- {minitap_mobile_use-2.2.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/METADATA +4 -4
- {minitap_mobile_use-2.2.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/RECORD +24 -22
- {minitap_mobile_use-2.2.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/WHEEL +0 -0
- {minitap_mobile_use-2.2.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/entry_points.txt +0 -0
|
@@ -23,6 +23,7 @@ from minitap.mobile_use.tools.utils import (
|
|
|
23
23
|
)
|
|
24
24
|
from minitap.mobile_use.utils.logger import get_logger
|
|
25
25
|
from minitap.mobile_use.utils.ui_hierarchy import (
|
|
26
|
+
ElementBounds,
|
|
26
27
|
find_element_by_resource_id,
|
|
27
28
|
get_element_text,
|
|
28
29
|
text_input_is_empty,
|
|
@@ -50,16 +51,20 @@ class TextClearer:
|
|
|
50
51
|
screen_data = get_screen_data(screen_api_client=self.ctx.screen_api_client)
|
|
51
52
|
self.state.latest_ui_hierarchy = screen_data.elements
|
|
52
53
|
|
|
53
|
-
def _get_element_info(
|
|
54
|
+
def _get_element_info(
|
|
55
|
+
self, resource_id: str | None
|
|
56
|
+
) -> tuple[object | None, str | None, str | None]:
|
|
54
57
|
if not self.state.latest_ui_hierarchy:
|
|
55
58
|
self._refresh_ui_hierarchy()
|
|
56
59
|
|
|
57
60
|
if not self.state.latest_ui_hierarchy:
|
|
58
61
|
return None, None, None
|
|
59
62
|
|
|
60
|
-
element =
|
|
61
|
-
|
|
62
|
-
|
|
63
|
+
element = None
|
|
64
|
+
if resource_id:
|
|
65
|
+
element = find_element_by_resource_id(
|
|
66
|
+
ui_hierarchy=self.state.latest_ui_hierarchy, resource_id=resource_id
|
|
67
|
+
)
|
|
63
68
|
|
|
64
69
|
if not element:
|
|
65
70
|
return None, None, None
|
|
@@ -83,11 +88,27 @@ class TextClearer:
|
|
|
83
88
|
def _should_clear_text(self, current_text: str | None, hint_text: str | None) -> bool:
|
|
84
89
|
return current_text is not None and current_text != "" and current_text != hint_text
|
|
85
90
|
|
|
86
|
-
def _prepare_element_for_clearing(
|
|
87
|
-
|
|
91
|
+
def _prepare_element_for_clearing(
    self,
    text_input_resource_id: str | None,
    text_input_coordinates: ElementBounds | None,
    text_input_text: str | None,
) -> bool:
    """Focus the target input and, on success, move the cursor to its end.

    Args:
        text_input_resource_id: Resource id of the text input, if known.
        text_input_coordinates: Bounds of the text input, if known.
        text_input_text: Current text of the text input, if known.

    Returns:
        ``False`` when ``focus_element_if_needed`` reports a falsy result
        (the cursor is then left untouched), ``True`` otherwise.
    """
    is_focused = focus_element_if_needed(
        ctx=self.ctx,
        input_resource_id=text_input_resource_id,
        input_coordinates=text_input_coordinates,
        input_text=text_input_text,
    )
    if is_focused:
        # Only reposition the cursor once the element is known to be focused.
        move_cursor_to_end_if_bounds(
            ctx=self.ctx,
            state=self.state,
            text_input_resource_id=text_input_resource_id,
            text_input_coordinates=text_input_coordinates,
            text_input_text=text_input_text,
        )
        return True
    return False
|
|
92
113
|
|
|
93
114
|
def _erase_text_attempt(self, text_length: int) -> str | None:
|
|
@@ -102,7 +123,12 @@ class TextClearer:
|
|
|
102
123
|
return None
|
|
103
124
|
|
|
104
125
|
def _clear_with_retries(
|
|
105
|
-
self,
|
|
126
|
+
self,
|
|
127
|
+
text_input_resource_id: str | None,
|
|
128
|
+
text_input_coordinates: ElementBounds | None,
|
|
129
|
+
text_input_text: str | None,
|
|
130
|
+
initial_text: str,
|
|
131
|
+
hint_text: str | None,
|
|
106
132
|
) -> tuple[bool, str | None, int]:
|
|
107
133
|
current_text = initial_text
|
|
108
134
|
erased_chars = 0
|
|
@@ -118,18 +144,25 @@ class TextClearer:
|
|
|
118
144
|
erased_chars += chars_to_erase
|
|
119
145
|
|
|
120
146
|
self._refresh_ui_hierarchy()
|
|
121
|
-
elt =
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
147
|
+
elt = None
|
|
148
|
+
if text_input_resource_id:
|
|
149
|
+
elt = find_element_by_resource_id(
|
|
150
|
+
ui_hierarchy=self.state.latest_ui_hierarchy or [],
|
|
151
|
+
resource_id=text_input_resource_id,
|
|
152
|
+
)
|
|
153
|
+
if elt:
|
|
154
|
+
current_text = get_element_text(elt)
|
|
155
|
+
logger.info(f"Current text: {current_text}")
|
|
156
|
+
if text_input_is_empty(text=current_text, hint_text=hint_text):
|
|
157
|
+
break
|
|
130
158
|
|
|
131
159
|
move_cursor_to_end_if_bounds(
|
|
132
|
-
ctx=self.ctx,
|
|
160
|
+
ctx=self.ctx,
|
|
161
|
+
state=self.state,
|
|
162
|
+
text_input_resource_id=text_input_resource_id,
|
|
163
|
+
text_input_coordinates=text_input_coordinates,
|
|
164
|
+
text_input_text=text_input_text,
|
|
165
|
+
elt=elt,
|
|
133
166
|
)
|
|
134
167
|
|
|
135
168
|
return True, current_text, erased_chars
|
|
@@ -162,7 +195,9 @@ class TextClearer:
|
|
|
162
195
|
hint_text=hint_text,
|
|
163
196
|
)
|
|
164
197
|
|
|
165
|
-
def _handle_element_not_found(
|
|
198
|
+
def _handle_element_not_found(
|
|
199
|
+
self, resource_id: str | None, hint_text: str | None
|
|
200
|
+
) -> ClearTextResult:
|
|
166
201
|
error = erase_text_controller(ctx=self.ctx)
|
|
167
202
|
self._refresh_ui_hierarchy()
|
|
168
203
|
|
|
@@ -176,16 +211,23 @@ class TextClearer:
|
|
|
176
211
|
hint_text=hint_text,
|
|
177
212
|
)
|
|
178
213
|
|
|
179
|
-
def
|
|
180
|
-
|
|
214
|
+
def clear_input_text(
|
|
215
|
+
self,
|
|
216
|
+
text_input_resource_id: str | None,
|
|
217
|
+
text_input_coordinates: ElementBounds | None,
|
|
218
|
+
text_input_text: str | None,
|
|
219
|
+
) -> ClearTextResult:
|
|
220
|
+
element, current_text, hint_text = self._get_element_info(text_input_resource_id)
|
|
181
221
|
|
|
182
222
|
if not element:
|
|
183
|
-
return self._handle_element_not_found(
|
|
223
|
+
return self._handle_element_not_found(text_input_resource_id, hint_text)
|
|
184
224
|
|
|
185
225
|
if not self._should_clear_text(current_text, hint_text):
|
|
186
226
|
return self._handle_no_clearing_needed(current_text, hint_text)
|
|
187
227
|
|
|
188
|
-
if not self._prepare_element_for_clearing(
|
|
228
|
+
if not self._prepare_element_for_clearing(
|
|
229
|
+
text_input_resource_id, text_input_coordinates, text_input_text
|
|
230
|
+
):
|
|
189
231
|
return self._create_result(
|
|
190
232
|
success=False,
|
|
191
233
|
error_message="Failed to focus element",
|
|
@@ -195,7 +237,9 @@ class TextClearer:
|
|
|
195
237
|
)
|
|
196
238
|
|
|
197
239
|
success, final_text, chars_erased = self._clear_with_retries(
|
|
198
|
-
|
|
240
|
+
text_input_resource_id=text_input_resource_id,
|
|
241
|
+
text_input_coordinates=text_input_coordinates,
|
|
242
|
+
text_input_text=text_input_text,
|
|
199
243
|
initial_text=current_text or "",
|
|
200
244
|
hint_text=hint_text,
|
|
201
245
|
)
|
|
@@ -218,12 +262,16 @@ def get_clear_text_tool(ctx: MobileUseContext):
|
|
|
218
262
|
state: Annotated[State, InjectedState],
|
|
219
263
|
agent_thought: str,
|
|
220
264
|
text_input_resource_id: str,
|
|
265
|
+
text_input_coordinates: ElementBounds | None,
|
|
266
|
+
text_input_text: str | None,
|
|
221
267
|
):
|
|
222
268
|
"""
|
|
223
269
|
Clears all the text from the text field, by focusing it if needed.
|
|
224
270
|
"""
|
|
225
271
|
clearer = TextClearer(ctx, state)
|
|
226
|
-
result = clearer.
|
|
272
|
+
result = clearer.clear_input_text(
|
|
273
|
+
text_input_resource_id, text_input_coordinates, text_input_text
|
|
274
|
+
)
|
|
227
275
|
|
|
228
276
|
content = (
|
|
229
277
|
clear_text_wrapper.on_failure_fn(result.error_message)
|
|
@@ -1,18 +1,20 @@
|
|
|
1
|
+
from typing import Annotated
|
|
2
|
+
|
|
1
3
|
from langchain_core.messages import ToolMessage
|
|
2
4
|
from langchain_core.tools import tool
|
|
3
5
|
from langchain_core.tools.base import InjectedToolCallId
|
|
6
|
+
from langgraph.prebuilt import InjectedState
|
|
4
7
|
from langgraph.types import Command
|
|
8
|
+
from pydantic import Field
|
|
9
|
+
|
|
5
10
|
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
11
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
6
12
|
from minitap.mobile_use.controllers.mobile_command_controller import SelectorRequest
|
|
7
13
|
from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
8
14
|
copy_text_from as copy_text_from_controller,
|
|
9
15
|
)
|
|
10
|
-
from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
|
|
11
|
-
from pydantic import Field
|
|
12
|
-
from typing import Annotated
|
|
13
|
-
from minitap.mobile_use.context import MobileUseContext
|
|
14
16
|
from minitap.mobile_use.graph.state import State
|
|
15
|
-
from
|
|
17
|
+
from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
def get_copy_text_from_tool(ctx: MobileUseContext):
|
|
@@ -1,8 +1,11 @@
|
|
|
1
|
+
from typing import Annotated
|
|
2
|
+
|
|
1
3
|
from langchain_core.messages import ToolMessage
|
|
2
4
|
from langchain_core.tools import tool
|
|
3
5
|
from langchain_core.tools.base import InjectedToolCallId
|
|
4
6
|
from langgraph.prebuilt import InjectedState
|
|
5
7
|
from langgraph.types import Command
|
|
8
|
+
|
|
6
9
|
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
7
10
|
from minitap.mobile_use.context import MobileUseContext
|
|
8
11
|
from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
@@ -11,18 +14,18 @@ from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
|
11
14
|
from minitap.mobile_use.graph.state import State
|
|
12
15
|
from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
|
|
13
16
|
from minitap.mobile_use.utils.media import compress_base64_jpeg
|
|
14
|
-
from typing import Annotated
|
|
15
17
|
|
|
16
18
|
|
|
17
|
-
def
|
|
19
|
+
def get_glimpse_screen_tool(ctx: MobileUseContext):
|
|
18
20
|
@tool
|
|
19
|
-
def
|
|
21
|
+
def glimpse_screen(
|
|
20
22
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
21
23
|
state: Annotated[State, InjectedState],
|
|
22
24
|
agent_thought: str,
|
|
23
25
|
):
|
|
24
26
|
"""
|
|
25
|
-
|
|
27
|
+
Captures the current screen as an image.
|
|
28
|
+
The resulting screenshot is added to the context for the next reasoning step.
|
|
26
29
|
"""
|
|
27
30
|
compressed_image_base64 = None
|
|
28
31
|
has_failed = False
|
|
@@ -36,9 +39,9 @@ def get_take_screenshot_tool(ctx: MobileUseContext):
|
|
|
36
39
|
|
|
37
40
|
tool_message = ToolMessage(
|
|
38
41
|
tool_call_id=tool_call_id,
|
|
39
|
-
content=
|
|
42
|
+
content=glimpse_screen_wrapper.on_failure_fn()
|
|
40
43
|
if has_failed
|
|
41
|
-
else
|
|
44
|
+
else glimpse_screen_wrapper.on_success_fn(),
|
|
42
45
|
additional_kwargs={"error": output} if has_failed else {},
|
|
43
46
|
status="error" if has_failed else "success",
|
|
44
47
|
)
|
|
@@ -56,11 +59,12 @@ def get_take_screenshot_tool(ctx: MobileUseContext):
|
|
|
56
59
|
),
|
|
57
60
|
)
|
|
58
61
|
|
|
59
|
-
return
|
|
62
|
+
return glimpse_screen
|
|
60
63
|
|
|
61
64
|
|
|
62
|
-
|
|
63
|
-
tool_fn_getter=
|
|
64
|
-
on_success_fn=lambda: "
|
|
65
|
-
|
|
65
|
+
# Wrapper pairing the glimpse_screen tool factory with the messages returned
# to the agent on success and on failure.
glimpse_screen_wrapper = ToolWrapper(
    tool_fn_getter=get_glimpse_screen_tool,
    # The trailing space after the first sentence keeps the two sentences from
    # running together ("...successfully.It is now...").
    on_success_fn=lambda: (
        "Visual context captured successfully. "
        "It is now available for immediate analysis."
    ),
    on_failure_fn=lambda: "Failed to capture visual context.",
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Literal
|
|
3
|
+
from typing import Annotated, Literal
|
|
4
4
|
|
|
5
5
|
from langchain_core.messages import ToolMessage
|
|
6
6
|
from langchain_core.tools import tool
|
|
@@ -8,10 +8,12 @@ from langchain_core.tools.base import InjectedToolCallId
|
|
|
8
8
|
from langgraph.prebuilt import InjectedState
|
|
9
9
|
from langgraph.types import Command
|
|
10
10
|
from pydantic import BaseModel
|
|
11
|
-
from typing import Annotated
|
|
12
11
|
|
|
13
12
|
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
14
13
|
from minitap.mobile_use.context import MobileUseContext
|
|
14
|
+
from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
15
|
+
get_screen_data,
|
|
16
|
+
)
|
|
15
17
|
from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
16
18
|
input_text as input_text_controller,
|
|
17
19
|
)
|
|
@@ -19,6 +21,11 @@ from minitap.mobile_use.graph.state import State
|
|
|
19
21
|
from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
|
|
20
22
|
from minitap.mobile_use.tools.utils import focus_element_if_needed, move_cursor_to_end_if_bounds
|
|
21
23
|
from minitap.mobile_use.utils.logger import get_logger
|
|
24
|
+
from minitap.mobile_use.utils.ui_hierarchy import (
|
|
25
|
+
ElementBounds,
|
|
26
|
+
find_element_by_resource_id,
|
|
27
|
+
get_element_text,
|
|
28
|
+
)
|
|
22
29
|
|
|
23
30
|
logger = get_logger(__name__)
|
|
24
31
|
|
|
@@ -47,7 +54,9 @@ def get_input_text_tool(ctx: MobileUseContext):
|
|
|
47
54
|
state: Annotated[State, InjectedState],
|
|
48
55
|
agent_thought: str,
|
|
49
56
|
text: str,
|
|
50
|
-
text_input_resource_id: str,
|
|
57
|
+
text_input_resource_id: str | None,
|
|
58
|
+
text_input_coordinates: ElementBounds | None,
|
|
59
|
+
text_input_text: str | None,
|
|
51
60
|
):
|
|
52
61
|
"""
|
|
53
62
|
Focus a text field and type text into it.
|
|
@@ -55,23 +64,83 @@ def get_input_text_tool(ctx: MobileUseContext):
|
|
|
55
64
|
- Ensure the corresponding element is focused (tap if necessary).
|
|
56
65
|
- If bounds are available, tap near the end to place the cursor at the end.
|
|
57
66
|
- Type the provided `text` using the controller.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
tool_call_id: The ID of the tool call.
|
|
70
|
+
state: The state of the agent.
|
|
71
|
+
agent_thought: The thought of the agent.
|
|
72
|
+
text: The text to type.
|
|
73
|
+
text_input_resource_id: The resource ID of the text input (if available).
|
|
74
|
+
text_input_coordinates: The bounds (ElementBounds) of the text input (if available).
|
|
75
|
+
text_input_text: The current text content of the text input (if available).
|
|
58
76
|
"""
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
77
|
+
|
|
78
|
+
focused = focus_element_if_needed(
|
|
79
|
+
ctx=ctx,
|
|
80
|
+
input_resource_id=text_input_resource_id,
|
|
81
|
+
input_coordinates=text_input_coordinates,
|
|
82
|
+
input_text=text_input_text,
|
|
83
|
+
)
|
|
84
|
+
if not focused:
|
|
85
|
+
error_message = "Failed to focus the text input element before typing."
|
|
86
|
+
tool_message = ToolMessage(
|
|
87
|
+
tool_call_id=tool_call_id,
|
|
88
|
+
content=input_text_wrapper.on_failure_fn(text, error_message),
|
|
89
|
+
additional_kwargs={"error": error_message},
|
|
90
|
+
status="error",
|
|
91
|
+
)
|
|
92
|
+
return Command(
|
|
93
|
+
update=state.sanitize_update(
|
|
94
|
+
ctx=ctx,
|
|
95
|
+
update={
|
|
96
|
+
"agents_thoughts": [agent_thought, error_message],
|
|
97
|
+
EXECUTOR_MESSAGES_KEY: [tool_message],
|
|
98
|
+
},
|
|
99
|
+
agent="executor",
|
|
100
|
+
),
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
move_cursor_to_end_if_bounds(
|
|
104
|
+
ctx=ctx,
|
|
105
|
+
state=state,
|
|
106
|
+
text_input_resource_id=text_input_resource_id,
|
|
107
|
+
text_input_coordinates=text_input_coordinates,
|
|
108
|
+
text_input_text=text_input_text,
|
|
109
|
+
)
|
|
62
110
|
|
|
63
111
|
result = _controller_input_text(ctx=ctx, text=text)
|
|
64
112
|
|
|
65
113
|
status: Literal["success", "error"] = "success" if result.ok else "error"
|
|
66
|
-
|
|
67
|
-
|
|
114
|
+
|
|
115
|
+
text_input_content = ""
|
|
116
|
+
if status == "success":
|
|
117
|
+
if text_input_resource_id is not None:
|
|
118
|
+
# Verification phase for elements with resource_id
|
|
119
|
+
screen_data = get_screen_data(screen_api_client=ctx.screen_api_client)
|
|
120
|
+
state.latest_ui_hierarchy = screen_data.elements
|
|
121
|
+
|
|
122
|
+
element = find_element_by_resource_id(
|
|
123
|
+
ui_hierarchy=state.latest_ui_hierarchy, resource_id=text_input_resource_id
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
if not element:
|
|
127
|
+
result = InputResult(ok=False, error="Element not found")
|
|
128
|
+
|
|
129
|
+
if element:
|
|
130
|
+
text_input_content = get_element_text(element)
|
|
131
|
+
else:
|
|
132
|
+
# For elements without resource_id, skip verification and use direct message
|
|
133
|
+
pass
|
|
134
|
+
|
|
135
|
+
agent_outcome = (
|
|
136
|
+
input_text_wrapper.on_success_fn(text, text_input_content, text_input_resource_id)
|
|
68
137
|
if result.ok
|
|
69
|
-
else input_text_wrapper.on_failure_fn(text)
|
|
138
|
+
else input_text_wrapper.on_failure_fn(text, result.error)
|
|
70
139
|
)
|
|
71
140
|
|
|
72
141
|
tool_message = ToolMessage(
|
|
73
142
|
tool_call_id=tool_call_id,
|
|
74
|
-
content=
|
|
143
|
+
content=agent_outcome,
|
|
75
144
|
additional_kwargs={"error": result.error} if not result.ok else {},
|
|
76
145
|
status=status,
|
|
77
146
|
)
|
|
@@ -80,7 +149,7 @@ def get_input_text_tool(ctx: MobileUseContext):
|
|
|
80
149
|
update=state.sanitize_update(
|
|
81
150
|
ctx=ctx,
|
|
82
151
|
update={
|
|
83
|
-
"agents_thoughts": [agent_thought],
|
|
152
|
+
"agents_thoughts": [agent_thought, agent_outcome],
|
|
84
153
|
EXECUTOR_MESSAGES_KEY: [tool_message],
|
|
85
154
|
},
|
|
86
155
|
agent="executor",
|
|
@@ -90,8 +159,20 @@ def get_input_text_tool(ctx: MobileUseContext):
|
|
|
90
159
|
return input_text
|
|
91
160
|
|
|
92
161
|
|
|
162
|
+
def _on_input_success(text, text_input_content, text_input_resource_id):
|
|
163
|
+
"""Success message handler for input text operations."""
|
|
164
|
+
if text_input_resource_id is not None:
|
|
165
|
+
return (
|
|
166
|
+
f"Typed {repr(text)}.\n"
|
|
167
|
+
f"Here is the whole content of input with id {repr(text_input_resource_id)}: "
|
|
168
|
+
f"{repr(text_input_content)}"
|
|
169
|
+
)
|
|
170
|
+
else:
|
|
171
|
+
return "Typed text, should now verify before moving forward"
|
|
172
|
+
|
|
173
|
+
|
|
93
174
|
def _on_input_failure(text, error):
    """Failure message handler for input text operations."""
    return f"Failed to input text {repr(text)}. Reason: {error}"


# Wrapper pairing the input_text tool factory with its success and failure
# message builders.
input_text_wrapper = ToolWrapper(
    tool_fn_getter=get_input_text_tool,
    on_success_fn=_on_input_success,
    on_failure_fn=_on_input_failure,
)
|
|
@@ -1,16 +1,22 @@
|
|
|
1
|
+
from typing import Annotated
|
|
2
|
+
|
|
1
3
|
from langchain_core.messages import ToolMessage
|
|
2
4
|
from langchain_core.tools import tool
|
|
3
5
|
from langchain_core.tools.base import InjectedToolCallId
|
|
6
|
+
from langgraph.prebuilt import InjectedState
|
|
4
7
|
from langgraph.types import Command
|
|
8
|
+
|
|
5
9
|
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
6
10
|
from minitap.mobile_use.context import MobileUseContext
|
|
11
|
+
from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
12
|
+
get_screen_data,
|
|
13
|
+
)
|
|
7
14
|
from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
8
15
|
paste_text as paste_text_controller,
|
|
9
16
|
)
|
|
10
17
|
from minitap.mobile_use.graph.state import State
|
|
11
|
-
from langgraph.prebuilt import InjectedState
|
|
12
18
|
from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
|
|
13
|
-
from
|
|
19
|
+
from minitap.mobile_use.utils.ui_hierarchy import find_element_by_resource_id, get_element_text
|
|
14
20
|
|
|
15
21
|
|
|
16
22
|
def get_paste_text_tool(ctx: MobileUseContext):
|
|
@@ -19,6 +25,7 @@ def get_paste_text_tool(ctx: MobileUseContext):
|
|
|
19
25
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
20
26
|
state: Annotated[State, InjectedState],
|
|
21
27
|
agent_thought: str,
|
|
28
|
+
focused_element_resource_id: str,
|
|
22
29
|
):
|
|
23
30
|
"""
|
|
24
31
|
Pastes text previously copied via `copyTextFrom` into the currently focused field.
|
|
@@ -32,12 +39,29 @@ def get_paste_text_tool(ctx: MobileUseContext):
|
|
|
32
39
|
- pasteText
|
|
33
40
|
"""
|
|
34
41
|
output = paste_text_controller(ctx=ctx)
|
|
42
|
+
|
|
43
|
+
text_input_content = ""
|
|
44
|
+
screen_data = get_screen_data(screen_api_client=ctx.screen_api_client)
|
|
45
|
+
state.latest_ui_hierarchy = screen_data.elements
|
|
46
|
+
|
|
47
|
+
element = find_element_by_resource_id(
|
|
48
|
+
ui_hierarchy=state.latest_ui_hierarchy, resource_id=focused_element_resource_id
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
if element:
|
|
52
|
+
text_input_content = get_element_text(element)
|
|
53
|
+
|
|
35
54
|
has_failed = output is not None
|
|
55
|
+
|
|
56
|
+
agent_outcome = (
|
|
57
|
+
paste_text_wrapper.on_success_fn(text_input_content)
|
|
58
|
+
if not has_failed
|
|
59
|
+
else paste_text_wrapper.on_failure_fn(text_input_content)
|
|
60
|
+
)
|
|
61
|
+
|
|
36
62
|
tool_message = ToolMessage(
|
|
37
63
|
tool_call_id=tool_call_id,
|
|
38
|
-
content=
|
|
39
|
-
if has_failed
|
|
40
|
-
else paste_text_wrapper.on_success_fn(),
|
|
64
|
+
content=agent_outcome,
|
|
41
65
|
additional_kwargs={"error": output} if has_failed else {},
|
|
42
66
|
status="error" if has_failed else "success",
|
|
43
67
|
)
|
|
@@ -45,7 +69,7 @@ def get_paste_text_tool(ctx: MobileUseContext):
|
|
|
45
69
|
update=state.sanitize_update(
|
|
46
70
|
ctx=ctx,
|
|
47
71
|
update={
|
|
48
|
-
"agents_thoughts": [agent_thought],
|
|
72
|
+
"agents_thoughts": [agent_thought, agent_outcome],
|
|
49
73
|
EXECUTOR_MESSAGES_KEY: [tool_message],
|
|
50
74
|
},
|
|
51
75
|
agent="executor",
|
|
@@ -57,6 +81,8 @@ def get_paste_text_tool(ctx: MobileUseContext):
|
|
|
57
81
|
|
|
58
82
|
# Wrapper pairing the paste_text tool factory with the messages returned to the
# agent. Both messages embed the content read back from the text field.
paste_text_wrapper = ToolWrapper(
    tool_fn_getter=get_paste_text_tool,
    # The explicit trailing spaces keep the concatenated fragments from running
    # together ("actualcontent", "text.Here").
    on_success_fn=lambda input_content: (
        "Text pasted successfully. Here is the actual "
        f"content of the text field : {repr(input_content)}"
    ),
    on_failure_fn=lambda input_content: (
        "Failed to paste text. "
        f"Here is the actual content of the text field : {repr(input_content)}"
    ),
)
|