minitap-mobile-use: 2.2.0 (py3-none-any.whl) → 2.3.0 (py3-none-any.whl)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of minitap-mobile-use might be problematic. (The original page links to more details.)

@@ -23,6 +23,7 @@ from minitap.mobile_use.tools.utils import (
23
23
  )
24
24
  from minitap.mobile_use.utils.logger import get_logger
25
25
  from minitap.mobile_use.utils.ui_hierarchy import (
26
+ ElementBounds,
26
27
  find_element_by_resource_id,
27
28
  get_element_text,
28
29
  text_input_is_empty,
@@ -50,16 +51,20 @@ class TextClearer:
50
51
  screen_data = get_screen_data(screen_api_client=self.ctx.screen_api_client)
51
52
  self.state.latest_ui_hierarchy = screen_data.elements
52
53
 
53
- def _get_element_info(self, resource_id: str) -> tuple[object | None, str | None, str | None]:
54
+ def _get_element_info(
55
+ self, resource_id: str | None
56
+ ) -> tuple[object | None, str | None, str | None]:
54
57
  if not self.state.latest_ui_hierarchy:
55
58
  self._refresh_ui_hierarchy()
56
59
 
57
60
  if not self.state.latest_ui_hierarchy:
58
61
  return None, None, None
59
62
 
60
- element = find_element_by_resource_id(
61
- ui_hierarchy=self.state.latest_ui_hierarchy, resource_id=resource_id
62
- )
63
+ element = None
64
+ if resource_id:
65
+ element = find_element_by_resource_id(
66
+ ui_hierarchy=self.state.latest_ui_hierarchy, resource_id=resource_id
67
+ )
63
68
 
64
69
  if not element:
65
70
  return None, None, None
@@ -83,11 +88,27 @@ class TextClearer:
83
88
  def _should_clear_text(self, current_text: str | None, hint_text: str | None) -> bool:
84
89
  return current_text is not None and current_text != "" and current_text != hint_text
85
90
 
86
- def _prepare_element_for_clearing(self, resource_id: str) -> bool:
87
- if not focus_element_if_needed(ctx=self.ctx, resource_id=resource_id):
91
+ def _prepare_element_for_clearing(
92
+ self,
93
+ text_input_resource_id: str | None,
94
+ text_input_coordinates: ElementBounds | None,
95
+ text_input_text: str | None,
96
+ ) -> bool:
97
+ if not focus_element_if_needed(
98
+ ctx=self.ctx,
99
+ input_resource_id=text_input_resource_id,
100
+ input_coordinates=text_input_coordinates,
101
+ input_text=text_input_text,
102
+ ):
88
103
  return False
89
104
 
90
- move_cursor_to_end_if_bounds(ctx=self.ctx, state=self.state, resource_id=resource_id)
105
+ move_cursor_to_end_if_bounds(
106
+ ctx=self.ctx,
107
+ state=self.state,
108
+ text_input_resource_id=text_input_resource_id,
109
+ text_input_coordinates=text_input_coordinates,
110
+ text_input_text=text_input_text,
111
+ )
91
112
  return True
92
113
 
93
114
  def _erase_text_attempt(self, text_length: int) -> str | None:
@@ -102,7 +123,12 @@ class TextClearer:
102
123
  return None
103
124
 
104
125
  def _clear_with_retries(
105
- self, resource_id: str, initial_text: str, hint_text: str | None
126
+ self,
127
+ text_input_resource_id: str | None,
128
+ text_input_coordinates: ElementBounds | None,
129
+ text_input_text: str | None,
130
+ initial_text: str,
131
+ hint_text: str | None,
106
132
  ) -> tuple[bool, str | None, int]:
107
133
  current_text = initial_text
108
134
  erased_chars = 0
@@ -118,18 +144,25 @@ class TextClearer:
118
144
  erased_chars += chars_to_erase
119
145
 
120
146
  self._refresh_ui_hierarchy()
121
- elt = find_element_by_resource_id(
122
- ui_hierarchy=self.state.latest_ui_hierarchy or [],
123
- resource_id=resource_id,
124
- )
125
- if elt:
126
- current_text = get_element_text(elt)
127
- logger.info(f"Current text: {current_text}")
128
- if text_input_is_empty(text=current_text, hint_text=hint_text):
129
- break
147
+ elt = None
148
+ if text_input_resource_id:
149
+ elt = find_element_by_resource_id(
150
+ ui_hierarchy=self.state.latest_ui_hierarchy or [],
151
+ resource_id=text_input_resource_id,
152
+ )
153
+ if elt:
154
+ current_text = get_element_text(elt)
155
+ logger.info(f"Current text: {current_text}")
156
+ if text_input_is_empty(text=current_text, hint_text=hint_text):
157
+ break
130
158
 
131
159
  move_cursor_to_end_if_bounds(
132
- ctx=self.ctx, state=self.state, resource_id=resource_id, elt=elt
160
+ ctx=self.ctx,
161
+ state=self.state,
162
+ text_input_resource_id=text_input_resource_id,
163
+ text_input_coordinates=text_input_coordinates,
164
+ text_input_text=text_input_text,
165
+ elt=elt,
133
166
  )
134
167
 
135
168
  return True, current_text, erased_chars
@@ -162,7 +195,9 @@ class TextClearer:
162
195
  hint_text=hint_text,
163
196
  )
164
197
 
165
- def _handle_element_not_found(self, resource_id: str, hint_text: str | None) -> ClearTextResult:
198
+ def _handle_element_not_found(
199
+ self, resource_id: str | None, hint_text: str | None
200
+ ) -> ClearTextResult:
166
201
  error = erase_text_controller(ctx=self.ctx)
167
202
  self._refresh_ui_hierarchy()
168
203
 
@@ -176,16 +211,23 @@ class TextClearer:
176
211
  hint_text=hint_text,
177
212
  )
178
213
 
179
- def clear_text_by_resource_id(self, resource_id: str) -> ClearTextResult:
180
- element, current_text, hint_text = self._get_element_info(resource_id)
214
+ def clear_input_text(
215
+ self,
216
+ text_input_resource_id: str | None,
217
+ text_input_coordinates: ElementBounds | None,
218
+ text_input_text: str | None,
219
+ ) -> ClearTextResult:
220
+ element, current_text, hint_text = self._get_element_info(text_input_resource_id)
181
221
 
182
222
  if not element:
183
- return self._handle_element_not_found(resource_id, hint_text)
223
+ return self._handle_element_not_found(text_input_resource_id, hint_text)
184
224
 
185
225
  if not self._should_clear_text(current_text, hint_text):
186
226
  return self._handle_no_clearing_needed(current_text, hint_text)
187
227
 
188
- if not self._prepare_element_for_clearing(resource_id):
228
+ if not self._prepare_element_for_clearing(
229
+ text_input_resource_id, text_input_coordinates, text_input_text
230
+ ):
189
231
  return self._create_result(
190
232
  success=False,
191
233
  error_message="Failed to focus element",
@@ -195,7 +237,9 @@ class TextClearer:
195
237
  )
196
238
 
197
239
  success, final_text, chars_erased = self._clear_with_retries(
198
- resource_id=resource_id,
240
+ text_input_resource_id=text_input_resource_id,
241
+ text_input_coordinates=text_input_coordinates,
242
+ text_input_text=text_input_text,
199
243
  initial_text=current_text or "",
200
244
  hint_text=hint_text,
201
245
  )
@@ -218,12 +262,16 @@ def get_clear_text_tool(ctx: MobileUseContext):
218
262
  state: Annotated[State, InjectedState],
219
263
  agent_thought: str,
220
264
  text_input_resource_id: str,
265
+ text_input_coordinates: ElementBounds | None,
266
+ text_input_text: str | None,
221
267
  ):
222
268
  """
223
269
  Clears all the text from the text field, by focusing it if needed.
224
270
  """
225
271
  clearer = TextClearer(ctx, state)
226
- result = clearer.clear_text_by_resource_id(text_input_resource_id)
272
+ result = clearer.clear_input_text(
273
+ text_input_resource_id, text_input_coordinates, text_input_text
274
+ )
227
275
 
228
276
  content = (
229
277
  clear_text_wrapper.on_failure_fn(result.error_message)
@@ -1,18 +1,20 @@
1
+ from typing import Annotated
2
+
1
3
  from langchain_core.messages import ToolMessage
2
4
  from langchain_core.tools import tool
3
5
  from langchain_core.tools.base import InjectedToolCallId
6
+ from langgraph.prebuilt import InjectedState
4
7
  from langgraph.types import Command
8
+ from pydantic import Field
9
+
5
10
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
11
+ from minitap.mobile_use.context import MobileUseContext
6
12
  from minitap.mobile_use.controllers.mobile_command_controller import SelectorRequest
7
13
  from minitap.mobile_use.controllers.mobile_command_controller import (
8
14
  copy_text_from as copy_text_from_controller,
9
15
  )
10
- from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
11
- from pydantic import Field
12
- from typing import Annotated
13
- from minitap.mobile_use.context import MobileUseContext
14
16
  from minitap.mobile_use.graph.state import State
15
- from langgraph.prebuilt import InjectedState
17
+ from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
16
18
 
17
19
 
18
20
  def get_copy_text_from_tool(ctx: MobileUseContext):
@@ -1,8 +1,11 @@
1
+ from typing import Annotated
2
+
1
3
  from langchain_core.messages import ToolMessage
2
4
  from langchain_core.tools import tool
3
5
  from langchain_core.tools.base import InjectedToolCallId
4
6
  from langgraph.prebuilt import InjectedState
5
7
  from langgraph.types import Command
8
+
6
9
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
7
10
  from minitap.mobile_use.context import MobileUseContext
8
11
  from minitap.mobile_use.controllers.mobile_command_controller import (
@@ -11,18 +14,18 @@ from minitap.mobile_use.controllers.mobile_command_controller import (
11
14
  from minitap.mobile_use.graph.state import State
12
15
  from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
13
16
  from minitap.mobile_use.utils.media import compress_base64_jpeg
14
- from typing import Annotated
15
17
 
16
18
 
17
- def get_take_screenshot_tool(ctx: MobileUseContext):
19
+ def get_glimpse_screen_tool(ctx: MobileUseContext):
18
20
  @tool
19
- def take_screenshot(
21
+ def glimpse_screen(
20
22
  tool_call_id: Annotated[str, InjectedToolCallId],
21
23
  state: Annotated[State, InjectedState],
22
24
  agent_thought: str,
23
25
  ):
24
26
  """
25
- Take a screenshot of the device.
27
+ Captures the current screen as an image.
28
+ The resulting screenshot is added to the context for the next reasoning step.
26
29
  """
27
30
  compressed_image_base64 = None
28
31
  has_failed = False
@@ -36,9 +39,9 @@ def get_take_screenshot_tool(ctx: MobileUseContext):
36
39
 
37
40
  tool_message = ToolMessage(
38
41
  tool_call_id=tool_call_id,
39
- content=take_screenshot_wrapper.on_failure_fn()
42
+ content=glimpse_screen_wrapper.on_failure_fn()
40
43
  if has_failed
41
- else take_screenshot_wrapper.on_success_fn(),
44
+ else glimpse_screen_wrapper.on_success_fn(),
42
45
  additional_kwargs={"error": output} if has_failed else {},
43
46
  status="error" if has_failed else "success",
44
47
  )
@@ -56,11 +59,12 @@ def get_take_screenshot_tool(ctx: MobileUseContext):
56
59
  ),
57
60
  )
58
61
 
59
- return take_screenshot
62
+ return glimpse_screen
60
63
 
61
64
 
62
- take_screenshot_wrapper = ToolWrapper(
63
- tool_fn_getter=get_take_screenshot_tool,
64
- on_success_fn=lambda: "Screenshot taken successfully.",
65
- on_failure_fn=lambda: "Failed to take screenshot.",
65
+ glimpse_screen_wrapper = ToolWrapper(
66
+ tool_fn_getter=get_glimpse_screen_tool,
67
+ on_success_fn=lambda: "Visual context captured successfully."
68
+ + "It is now available for immediate analysis.",
69
+ on_failure_fn=lambda: "Failed to capture visual context.",
66
70
  )
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Literal
3
+ from typing import Annotated, Literal
4
4
 
5
5
  from langchain_core.messages import ToolMessage
6
6
  from langchain_core.tools import tool
@@ -8,10 +8,12 @@ from langchain_core.tools.base import InjectedToolCallId
8
8
  from langgraph.prebuilt import InjectedState
9
9
  from langgraph.types import Command
10
10
  from pydantic import BaseModel
11
- from typing import Annotated
12
11
 
13
12
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
14
13
  from minitap.mobile_use.context import MobileUseContext
14
+ from minitap.mobile_use.controllers.mobile_command_controller import (
15
+ get_screen_data,
16
+ )
15
17
  from minitap.mobile_use.controllers.mobile_command_controller import (
16
18
  input_text as input_text_controller,
17
19
  )
@@ -19,6 +21,11 @@ from minitap.mobile_use.graph.state import State
19
21
  from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
20
22
  from minitap.mobile_use.tools.utils import focus_element_if_needed, move_cursor_to_end_if_bounds
21
23
  from minitap.mobile_use.utils.logger import get_logger
24
+ from minitap.mobile_use.utils.ui_hierarchy import (
25
+ ElementBounds,
26
+ find_element_by_resource_id,
27
+ get_element_text,
28
+ )
22
29
 
23
30
  logger = get_logger(__name__)
24
31
 
@@ -47,7 +54,9 @@ def get_input_text_tool(ctx: MobileUseContext):
47
54
  state: Annotated[State, InjectedState],
48
55
  agent_thought: str,
49
56
  text: str,
50
- text_input_resource_id: str,
57
+ text_input_resource_id: str | None,
58
+ text_input_coordinates: ElementBounds | None,
59
+ text_input_text: str | None,
51
60
  ):
52
61
  """
53
62
  Focus a text field and type text into it.
@@ -55,23 +64,83 @@ def get_input_text_tool(ctx: MobileUseContext):
55
64
  - Ensure the corresponding element is focused (tap if necessary).
56
65
  - If bounds are available, tap near the end to place the cursor at the end.
57
66
  - Type the provided `text` using the controller.
67
+
68
+ Args:
69
+ tool_call_id: The ID of the tool call.
70
+ state: The state of the agent.
71
+ agent_thought: The thought of the agent.
72
+ text: The text to type.
73
+ text_input_resource_id: The resource ID of the text input (if available).
74
+ text_input_coordinates: The bounds (ElementBounds) of the text input (if available).
75
+ text_input_text: The current text content of the text input (if available).
58
76
  """
59
- focused = focus_element_if_needed(ctx=ctx, resource_id=text_input_resource_id)
60
- if focused:
61
- move_cursor_to_end_if_bounds(ctx=ctx, state=state, resource_id=text_input_resource_id)
77
+
78
+ focused = focus_element_if_needed(
79
+ ctx=ctx,
80
+ input_resource_id=text_input_resource_id,
81
+ input_coordinates=text_input_coordinates,
82
+ input_text=text_input_text,
83
+ )
84
+ if not focused:
85
+ error_message = "Failed to focus the text input element before typing."
86
+ tool_message = ToolMessage(
87
+ tool_call_id=tool_call_id,
88
+ content=input_text_wrapper.on_failure_fn(text, error_message),
89
+ additional_kwargs={"error": error_message},
90
+ status="error",
91
+ )
92
+ return Command(
93
+ update=state.sanitize_update(
94
+ ctx=ctx,
95
+ update={
96
+ "agents_thoughts": [agent_thought, error_message],
97
+ EXECUTOR_MESSAGES_KEY: [tool_message],
98
+ },
99
+ agent="executor",
100
+ ),
101
+ )
102
+
103
+ move_cursor_to_end_if_bounds(
104
+ ctx=ctx,
105
+ state=state,
106
+ text_input_resource_id=text_input_resource_id,
107
+ text_input_coordinates=text_input_coordinates,
108
+ text_input_text=text_input_text,
109
+ )
62
110
 
63
111
  result = _controller_input_text(ctx=ctx, text=text)
64
112
 
65
113
  status: Literal["success", "error"] = "success" if result.ok else "error"
66
- content_msg = (
67
- input_text_wrapper.on_success_fn(text)
114
+
115
+ text_input_content = ""
116
+ if status == "success":
117
+ if text_input_resource_id is not None:
118
+ # Verification phase for elements with resource_id
119
+ screen_data = get_screen_data(screen_api_client=ctx.screen_api_client)
120
+ state.latest_ui_hierarchy = screen_data.elements
121
+
122
+ element = find_element_by_resource_id(
123
+ ui_hierarchy=state.latest_ui_hierarchy, resource_id=text_input_resource_id
124
+ )
125
+
126
+ if not element:
127
+ result = InputResult(ok=False, error="Element not found")
128
+
129
+ if element:
130
+ text_input_content = get_element_text(element)
131
+ else:
132
+ # For elements without resource_id, skip verification and use direct message
133
+ pass
134
+
135
+ agent_outcome = (
136
+ input_text_wrapper.on_success_fn(text, text_input_content, text_input_resource_id)
68
137
  if result.ok
69
- else input_text_wrapper.on_failure_fn(text)
138
+ else input_text_wrapper.on_failure_fn(text, result.error)
70
139
  )
71
140
 
72
141
  tool_message = ToolMessage(
73
142
  tool_call_id=tool_call_id,
74
- content=content_msg,
143
+ content=agent_outcome,
75
144
  additional_kwargs={"error": result.error} if not result.ok else {},
76
145
  status=status,
77
146
  )
@@ -80,7 +149,7 @@ def get_input_text_tool(ctx: MobileUseContext):
80
149
  update=state.sanitize_update(
81
150
  ctx=ctx,
82
151
  update={
83
- "agents_thoughts": [agent_thought],
152
+ "agents_thoughts": [agent_thought, agent_outcome],
84
153
  EXECUTOR_MESSAGES_KEY: [tool_message],
85
154
  },
86
155
  agent="executor",
@@ -90,8 +159,20 @@ def get_input_text_tool(ctx: MobileUseContext):
90
159
  return input_text
91
160
 
92
161
 
162
+ def _on_input_success(text, text_input_content, text_input_resource_id):
163
+ """Success message handler for input text operations."""
164
+ if text_input_resource_id is not None:
165
+ return (
166
+ f"Typed {repr(text)}.\n"
167
+ f"Here is the whole content of input with id {repr(text_input_resource_id)}: "
168
+ f"{repr(text_input_content)}"
169
+ )
170
+ else:
171
+ return "Typed text, should now verify before moving forward"
172
+
173
+
93
174
  input_text_wrapper = ToolWrapper(
94
175
  tool_fn_getter=get_input_text_tool,
95
- on_success_fn=lambda text: f"Successfully typed {text}",
96
- on_failure_fn=lambda text: f"Failed to input text {text}",
176
+ on_success_fn=_on_input_success,
177
+ on_failure_fn=lambda text, error: f"Failed to input text {repr(text)}. Reason: {error}",
97
178
  )
@@ -1,16 +1,22 @@
1
+ from typing import Annotated
2
+
1
3
  from langchain_core.messages import ToolMessage
2
4
  from langchain_core.tools import tool
3
5
  from langchain_core.tools.base import InjectedToolCallId
6
+ from langgraph.prebuilt import InjectedState
4
7
  from langgraph.types import Command
8
+
5
9
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
6
10
  from minitap.mobile_use.context import MobileUseContext
11
+ from minitap.mobile_use.controllers.mobile_command_controller import (
12
+ get_screen_data,
13
+ )
7
14
  from minitap.mobile_use.controllers.mobile_command_controller import (
8
15
  paste_text as paste_text_controller,
9
16
  )
10
17
  from minitap.mobile_use.graph.state import State
11
- from langgraph.prebuilt import InjectedState
12
18
  from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
13
- from typing import Annotated
19
+ from minitap.mobile_use.utils.ui_hierarchy import find_element_by_resource_id, get_element_text
14
20
 
15
21
 
16
22
  def get_paste_text_tool(ctx: MobileUseContext):
@@ -19,6 +25,7 @@ def get_paste_text_tool(ctx: MobileUseContext):
19
25
  tool_call_id: Annotated[str, InjectedToolCallId],
20
26
  state: Annotated[State, InjectedState],
21
27
  agent_thought: str,
28
+ focused_element_resource_id: str,
22
29
  ):
23
30
  """
24
31
  Pastes text previously copied via `copyTextFrom` into the currently focused field.
@@ -32,12 +39,29 @@ def get_paste_text_tool(ctx: MobileUseContext):
32
39
  - pasteText
33
40
  """
34
41
  output = paste_text_controller(ctx=ctx)
42
+
43
+ text_input_content = ""
44
+ screen_data = get_screen_data(screen_api_client=ctx.screen_api_client)
45
+ state.latest_ui_hierarchy = screen_data.elements
46
+
47
+ element = find_element_by_resource_id(
48
+ ui_hierarchy=state.latest_ui_hierarchy, resource_id=focused_element_resource_id
49
+ )
50
+
51
+ if element:
52
+ text_input_content = get_element_text(element)
53
+
35
54
  has_failed = output is not None
55
+
56
+ agent_outcome = (
57
+ paste_text_wrapper.on_success_fn(text_input_content)
58
+ if not has_failed
59
+ else paste_text_wrapper.on_failure_fn(text_input_content)
60
+ )
61
+
36
62
  tool_message = ToolMessage(
37
63
  tool_call_id=tool_call_id,
38
- content=paste_text_wrapper.on_failure_fn()
39
- if has_failed
40
- else paste_text_wrapper.on_success_fn(),
64
+ content=agent_outcome,
41
65
  additional_kwargs={"error": output} if has_failed else {},
42
66
  status="error" if has_failed else "success",
43
67
  )
@@ -45,7 +69,7 @@ def get_paste_text_tool(ctx: MobileUseContext):
45
69
  update=state.sanitize_update(
46
70
  ctx=ctx,
47
71
  update={
48
- "agents_thoughts": [agent_thought],
72
+ "agents_thoughts": [agent_thought, agent_outcome],
49
73
  EXECUTOR_MESSAGES_KEY: [tool_message],
50
74
  },
51
75
  agent="executor",
@@ -57,6 +81,8 @@ def get_paste_text_tool(ctx: MobileUseContext):
57
81
 
58
82
  paste_text_wrapper = ToolWrapper(
59
83
  tool_fn_getter=get_paste_text_tool,
60
- on_success_fn=lambda: "Text pasted successfully.",
61
- on_failure_fn=lambda: "Failed to paste text.",
84
+ on_success_fn=lambda input_content: "Text pasted successfully. Here is the actual"
85
+ + f"content of the text field : {repr(input_content)}",
86
+ on_failure_fn=lambda input_content: "Failed to paste text."
87
+ + f"Here is the actual content of the text field : {repr(input_content)}",
62
88
  )