minitap-mobile-use: 2.1.0 (py3-none-any.whl) → 2.3.0 (py3-none-any.whl)

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of minitap-mobile-use might be problematic. (The original page links to more details here; the link target is not preserved in this text.)

Files changed (36):
  1. minitap/mobile_use/agents/contextor/contextor.py +4 -2
  2. minitap/mobile_use/agents/cortex/cortex.md +72 -26
  3. minitap/mobile_use/agents/cortex/cortex.py +1 -2
  4. minitap/mobile_use/agents/executor/executor.md +6 -4
  5. minitap/mobile_use/agents/executor/executor.py +3 -1
  6. minitap/mobile_use/agents/executor/utils.py +2 -1
  7. minitap/mobile_use/agents/outputter/test_outputter.py +104 -42
  8. minitap/mobile_use/agents/planner/planner.md +1 -1
  9. minitap/mobile_use/agents/planner/planner.py +4 -2
  10. minitap/mobile_use/config.py +16 -1
  11. minitap/mobile_use/controllers/mobile_command_controller.py +4 -4
  12. minitap/mobile_use/main.py +2 -2
  13. minitap/mobile_use/sdk/agent.py +17 -8
  14. minitap/mobile_use/sdk/builders/agent_config_builder.py +2 -2
  15. minitap/mobile_use/sdk/types/exceptions.py +30 -0
  16. minitap/mobile_use/sdk/utils.py +3 -2
  17. minitap/mobile_use/servers/device_hardware_bridge.py +2 -1
  18. minitap/mobile_use/servers/utils.py +6 -9
  19. minitap/mobile_use/services/llm.py +23 -6
  20. minitap/mobile_use/tools/index.py +21 -15
  21. minitap/mobile_use/tools/mobile/clear_text.py +73 -25
  22. minitap/mobile_use/tools/mobile/copy_text_from.py +7 -5
  23. minitap/mobile_use/tools/mobile/{take_screenshot.py → glimpse_screen.py} +15 -11
  24. minitap/mobile_use/tools/mobile/input_text.py +94 -13
  25. minitap/mobile_use/tools/mobile/paste_text.py +34 -8
  26. minitap/mobile_use/tools/mobile/swipe.py +107 -9
  27. minitap/mobile_use/tools/test_utils.py +351 -0
  28. minitap/mobile_use/tools/tool_wrapper.py +5 -0
  29. minitap/mobile_use/tools/utils.py +147 -40
  30. minitap/mobile_use/utils/recorder.py +2 -9
  31. minitap/mobile_use/utils/test_ui_hierarchy.py +178 -0
  32. minitap/mobile_use/utils/ui_hierarchy.py +2 -2
  33. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/METADATA +28 -8
  34. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/RECORD +36 -34
  35. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/WHEEL +0 -0
  36. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/entry_points.txt +0 -0
File 27: minitap/mobile_use/tools/test_utils.py (+351 -0)
@@ -0,0 +1,351 @@
1
+ import sys
2
+ from unittest.mock import Mock, patch
3
+
4
+ import pytest
5
+
6
+ # Mock the problematic langgraph import at module level
7
+ sys.modules["langgraph.prebuilt.chat_agent_executor"] = Mock()
8
+ sys.modules["minitap.mobile_use.graph.state"] = Mock()
9
+
10
+ from minitap.mobile_use.context import MobileUseContext # noqa: E402
11
+ from minitap.mobile_use.controllers.mobile_command_controller import ( # noqa: E402
12
+ IdSelectorRequest,
13
+ SelectorRequestWithCoordinates,
14
+ )
15
+ from minitap.mobile_use.tools.utils import ( # noqa: E402
16
+ focus_element_if_needed,
17
+ move_cursor_to_end_if_bounds,
18
+ )
19
+ from minitap.mobile_use.utils.ui_hierarchy import ElementBounds # noqa: E402
20
+
21
+
22
+ @pytest.fixture
23
+ def mock_context():
24
+ """Create a mock MobileUseContext for testing."""
25
+ ctx = Mock(spec=MobileUseContext)
26
+ ctx.hw_bridge_client = Mock()
27
+ return ctx
28
+
29
+
30
+ @pytest.fixture
31
+ def mock_state():
32
+ """Create a mock State for testing."""
33
+ state = Mock()
34
+ state.latest_ui_hierarchy = []
35
+ return state
36
+
37
+
38
+ @pytest.fixture
39
+ def sample_element():
40
+ """Create a sample UI element for testing."""
41
+ return {
42
+ "resourceId": "com.example:id/text_input",
43
+ "text": "Sample text",
44
+ "bounds": {"x": 100, "y": 200, "width": 300, "height": 50},
45
+ "focused": "false",
46
+ }
47
+
48
+
49
+ @pytest.fixture
50
+ def sample_rich_element():
51
+ """Create a sample rich UI element for testing."""
52
+ return {
53
+ "attributes": {
54
+ "resource-id": "com.example:id/text_input",
55
+ "focused": "false",
56
+ "text": "Sample text",
57
+ },
58
+ "children": [],
59
+ }
60
+
61
+
62
+ class TestMoveCursorToEndIfBounds:
63
+ """Test cases for move_cursor_to_end_if_bounds function."""
64
+
65
+ @patch("minitap.mobile_use.tools.utils.tap")
66
+ @patch("minitap.mobile_use.tools.utils.find_element_by_resource_id")
67
+ def test_move_cursor_with_resource_id(
68
+ self, mock_find_element, mock_tap, mock_context, mock_state, sample_element
69
+ ):
70
+ """Test moving cursor using resource_id (highest priority)."""
71
+ mock_state.latest_ui_hierarchy = [sample_element]
72
+ mock_find_element.return_value = sample_element
73
+
74
+ result = move_cursor_to_end_if_bounds(
75
+ ctx=mock_context,
76
+ state=mock_state,
77
+ text_input_resource_id="com.example:id/text_input",
78
+ text_input_coordinates=None,
79
+ text_input_text=None,
80
+ )
81
+
82
+ mock_find_element.assert_called_once_with(
83
+ ui_hierarchy=[sample_element], resource_id="com.example:id/text_input"
84
+ )
85
+ mock_tap.assert_called_once()
86
+ call_args = mock_tap.call_args[1]
87
+ selector_request = call_args["selector_request"]
88
+ assert isinstance(selector_request, SelectorRequestWithCoordinates)
89
+ coords = selector_request.coordinates
90
+ assert coords.x == 397 # 100 + 300 * 0.99
91
+ assert coords.y == 249 # 200 + 50 * 0.99
92
+ assert result == sample_element
93
+
94
+ @patch("minitap.mobile_use.tools.utils.tap")
95
+ @patch("minitap.mobile_use.tools.utils.find_element_by_resource_id")
96
+ def test_move_cursor_with_coordinates_only(
97
+ self, mock_find_element, mock_tap, mock_context, mock_state
98
+ ):
99
+ """Test moving cursor when only coordinates are provided."""
100
+ bounds = ElementBounds(x=50, y=150, width=200, height=40)
101
+
102
+ result = move_cursor_to_end_if_bounds(
103
+ ctx=mock_context,
104
+ state=mock_state,
105
+ text_input_resource_id=None,
106
+ text_input_coordinates=bounds,
107
+ text_input_text=None,
108
+ )
109
+
110
+ mock_find_element.assert_not_called()
111
+ mock_tap.assert_called_once()
112
+ call_args = mock_tap.call_args[1]
113
+ selector_request = call_args["selector_request"]
114
+ coords = selector_request.coordinates
115
+ assert coords.x == 248 # 50 + 200 * 0.99
116
+ assert coords.y == 189 # 150 + 40 * 0.99
117
+ assert result is None # No element is returned when using coords directly
118
+
119
+ @patch("minitap.mobile_use.tools.utils.tap")
120
+ @patch("minitap.mobile_use.tools.utils.find_element_by_text")
121
+ def test_move_cursor_with_text_only_success(
122
+ self, mock_find_text, mock_tap, mock_context, mock_state, sample_element
123
+ ):
124
+ """Test moving cursor when only text is provided and succeeds."""
125
+ mock_state.latest_ui_hierarchy = [sample_element]
126
+ mock_find_text.return_value = sample_element
127
+
128
+ result = move_cursor_to_end_if_bounds(
129
+ ctx=mock_context,
130
+ state=mock_state,
131
+ text_input_resource_id=None,
132
+ text_input_coordinates=None,
133
+ text_input_text="Sample text",
134
+ )
135
+
136
+ mock_find_text.assert_called_once_with([sample_element], "Sample text")
137
+ mock_tap.assert_called_once()
138
+ assert result == sample_element
139
+
140
+ @patch("minitap.mobile_use.tools.utils.tap")
141
+ @patch("minitap.mobile_use.tools.utils.find_element_by_text")
142
+ def test_move_cursor_with_text_only_element_not_found(
143
+ self, mock_find_text, mock_tap, mock_context, mock_state
144
+ ):
145
+ """Test when searching by text finds no element."""
146
+ mock_state.latest_ui_hierarchy = []
147
+ mock_find_text.return_value = None
148
+
149
+ result = move_cursor_to_end_if_bounds(
150
+ ctx=mock_context,
151
+ state=mock_state,
152
+ text_input_resource_id=None,
153
+ text_input_coordinates=None,
154
+ text_input_text="Nonexistent text",
155
+ )
156
+
157
+ mock_tap.assert_not_called()
158
+ assert result is None
159
+
160
+ @patch("minitap.mobile_use.tools.utils.tap")
161
+ @patch("minitap.mobile_use.tools.utils.find_element_by_text")
162
+ def test_move_cursor_with_text_only_no_bounds(
163
+ self, mock_find_text, mock_tap, mock_context, mock_state
164
+ ):
165
+ """Test when element is found by text but has no bounds."""
166
+ element_no_bounds = {"text": "Text without bounds"}
167
+ mock_state.latest_ui_hierarchy = [element_no_bounds]
168
+ mock_find_text.return_value = element_no_bounds
169
+
170
+ result = move_cursor_to_end_if_bounds(
171
+ ctx=mock_context,
172
+ state=mock_state,
173
+ text_input_resource_id=None,
174
+ text_input_coordinates=None,
175
+ text_input_text="Text without bounds",
176
+ )
177
+
178
+ mock_tap.assert_not_called()
179
+ assert result is None # Should return None as no action was taken
180
+
181
+ @patch("minitap.mobile_use.tools.utils.find_element_by_resource_id")
182
+ def test_move_cursor_element_not_found_by_id(self, mock_find_element, mock_context, mock_state):
183
+ """Test when element is not found by resource_id."""
184
+ mock_find_element.return_value = None
185
+
186
+ result = move_cursor_to_end_if_bounds(
187
+ ctx=mock_context,
188
+ state=mock_state,
189
+ text_input_resource_id="com.example:id/nonexistent",
190
+ text_input_coordinates=None,
191
+ text_input_text=None,
192
+ )
193
+
194
+ assert result is None
195
+
196
+
197
+ class TestFocusElementIfNeeded:
198
+ """Test cases for focus_element_if_needed function."""
199
+
200
+ @patch("minitap.mobile_use.tools.utils.tap")
201
+ @patch("minitap.mobile_use.tools.utils.find_element_by_resource_id")
202
+ def test_focus_element_already_focused(
203
+ self, mock_find_element, mock_tap, mock_context, sample_rich_element
204
+ ):
205
+ """Test when element is already focused."""
206
+ focused_element = sample_rich_element.copy()
207
+ focused_element["attributes"]["focused"] = "true"
208
+
209
+ mock_context.hw_bridge_client.get_rich_hierarchy.return_value = [focused_element]
210
+ mock_find_element.return_value = focused_element["attributes"]
211
+
212
+ result = focus_element_if_needed(
213
+ ctx=mock_context,
214
+ input_resource_id="com.example:id/text_input",
215
+ input_coordinates=None,
216
+ input_text=None,
217
+ )
218
+
219
+ mock_tap.assert_not_called()
220
+ assert result is True
221
+ mock_context.hw_bridge_client.get_rich_hierarchy.assert_called_once()
222
+
223
+ @patch("minitap.mobile_use.tools.utils.tap")
224
+ @patch("minitap.mobile_use.tools.utils.find_element_by_resource_id")
225
+ def test_focus_element_needs_focus_success(
226
+ self, mock_find_element, mock_tap, mock_context, sample_rich_element
227
+ ):
228
+ """Test when element needs focus and focusing succeeds."""
229
+ unfocused_element = sample_rich_element
230
+ focused_element = {
231
+ "attributes": {
232
+ "resource-id": "com.example:id/text_input",
233
+ "focused": "true",
234
+ },
235
+ "children": [],
236
+ }
237
+
238
+ mock_context.hw_bridge_client.get_rich_hierarchy.side_effect = [
239
+ [unfocused_element],
240
+ [focused_element],
241
+ ]
242
+ mock_find_element.side_effect = [
243
+ unfocused_element["attributes"],
244
+ focused_element["attributes"],
245
+ ]
246
+
247
+ result = focus_element_if_needed(
248
+ ctx=mock_context,
249
+ input_resource_id="com.example:id/text_input",
250
+ input_coordinates=None,
251
+ input_text=None,
252
+ )
253
+
254
+ mock_tap.assert_called_once_with(
255
+ ctx=mock_context,
256
+ selector_request=IdSelectorRequest(id="com.example:id/text_input"),
257
+ )
258
+ assert mock_context.hw_bridge_client.get_rich_hierarchy.call_count == 2
259
+ assert result is True
260
+
261
+ @patch("minitap.mobile_use.tools.utils.tap")
262
+ @patch("minitap.mobile_use.tools.utils.logger")
263
+ @patch("minitap.mobile_use.tools.utils.find_element_by_resource_id")
264
+ def test_focus_id_and_text_mismatch_fallback_to_text(
265
+ self, mock_find_id, mock_logger, mock_tap, mock_context, sample_rich_element
266
+ ):
267
+ """Test fallback when resource_id and text point to different elements."""
268
+ element_from_id = sample_rich_element["attributes"].copy()
269
+ element_from_id["text"] = "Different text"
270
+
271
+ # L'élément qui sera trouvé par le texte doit avoir des "bounds"
272
+ element_from_text = sample_rich_element.copy()
273
+ element_from_text["bounds"] = {"x": 10, "y": 20, "width": 100, "height": 30}
274
+
275
+ mock_context.hw_bridge_client.get_rich_hierarchy.return_value = [element_from_text]
276
+ mock_find_id.return_value = element_from_id
277
+
278
+ with patch("minitap.mobile_use.tools.utils.find_element_by_text") as mock_find_text:
279
+ mock_find_text.return_value = element_from_text # Trouvé par le texte
280
+
281
+ result = focus_element_if_needed(
282
+ ctx=mock_context,
283
+ input_resource_id="com.example:id/text_input",
284
+ input_coordinates=None,
285
+ input_text="Sample text", # Le texte correct à rechercher
286
+ )
287
+
288
+ mock_logger.warning.assert_called_once()
289
+ # Maintenant, tap devrait être appelé car l'élément trouvé a des "bounds"
290
+ mock_tap.assert_called_once()
291
+ assert result is True
292
+
293
+ @patch("minitap.mobile_use.tools.utils.tap")
294
+ @patch("minitap.mobile_use.tools.utils.find_element_by_text")
295
+ def test_focus_fallback_to_text(
296
+ self, mock_find_text, mock_tap, mock_context, sample_rich_element
297
+ ):
298
+ """Test fallback to focusing using text."""
299
+ # L'élément doit avoir des "bounds" au premier niveau pour
300
+ # que get_bounds_for_element fonctionne
301
+ element_with_bounds = sample_rich_element.copy()
302
+ element_with_bounds["bounds"] = {"x": 10, "y": 20, "width": 100, "height": 30}
303
+
304
+ mock_context.hw_bridge_client.get_rich_hierarchy.return_value = [element_with_bounds]
305
+ mock_find_text.return_value = element_with_bounds
306
+
307
+ result = focus_element_if_needed(
308
+ ctx=mock_context,
309
+ input_resource_id=None,
310
+ input_coordinates=None,
311
+ input_text="Sample text",
312
+ )
313
+
314
+ mock_find_text.assert_called_once()
315
+ mock_tap.assert_called_once()
316
+ call_args = mock_tap.call_args[1]
317
+ selector = call_args["selector_request"]
318
+ # Vérifie que le tap se fait bien au centre des "bounds"
319
+ assert selector.coordinates.x == 60 # 10 + 100/2
320
+ assert selector.coordinates.y == 35 # 20 + 30/2
321
+ assert result is True
322
+
323
+ @patch("minitap.mobile_use.tools.utils.logger")
324
+ def test_focus_all_locators_fail(self, mock_logger, mock_context):
325
+ """Test failure when no locator can find an element."""
326
+ mock_context.hw_bridge_client.get_rich_hierarchy.return_value = []
327
+
328
+ # Mock find_element functions to return None
329
+ with (
330
+ patch("minitap.mobile_use.tools.utils.find_element_by_resource_id") as mock_find_id,
331
+ patch("minitap.mobile_use.tools.utils.find_element_by_text") as mock_find_text,
332
+ ):
333
+ mock_find_id.return_value = None
334
+ mock_find_text.return_value = None
335
+
336
+ result = focus_element_if_needed(
337
+ ctx=mock_context,
338
+ input_resource_id="nonexistent",
339
+ input_coordinates=None,
340
+ input_text="nonexistent",
341
+ )
342
+
343
+ mock_logger.error.assert_called_once_with(
344
+ "Failed to focus element. No valid locator"
345
+ + "(resource_id, coordinates, or text) succeeded."
346
+ )
347
+ assert result is False
348
+
349
+
350
+ if __name__ == "__main__":
351
+ pytest.main([__file__])
File 28: minitap/mobile_use/tools/tool_wrapper.py (+5 -0)
@@ -2,6 +2,7 @@ from collections.abc import Callable
2
2
 
3
3
  from langchain_core.tools import BaseTool
4
4
  from pydantic import BaseModel
5
+
5
6
  from minitap.mobile_use.context import MobileUseContext
6
7
 
7
8
 
@@ -9,3 +10,7 @@ class ToolWrapper(BaseModel):
9
10
  tool_fn_getter: Callable[[MobileUseContext], BaseTool]
10
11
  on_success_fn: Callable[..., str]
11
12
  on_failure_fn: Callable[..., str]
13
+
14
+
15
+ class CompositeToolWrapper(ToolWrapper):
16
+ composite_tools_fn_getter: Callable[[MobileUseContext], list[BaseTool]]
File 29: minitap/mobile_use/tools/utils.py (+147 -40)
@@ -10,38 +10,47 @@ from minitap.mobile_use.controllers.mobile_command_controller import (
10
10
  from minitap.mobile_use.graph.state import State
11
11
  from minitap.mobile_use.utils.logger import get_logger
12
12
  from minitap.mobile_use.utils.ui_hierarchy import (
13
+ ElementBounds,
13
14
  Point,
14
15
  find_element_by_resource_id,
15
16
  get_bounds_for_element,
17
+ get_element_text,
16
18
  is_element_focused,
17
19
  )
18
20
 
19
21
  logger = get_logger(__name__)
20
22
 
21
23
 
22
- def move_cursor_to_end_if_bounds(
23
- ctx: MobileUseContext,
24
- state: State,
25
- resource_id: str,
26
- elt: dict | None = None,
27
- ) -> dict | None:
24
+ def find_element_by_text(ui_hierarchy: list[dict], text: str) -> dict | None:
28
25
  """
29
- Best-effort move of the text cursor near the end of the input by tapping the
30
- bottom-right area of the focused element (if bounds are available).
26
+ Find a UI element by its text content (adapted to both flat and rich hierarchy)
27
+
28
+ This function performs a recursive, case-insensitive partial search.
29
+
30
+ Args:
31
+ ui_hierarchy: List of UI element dictionaries.
32
+ text: The text content to search for.
33
+
34
+ Returns:
35
+ The complete UI element dictionary if found, None otherwise.
31
36
  """
32
- if not elt:
33
- elt = find_element_by_resource_id(
34
- ui_hierarchy=state.latest_ui_hierarchy or [],
35
- resource_id=resource_id,
36
- )
37
- if not elt:
38
- return
39
37
 
40
- bounds = get_bounds_for_element(elt)
41
- if not bounds:
42
- return elt
38
+ def search_recursive(elements: list[dict]) -> dict | None:
39
+ for element in elements:
40
+ if isinstance(element, dict):
41
+ src = element.get("attributes", element)
42
+ if text and text.lower() == src.get("text", "").lower():
43
+ return element
44
+ if (children := element.get("children", [])) and (
45
+ found := search_recursive(children)
46
+ ):
47
+ return found
48
+ return None
49
+
50
+ return search_recursive(ui_hierarchy)
51
+
43
52
 
44
- logger.debug("Tapping near the end of the input to move the cursor")
53
+ def tap_bottom_right_of_element(bounds: ElementBounds, ctx: MobileUseContext):
45
54
  bottom_right: Point = bounds.get_relative_point(x_percent=0.99, y_percent=0.99)
46
55
  tap(
47
56
  ctx=ctx,
@@ -52,35 +61,133 @@ def move_cursor_to_end_if_bounds(
52
61
  ),
53
62
  ),
54
63
  )
55
- logger.debug(f"Tapped end of input {resource_id} at ({bottom_right.x}, {bottom_right.y})")
56
- return elt
64
+
65
+
66
+ def move_cursor_to_end_if_bounds(
67
+ ctx: MobileUseContext,
68
+ state: State,
69
+ text_input_resource_id: str | None,
70
+ text_input_coordinates: ElementBounds | None,
71
+ text_input_text: str | None,
72
+ elt: dict | None = None,
73
+ ) -> dict | None:
74
+ """
75
+ Best-effort move of the text cursor near the end of the input by tapping the
76
+ bottom-right area of the focused element (if bounds are available).
77
+ """
78
+ if text_input_resource_id:
79
+ if not elt:
80
+ elt = find_element_by_resource_id(
81
+ ui_hierarchy=state.latest_ui_hierarchy or [],
82
+ resource_id=text_input_resource_id,
83
+ )
84
+ if not elt:
85
+ return
86
+
87
+ bounds = get_bounds_for_element(elt)
88
+ if not bounds:
89
+ return elt
90
+
91
+ logger.debug("Tapping near the end of the input to move the cursor")
92
+ tap_bottom_right_of_element(bounds=bounds, ctx=ctx)
93
+ logger.debug(f"Tapped end of input {text_input_resource_id}")
94
+ return elt
95
+
96
+ if text_input_coordinates:
97
+ tap_bottom_right_of_element(text_input_coordinates, ctx=ctx)
98
+ logger.debug("Tapped end of input by coordinates")
99
+ return elt
100
+
101
+ if text_input_text:
102
+ text_elt = find_element_by_text(state.latest_ui_hierarchy or [], text_input_text)
103
+ if text_elt:
104
+ bounds = get_bounds_for_element(text_elt)
105
+ if bounds:
106
+ tap_bottom_right_of_element(bounds=bounds, ctx=ctx)
107
+ logger.debug(f"Tapped end of input that had text'{text_input_text}'")
108
+ return text_elt
109
+ return None
110
+
111
+ return None
57
112
 
58
113
 
59
114
  def focus_element_if_needed(
60
115
  ctx: MobileUseContext,
61
- resource_id: str,
116
+ input_resource_id: str | None,
117
+ input_coordinates: ElementBounds | None,
118
+ input_text: str | None,
62
119
  ) -> bool:
63
120
  """
64
- Ensures the element identified by `resource_id` is focused.
121
+ Ensures the element is focused, with a sanity check to prevent trusting misleading IDs.
65
122
  """
66
- rich_hierarchy: list[dict] = ctx.hw_bridge_client.get_rich_hierarchy()
67
- rich_elt = find_element_by_resource_id(
68
- ui_hierarchy=rich_hierarchy,
69
- resource_id=resource_id,
70
- is_rich_hierarchy=True,
71
- )
72
- if rich_elt and not is_element_focused(rich_elt):
73
- tap(ctx=ctx, selector_request=IdSelectorRequest(id=resource_id))
74
- logger.debug(f"Focused (tap) on resource_id={resource_id}")
75
- rich_hierarchy = ctx.hw_bridge_client.get_rich_hierarchy()
76
- rich_elt = find_element_by_resource_id(
77
- ui_hierarchy=rich_hierarchy,
78
- resource_id=resource_id,
79
- is_rich_hierarchy=True,
123
+ rich_hierarchy = ctx.hw_bridge_client.get_rich_hierarchy()
124
+
125
+ elt_from_id = None
126
+ if input_resource_id:
127
+ elt_from_id = find_element_by_resource_id(
128
+ ui_hierarchy=rich_hierarchy, resource_id=input_resource_id, is_rich_hierarchy=True
129
+ )
130
+
131
+ if elt_from_id and input_text:
132
+ text_from_id_elt = get_element_text(elt_from_id)
133
+ if not text_from_id_elt or input_text.lower() != text_from_id_elt.lower():
134
+ logger.warning(
135
+ f"ID '{input_resource_id}' and text '{input_text}'"
136
+ + "seem to be on different elements. "
137
+ "Ignoring the resource_id and falling back to other locators."
138
+ )
139
+ elt_from_id = None
140
+
141
+ if elt_from_id:
142
+ if not is_element_focused(elt_from_id):
143
+ tap(ctx=ctx, selector_request=IdSelectorRequest(id=input_resource_id)) # type: ignore
144
+ logger.debug(f"Focused (tap) on resource_id={input_resource_id}")
145
+ rich_hierarchy = ctx.hw_bridge_client.get_rich_hierarchy()
146
+ elt_from_id = find_element_by_resource_id(
147
+ ui_hierarchy=rich_hierarchy,
148
+ resource_id=input_resource_id, # type: ignore
149
+ is_rich_hierarchy=True,
150
+ )
151
+ if elt_from_id and is_element_focused(elt_from_id):
152
+ logger.debug(f"Text input is focused: {input_resource_id}")
153
+ return True
154
+
155
+ logger.warning(f"Failed to focus using resource_id='{input_resource_id}'. Fallback...")
156
+
157
+ if input_coordinates:
158
+ relative_point = input_coordinates.get_center()
159
+ tap(
160
+ ctx=ctx,
161
+ selector_request=SelectorRequestWithCoordinates(
162
+ coordinates=CoordinatesSelectorRequest(
163
+ x=relative_point.x,
164
+ y=relative_point.y,
165
+ ),
166
+ ),
80
167
  )
81
- if rich_elt and is_element_focused(rich_elt):
82
- logger.debug(f"Text input is focused: {resource_id}")
168
+ logger.debug(f"Tapped on coordinates ({relative_point.x}, {relative_point.y}) to focus.")
83
169
  return True
84
170
 
85
- logger.warning(f"Failed to focus resource_id={resource_id}")
171
+ if input_text:
172
+ text_elt = find_element_by_text(rich_hierarchy, input_text)
173
+ if text_elt:
174
+ bounds = get_bounds_for_element(text_elt)
175
+ if bounds:
176
+ relative_point = bounds.get_center()
177
+ tap(
178
+ ctx=ctx,
179
+ selector_request=SelectorRequestWithCoordinates(
180
+ coordinates=CoordinatesSelectorRequest(
181
+ x=relative_point.x,
182
+ y=relative_point.y,
183
+ ),
184
+ ),
185
+ )
186
+ logger.debug(f"Tapped on text element '{input_text}' to focus.")
187
+ return True
188
+
189
+ logger.error(
190
+ "Failed to focus element. No valid locator"
191
+ + "(resource_id, coordinates, or text) succeeded."
192
+ )
86
193
  return False
File 30: minitap/mobile_use/utils/recorder.py (+2 -9)
@@ -45,12 +45,5 @@ def record_interaction(ctx: MobileUseContext, response: BaseMessage):
45
45
  return "Screenshot recorded successfully"
46
46
 
47
47
 
48
- def log_agent_thought(prefix: str, agent_thought: str):
49
- if prefix:
50
- prefix = prefix[0].upper() + prefix[1:]
51
- else:
52
- prefix = "New agent thought"
53
- logger.info(
54
- f"💭 {Fore.LIGHTMAGENTA_EX + Style.BRIGHT}{prefix}{Style.RESET_ALL}: "
55
- f"{Fore.LIGHTMAGENTA_EX}{agent_thought}{Style.RESET_ALL}"
56
- )
48
+ def log_agent_thought(agent_thought: str):
49
+ logger.info(f"💭 {Fore.LIGHTMAGENTA_EX}{agent_thought}{Style.RESET_ALL}")