minitap-mobile-use 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of minitap-mobile-use might be problematic. Click here for more details.

Files changed (54)
  1. minitap/mobile_use/agents/contextor/contextor.py +2 -2
  2. minitap/mobile_use/agents/cortex/cortex.md +49 -8
  3. minitap/mobile_use/agents/cortex/cortex.py +8 -4
  4. minitap/mobile_use/agents/executor/executor.md +14 -11
  5. minitap/mobile_use/agents/executor/executor.py +6 -5
  6. minitap/mobile_use/agents/hopper/hopper.py +6 -3
  7. minitap/mobile_use/agents/orchestrator/orchestrator.py +26 -11
  8. minitap/mobile_use/agents/outputter/outputter.py +6 -3
  9. minitap/mobile_use/agents/planner/planner.md +20 -22
  10. minitap/mobile_use/agents/planner/planner.py +10 -7
  11. minitap/mobile_use/agents/planner/types.py +4 -2
  12. minitap/mobile_use/agents/planner/utils.py +14 -0
  13. minitap/mobile_use/agents/summarizer/summarizer.py +2 -2
  14. minitap/mobile_use/config.py +6 -1
  15. minitap/mobile_use/context.py +13 -3
  16. minitap/mobile_use/controllers/mobile_command_controller.py +1 -14
  17. minitap/mobile_use/graph/state.py +7 -3
  18. minitap/mobile_use/sdk/agent.py +188 -23
  19. minitap/mobile_use/sdk/examples/README.md +19 -1
  20. minitap/mobile_use/sdk/examples/platform_minimal_example.py +46 -0
  21. minitap/mobile_use/sdk/services/platform.py +244 -0
  22. minitap/mobile_use/sdk/types/__init__.py +14 -14
  23. minitap/mobile_use/sdk/types/exceptions.py +27 -0
  24. minitap/mobile_use/sdk/types/platform.py +125 -0
  25. minitap/mobile_use/sdk/types/task.py +60 -17
  26. minitap/mobile_use/servers/device_hardware_bridge.py +1 -1
  27. minitap/mobile_use/servers/stop_servers.py +11 -12
  28. minitap/mobile_use/services/llm.py +89 -5
  29. minitap/mobile_use/tools/index.py +0 -6
  30. minitap/mobile_use/tools/mobile/back.py +3 -3
  31. minitap/mobile_use/tools/mobile/clear_text.py +24 -43
  32. minitap/mobile_use/tools/mobile/erase_one_char.py +5 -4
  33. minitap/mobile_use/tools/mobile/glimpse_screen.py +11 -7
  34. minitap/mobile_use/tools/mobile/input_text.py +21 -51
  35. minitap/mobile_use/tools/mobile/launch_app.py +54 -22
  36. minitap/mobile_use/tools/mobile/long_press_on.py +15 -8
  37. minitap/mobile_use/tools/mobile/open_link.py +15 -8
  38. minitap/mobile_use/tools/mobile/press_key.py +15 -8
  39. minitap/mobile_use/tools/mobile/stop_app.py +14 -8
  40. minitap/mobile_use/tools/mobile/swipe.py +11 -5
  41. minitap/mobile_use/tools/mobile/tap.py +103 -21
  42. minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +3 -3
  43. minitap/mobile_use/tools/test_utils.py +104 -78
  44. minitap/mobile_use/tools/types.py +35 -0
  45. minitap/mobile_use/tools/utils.py +51 -48
  46. minitap/mobile_use/utils/recorder.py +1 -1
  47. minitap/mobile_use/utils/ui_hierarchy.py +9 -2
  48. {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.4.0.dist-info}/METADATA +3 -1
  49. {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.4.0.dist-info}/RECORD +51 -50
  50. minitap/mobile_use/tools/mobile/copy_text_from.py +0 -75
  51. minitap/mobile_use/tools/mobile/find_packages.py +0 -69
  52. minitap/mobile_use/tools/mobile/paste_text.py +0 -88
  53. {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.4.0.dist-info}/WHEEL +0 -0
  54. {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.4.0.dist-info}/entry_points.txt +0 -0
@@ -12,6 +12,7 @@ from minitap.mobile_use.controllers.mobile_command_controller import ( # noqa:
12
12
  IdSelectorRequest,
13
13
  SelectorRequestWithCoordinates,
14
14
  )
15
+ from minitap.mobile_use.tools.types import Target # noqa: E402
15
16
  from minitap.mobile_use.tools.utils import ( # noqa: E402
16
17
  focus_element_if_needed,
17
18
  move_cursor_to_end_if_bounds,
@@ -54,6 +55,7 @@ def sample_rich_element():
54
55
  "resource-id": "com.example:id/text_input",
55
56
  "focused": "false",
56
57
  "text": "Sample text",
58
+ "bounds": {"x": 100, "y": 200, "width": 300, "height": 50},
57
59
  },
58
60
  "children": [],
59
61
  }
@@ -71,16 +73,19 @@ class TestMoveCursorToEndIfBounds:
71
73
  mock_state.latest_ui_hierarchy = [sample_element]
72
74
  mock_find_element.return_value = sample_element
73
75
 
74
- result = move_cursor_to_end_if_bounds(
75
- ctx=mock_context,
76
- state=mock_state,
77
- text_input_resource_id="com.example:id/text_input",
78
- text_input_coordinates=None,
79
- text_input_text=None,
76
+ target = Target(
77
+ resource_id="com.example:id/text_input",
78
+ resource_id_index=None,
79
+ text=None,
80
+ text_index=None,
81
+ coordinates=None,
80
82
  )
83
+ result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
81
84
 
82
85
  mock_find_element.assert_called_once_with(
83
- ui_hierarchy=[sample_element], resource_id="com.example:id/text_input"
86
+ ui_hierarchy=[sample_element],
87
+ resource_id="com.example:id/text_input",
88
+ index=0,
84
89
  )
85
90
  mock_tap.assert_called_once()
86
91
  call_args = mock_tap.call_args[1]
@@ -98,15 +103,16 @@ class TestMoveCursorToEndIfBounds:
98
103
  ):
99
104
  """Test moving cursor when only coordinates are provided."""
100
105
  bounds = ElementBounds(x=50, y=150, width=200, height=40)
101
-
102
- result = move_cursor_to_end_if_bounds(
103
- ctx=mock_context,
104
- state=mock_state,
105
- text_input_resource_id=None,
106
- text_input_coordinates=bounds,
107
- text_input_text=None,
106
+ target = Target(
107
+ resource_id=None,
108
+ resource_id_index=None,
109
+ text=None,
110
+ text_index=None,
111
+ coordinates=bounds,
108
112
  )
109
113
 
114
+ result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
115
+
110
116
  mock_find_element.assert_not_called()
111
117
  mock_tap.assert_called_once()
112
118
  call_args = mock_tap.call_args[1]
@@ -125,15 +131,16 @@ class TestMoveCursorToEndIfBounds:
125
131
  mock_state.latest_ui_hierarchy = [sample_element]
126
132
  mock_find_text.return_value = sample_element
127
133
 
128
- result = move_cursor_to_end_if_bounds(
129
- ctx=mock_context,
130
- state=mock_state,
131
- text_input_resource_id=None,
132
- text_input_coordinates=None,
133
- text_input_text="Sample text",
134
+ target = Target(
135
+ resource_id=None,
136
+ resource_id_index=None,
137
+ text="Sample text",
138
+ text_index=0,
139
+ coordinates=None,
134
140
  )
141
+ result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
135
142
 
136
- mock_find_text.assert_called_once_with([sample_element], "Sample text")
143
+ mock_find_text.assert_called_once_with([sample_element], "Sample text", index=0)
137
144
  mock_tap.assert_called_once()
138
145
  assert result == sample_element
139
146
 
@@ -146,13 +153,14 @@ class TestMoveCursorToEndIfBounds:
146
153
  mock_state.latest_ui_hierarchy = []
147
154
  mock_find_text.return_value = None
148
155
 
149
- result = move_cursor_to_end_if_bounds(
150
- ctx=mock_context,
151
- state=mock_state,
152
- text_input_resource_id=None,
153
- text_input_coordinates=None,
154
- text_input_text="Nonexistent text",
156
+ target = Target(
157
+ resource_id=None,
158
+ resource_id_index=None,
159
+ text="Nonexistent text",
160
+ text_index=None,
161
+ coordinates=None,
155
162
  )
163
+ result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
156
164
 
157
165
  mock_tap.assert_not_called()
158
166
  assert result is None
@@ -167,13 +175,14 @@ class TestMoveCursorToEndIfBounds:
167
175
  mock_state.latest_ui_hierarchy = [element_no_bounds]
168
176
  mock_find_text.return_value = element_no_bounds
169
177
 
170
- result = move_cursor_to_end_if_bounds(
171
- ctx=mock_context,
172
- state=mock_state,
173
- text_input_resource_id=None,
174
- text_input_coordinates=None,
175
- text_input_text="Text without bounds",
178
+ target = Target(
179
+ resource_id=None,
180
+ resource_id_index=None,
181
+ text="Text without bounds",
182
+ text_index=None,
183
+ coordinates=None,
176
184
  )
185
+ result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
177
186
 
178
187
  mock_tap.assert_not_called()
179
188
  assert result is None # Should return None as no action was taken
@@ -183,13 +192,14 @@ class TestMoveCursorToEndIfBounds:
183
192
  """Test when element is not found by resource_id."""
184
193
  mock_find_element.return_value = None
185
194
 
186
- result = move_cursor_to_end_if_bounds(
187
- ctx=mock_context,
188
- state=mock_state,
189
- text_input_resource_id="com.example:id/nonexistent",
190
- text_input_coordinates=None,
191
- text_input_text=None,
195
+ target = Target(
196
+ resource_id="com.example:id/nonexistent",
197
+ resource_id_index=None,
198
+ text=None,
199
+ text_index=None,
200
+ coordinates=None,
192
201
  )
202
+ result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
193
203
 
194
204
  assert result is None
195
205
 
@@ -209,12 +219,14 @@ class TestFocusElementIfNeeded:
209
219
  mock_context.hw_bridge_client.get_rich_hierarchy.return_value = [focused_element]
210
220
  mock_find_element.return_value = focused_element["attributes"]
211
221
 
212
- result = focus_element_if_needed(
213
- ctx=mock_context,
214
- input_resource_id="com.example:id/text_input",
215
- input_coordinates=None,
216
- input_text=None,
222
+ target = Target(
223
+ resource_id="com.example:id/text_input",
224
+ resource_id_index=None,
225
+ text=None,
226
+ text_index=None,
227
+ coordinates=None,
217
228
  )
229
+ result = focus_element_if_needed(ctx=mock_context, target=target)
218
230
 
219
231
  mock_tap.assert_not_called()
220
232
  assert result is True
@@ -244,16 +256,19 @@ class TestFocusElementIfNeeded:
244
256
  focused_element["attributes"],
245
257
  ]
246
258
 
247
- result = focus_element_if_needed(
248
- ctx=mock_context,
249
- input_resource_id="com.example:id/text_input",
250
- input_coordinates=None,
251
- input_text=None,
259
+ target = Target(
260
+ resource_id="com.example:id/text_input",
261
+ resource_id_index=None,
262
+ text=None,
263
+ text_index=None,
264
+ coordinates=None,
252
265
  )
266
+ result = focus_element_if_needed(ctx=mock_context, target=target)
253
267
 
254
268
  mock_tap.assert_called_once_with(
255
269
  ctx=mock_context,
256
270
  selector_request=IdSelectorRequest(id="com.example:id/text_input"),
271
+ index=0,
257
272
  )
258
273
  assert mock_context.hw_bridge_client.get_rich_hierarchy.call_count == 2
259
274
  assert result is True
@@ -268,25 +283,30 @@ class TestFocusElementIfNeeded:
268
283
  element_from_id = sample_rich_element["attributes"].copy()
269
284
  element_from_id["text"] = "Different text"
270
285
 
271
- # L'élément qui sera trouvé par le texte doit avoir des "bounds"
272
286
  element_from_text = sample_rich_element.copy()
273
- element_from_text["bounds"] = {"x": 10, "y": 20, "width": 100, "height": 30}
287
+ element_from_text["attributes"]["bounds"] = {
288
+ "x": 10,
289
+ "y": 20,
290
+ "width": 100,
291
+ "height": 30,
292
+ }
274
293
 
275
294
  mock_context.hw_bridge_client.get_rich_hierarchy.return_value = [element_from_text]
276
295
  mock_find_id.return_value = element_from_id
277
296
 
278
297
  with patch("minitap.mobile_use.tools.utils.find_element_by_text") as mock_find_text:
279
- mock_find_text.return_value = element_from_text # Trouvé par le texte
280
-
281
- result = focus_element_if_needed(
282
- ctx=mock_context,
283
- input_resource_id="com.example:id/text_input",
284
- input_coordinates=None,
285
- input_text="Sample text", # Le texte correct à rechercher
298
+ mock_find_text.return_value = element_from_text["attributes"]
299
+
300
+ target = Target(
301
+ resource_id="com.example:id/text_input",
302
+ resource_id_index=None,
303
+ text="Sample text",
304
+ text_index=None,
305
+ coordinates=None,
286
306
  )
307
+ result = focus_element_if_needed(ctx=mock_context, target=target)
287
308
 
288
309
  mock_logger.warning.assert_called_once()
289
- # Maintenant, tap devrait être appelé car l'élément trouvé a des "bounds"
290
310
  mock_tap.assert_called_once()
291
311
  assert result is True
292
312
 
@@ -296,26 +316,31 @@ class TestFocusElementIfNeeded:
296
316
  self, mock_find_text, mock_tap, mock_context, sample_rich_element
297
317
  ):
298
318
  """Test fallback to focusing using text."""
299
- # L'élément doit avoir des "bounds" au premier niveau pour
300
- # que get_bounds_for_element fonctionne
301
319
  element_with_bounds = sample_rich_element.copy()
302
- element_with_bounds["bounds"] = {"x": 10, "y": 20, "width": 100, "height": 30}
320
+ element_with_bounds["attributes"]["bounds"] = {
321
+ "x": 10,
322
+ "y": 20,
323
+ "width": 100,
324
+ "height": 30,
325
+ }
303
326
 
304
327
  mock_context.hw_bridge_client.get_rich_hierarchy.return_value = [element_with_bounds]
305
- mock_find_text.return_value = element_with_bounds
306
-
307
- result = focus_element_if_needed(
308
- ctx=mock_context,
309
- input_resource_id=None,
310
- input_coordinates=None,
311
- input_text="Sample text",
328
+ mock_find_text.return_value = element_with_bounds["attributes"]
329
+
330
+ target = Target(
331
+ resource_id=None,
332
+ resource_id_index=None,
333
+ text="Sample text",
334
+ text_index=None,
335
+ coordinates=None,
312
336
  )
337
+ result = focus_element_if_needed(ctx=mock_context, target=target)
313
338
 
314
339
  mock_find_text.assert_called_once()
315
340
  mock_tap.assert_called_once()
316
341
  call_args = mock_tap.call_args[1]
317
342
  selector = call_args["selector_request"]
318
- # Vérifie que le tap se fait bien au centre des "bounds"
343
+ assert isinstance(selector, SelectorRequestWithCoordinates)
319
344
  assert selector.coordinates.x == 60 # 10 + 100/2
320
345
  assert selector.coordinates.y == 35 # 20 + 30/2
321
346
  assert result is True
@@ -325,7 +350,6 @@ class TestFocusElementIfNeeded:
325
350
  """Test failure when no locator can find an element."""
326
351
  mock_context.hw_bridge_client.get_rich_hierarchy.return_value = []
327
352
 
328
- # Mock find_element functions to return None
329
353
  with (
330
354
  patch("minitap.mobile_use.tools.utils.find_element_by_resource_id") as mock_find_id,
331
355
  patch("minitap.mobile_use.tools.utils.find_element_by_text") as mock_find_text,
@@ -333,16 +357,18 @@ class TestFocusElementIfNeeded:
333
357
  mock_find_id.return_value = None
334
358
  mock_find_text.return_value = None
335
359
 
336
- result = focus_element_if_needed(
337
- ctx=mock_context,
338
- input_resource_id="nonexistent",
339
- input_coordinates=None,
340
- input_text="nonexistent",
360
+ target = Target(
361
+ resource_id="nonexistent",
362
+ resource_id_index=None,
363
+ text="nonexistent",
364
+ text_index=None,
365
+ coordinates=None,
341
366
  )
367
+ result = focus_element_if_needed(ctx=mock_context, target=target)
342
368
 
343
369
  mock_logger.error.assert_called_once_with(
344
- "Failed to focus element. No valid locator"
345
- + "(resource_id, coordinates, or text) succeeded."
370
+ "Failed to focus element."
371
+ + " No valid locator (resource_id, coordinates, or text) succeeded."
346
372
  )
347
373
  assert result is False
348
374
 
@@ -0,0 +1,35 @@
1
+ from pydantic import BaseModel, Field, model_validator
2
+
3
+ from minitap.mobile_use.utils.ui_hierarchy import ElementBounds
4
+
5
+
6
+ class Target(BaseModel):
7
+ """
8
+ A comprehensive locator for a UI element, supporting a fallback mechanism.
9
+ """
10
+
11
+ resource_id: str | None = Field(None, description="The resource-id of the element.")
12
+ resource_id_index: int | None = Field(
13
+ None,
14
+ description="The zero-based index if multiple elements share the same resource-id.",
15
+ )
16
+ text: str | None = Field(
17
+ None, description="The text content of the element (e.g., a label or placeholder)."
18
+ )
19
+ text_index: int | None = Field(
20
+ None, description="The zero-based index if multiple elements share the same text."
21
+ )
22
+ coordinates: ElementBounds | None = Field(
23
+ None, description="The x, y, width, and height of the element."
24
+ )
25
+
26
+ @model_validator(mode="after")
27
+ def _default_indices(self):
28
+ # Treat empty strings like “not provided”
29
+ if (
30
+ self.resource_id is not None and self.resource_id != ""
31
+ ) and self.resource_id_index is None:
32
+ self.resource_id_index = 0
33
+ if (self.text is not None and self.text != "") and self.text_index is None:
34
+ self.text_index = 0
35
+ return self
@@ -8,6 +8,7 @@ from minitap.mobile_use.controllers.mobile_command_controller import (
8
8
  tap,
9
9
  )
10
10
  from minitap.mobile_use.graph.state import State
11
+ from minitap.mobile_use.tools.types import Target
11
12
  from minitap.mobile_use.utils.logger import get_logger
12
13
  from minitap.mobile_use.utils.ui_hierarchy import (
13
14
  ElementBounds,
@@ -21,7 +22,9 @@ from minitap.mobile_use.utils.ui_hierarchy import (
21
22
  logger = get_logger(__name__)
22
23
 
23
24
 
24
- def find_element_by_text(ui_hierarchy: list[dict], text: str) -> dict | None:
25
+ def find_element_by_text(
26
+ ui_hierarchy: list[dict], text: str, index: int | None = None
27
+ ) -> dict | None:
25
28
  """
26
29
  Find a UI element by its text content (adapted to both flat and rich hierarchy)
27
30
 
@@ -40,7 +43,11 @@ def find_element_by_text(ui_hierarchy: list[dict], text: str) -> dict | None:
40
43
  if isinstance(element, dict):
41
44
  src = element.get("attributes", element)
42
45
  if text and text.lower() == src.get("text", "").lower():
43
- return element
46
+ idx = index or 0
47
+ if idx == 0:
48
+ return element
49
+ idx -= 1
50
+ continue
44
51
  if (children := element.get("children", [])) and (
45
52
  found := search_recursive(children)
46
53
  ):
@@ -66,23 +73,22 @@ def tap_bottom_right_of_element(bounds: ElementBounds, ctx: MobileUseContext):
66
73
  def move_cursor_to_end_if_bounds(
67
74
  ctx: MobileUseContext,
68
75
  state: State,
69
- text_input_resource_id: str | None,
70
- text_input_coordinates: ElementBounds | None,
71
- text_input_text: str | None,
76
+ target: Target,
72
77
  elt: dict | None = None,
73
78
  ) -> dict | None:
74
79
  """
75
80
  Best-effort move of the text cursor near the end of the input by tapping the
76
81
  bottom-right area of the focused element (if bounds are available).
77
82
  """
78
- if text_input_resource_id:
83
+ if target.resource_id:
79
84
  if not elt:
80
85
  elt = find_element_by_resource_id(
81
86
  ui_hierarchy=state.latest_ui_hierarchy or [],
82
- resource_id=text_input_resource_id,
87
+ resource_id=target.resource_id,
88
+ index=target.resource_id_index,
83
89
  )
84
90
  if not elt:
85
- return
91
+ return None
86
92
 
87
93
  bounds = get_bounds_for_element(elt)
88
94
  if not bounds:
@@ -90,86 +96,85 @@ def move_cursor_to_end_if_bounds(
90
96
 
91
97
  logger.debug("Tapping near the end of the input to move the cursor")
92
98
  tap_bottom_right_of_element(bounds=bounds, ctx=ctx)
93
- logger.debug(f"Tapped end of input {text_input_resource_id}")
99
+ logger.debug(f"Tapped end of input {target.resource_id}")
94
100
  return elt
95
101
 
96
- if text_input_coordinates:
97
- tap_bottom_right_of_element(text_input_coordinates, ctx=ctx)
102
+ if target.coordinates:
103
+ tap_bottom_right_of_element(target.coordinates, ctx=ctx)
98
104
  logger.debug("Tapped end of input by coordinates")
99
105
  return elt
100
106
 
101
- if text_input_text:
102
- text_elt = find_element_by_text(state.latest_ui_hierarchy or [], text_input_text)
107
+ if target.text:
108
+ text_elt = find_element_by_text(
109
+ state.latest_ui_hierarchy or [], target.text, index=target.text_index
110
+ )
103
111
  if text_elt:
104
112
  bounds = get_bounds_for_element(text_elt)
105
113
  if bounds:
106
114
  tap_bottom_right_of_element(bounds=bounds, ctx=ctx)
107
- logger.debug(f"Tapped end of input that had text'{text_input_text}'")
115
+ logger.debug(f"Tapped end of input that had text'{target.text}'")
108
116
  return text_elt
109
117
  return None
110
118
 
111
119
  return None
112
120
 
113
121
 
114
- def focus_element_if_needed(
115
- ctx: MobileUseContext,
116
- input_resource_id: str | None,
117
- input_coordinates: ElementBounds | None,
118
- input_text: str | None,
119
- ) -> bool:
122
+ def focus_element_if_needed(ctx: MobileUseContext, target: Target) -> bool:
120
123
  """
121
124
  Ensures the element is focused, with a sanity check to prevent trusting misleading IDs.
122
125
  """
123
126
  rich_hierarchy = ctx.hw_bridge_client.get_rich_hierarchy()
124
-
125
127
  elt_from_id = None
126
- if input_resource_id:
128
+ if target.resource_id:
127
129
  elt_from_id = find_element_by_resource_id(
128
- ui_hierarchy=rich_hierarchy, resource_id=input_resource_id, is_rich_hierarchy=True
130
+ ui_hierarchy=rich_hierarchy,
131
+ resource_id=target.resource_id,
132
+ index=target.resource_id_index,
133
+ is_rich_hierarchy=True,
129
134
  )
130
135
 
131
- if elt_from_id and input_text:
136
+ if elt_from_id and target.text:
132
137
  text_from_id_elt = get_element_text(elt_from_id)
133
- if not text_from_id_elt or input_text.lower() != text_from_id_elt.lower():
138
+ if not text_from_id_elt or target.text.lower() != text_from_id_elt.lower():
134
139
  logger.warning(
135
- f"ID '{input_resource_id}' and text '{input_text}'"
136
- + "seem to be on different elements. "
137
- "Ignoring the resource_id and falling back to other locators."
140
+ f"ID '{target.resource_id}' and text '{target.text}' seem to be on different "
141
+ "elements. Ignoring the resource_id and falling back to other locators."
138
142
  )
139
143
  elt_from_id = None
140
144
 
141
145
  if elt_from_id:
142
146
  if not is_element_focused(elt_from_id):
143
- tap(ctx=ctx, selector_request=IdSelectorRequest(id=input_resource_id)) # type: ignore
144
- logger.debug(f"Focused (tap) on resource_id={input_resource_id}")
147
+ tap(
148
+ ctx=ctx,
149
+ selector_request=IdSelectorRequest(id=target.resource_id), # type: ignore
150
+ index=target.resource_id_index,
151
+ )
152
+ logger.debug(f"Focused (tap) on resource_id={target.resource_id}")
145
153
  rich_hierarchy = ctx.hw_bridge_client.get_rich_hierarchy()
146
154
  elt_from_id = find_element_by_resource_id(
147
155
  ui_hierarchy=rich_hierarchy,
148
- resource_id=input_resource_id, # type: ignore
156
+ resource_id=target.resource_id, # type: ignore
157
+ index=target.resource_id_index,
149
158
  is_rich_hierarchy=True,
150
159
  )
151
160
  if elt_from_id and is_element_focused(elt_from_id):
152
- logger.debug(f"Text input is focused: {input_resource_id}")
161
+ logger.debug(f"Text input is focused: {target.resource_id}")
153
162
  return True
163
+ logger.warning(f"Failed to focus using resource_id='{target.resource_id}'. Fallback...")
154
164
 
155
- logger.warning(f"Failed to focus using resource_id='{input_resource_id}'. Fallback...")
156
-
157
- if input_coordinates:
158
- relative_point = input_coordinates.get_center()
165
+ if target.coordinates:
166
+ relative_point = target.coordinates.get_center()
159
167
  tap(
160
168
  ctx=ctx,
161
169
  selector_request=SelectorRequestWithCoordinates(
162
- coordinates=CoordinatesSelectorRequest(
163
- x=relative_point.x,
164
- y=relative_point.y,
165
- ),
170
+ coordinates=CoordinatesSelectorRequest(x=relative_point.x, y=relative_point.y)
166
171
  ),
167
172
  )
168
173
  logger.debug(f"Tapped on coordinates ({relative_point.x}, {relative_point.y}) to focus.")
169
174
  return True
170
175
 
171
- if input_text:
172
- text_elt = find_element_by_text(rich_hierarchy, input_text)
176
+ if target.text:
177
+ text_elt = find_element_by_text(rich_hierarchy, target.text, index=target.text_index)
173
178
  if text_elt:
174
179
  bounds = get_bounds_for_element(text_elt)
175
180
  if bounds:
@@ -178,16 +183,14 @@ def focus_element_if_needed(
178
183
  ctx=ctx,
179
184
  selector_request=SelectorRequestWithCoordinates(
180
185
  coordinates=CoordinatesSelectorRequest(
181
- x=relative_point.x,
182
- y=relative_point.y,
183
- ),
186
+ x=relative_point.x, y=relative_point.y
187
+ )
184
188
  ),
185
189
  )
186
- logger.debug(f"Tapped on text element '{input_text}' to focus.")
190
+ logger.debug(f"Tapped on text element '{target.text}' to focus.")
187
191
  return True
188
192
 
189
193
  logger.error(
190
- "Failed to focus element. No valid locator"
191
- + "(resource_id, coordinates, or text) succeeded."
194
+ "Failed to focus element. No valid locator (resource_id, coordinates, or text) succeeded."
192
195
  )
193
196
  return False
@@ -25,7 +25,7 @@ def record_interaction(ctx: MobileUseContext, response: BaseMessage):
25
25
  logger.error(f"Error compressing screenshot: {e}")
26
26
  return "Could not record this interaction"
27
27
  timestamp = time.time()
28
- folder = ctx.execution_setup.traces_path.joinpath(ctx.execution_setup.trace_id).resolve()
28
+ folder = ctx.execution_setup.traces_path.joinpath(ctx.execution_setup.trace_name).resolve()
29
29
  folder.mkdir(parents=True, exist_ok=True)
30
30
  try:
31
31
  with open(
@@ -40,7 +40,10 @@ def text_input_is_empty(text: str | None, hint_text: str | None) -> bool:
40
40
 
41
41
 
42
42
  def find_element_by_resource_id(
43
- ui_hierarchy: list[dict], resource_id: str, is_rich_hierarchy: bool = False
43
+ ui_hierarchy: list[dict],
44
+ resource_id: str,
45
+ index: int | None = None,
46
+ is_rich_hierarchy: bool = False,
44
47
  ) -> dict | None:
45
48
  """
46
49
  Find a UI element by its resource-id in the UI hierarchy.
@@ -60,7 +63,11 @@ def find_element_by_resource_id(
60
63
  for element in elements:
61
64
  if isinstance(element, dict):
62
65
  if element.get("resourceId") == resource_id:
63
- return element
66
+ idx = index or 0
67
+ if idx == 0:
68
+ return element
69
+ idx -= 1
70
+ continue
64
71
 
65
72
  children = element.get("children", [])
66
73
  if children:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: minitap-mobile-use
3
- Version: 2.3.0
3
+ Version: 2.4.0
4
4
  Summary: AI-powered multi-agent system that automates real Android and iOS devices through low-level control using LangGraph.
5
5
  Author: Pierre-Louis Favreau, Jean-Pierre Lo, Nicolas Dehandschoewercker
6
6
  License: MIT License
@@ -43,9 +43,11 @@ Requires-Dist: uvicorn[standard]==0.30.1
43
43
  Requires-Dist: colorama>=0.4.6
44
44
  Requires-Dist: psutil>=5.9.0
45
45
  Requires-Dist: langchain-google-vertexai>=2.0.28
46
+ Requires-Dist: httpx>=0.28.1
46
47
  Requires-Dist: ruff==0.5.3 ; extra == 'dev'
47
48
  Requires-Dist: pytest==8.4.1 ; extra == 'dev'
48
49
  Requires-Dist: pytest-cov==5.0.0 ; extra == 'dev'
50
+ Requires-Dist: pyright==1.1.405 ; extra == 'dev'
49
51
  Requires-Python: >=3.12
50
52
  Project-URL: Homepage, https://minitap.ai/
51
53
  Project-URL: Source, https://github.com/minitap-ai/mobile-use