minitap-mobile-use 2.3.0__py3-none-any.whl → 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of minitap-mobile-use might be problematic. Click here for more details.
- minitap/mobile_use/agents/contextor/contextor.py +2 -2
- minitap/mobile_use/agents/cortex/cortex.md +49 -8
- minitap/mobile_use/agents/cortex/cortex.py +8 -4
- minitap/mobile_use/agents/executor/executor.md +14 -11
- minitap/mobile_use/agents/executor/executor.py +6 -5
- minitap/mobile_use/agents/hopper/hopper.py +6 -3
- minitap/mobile_use/agents/orchestrator/orchestrator.py +26 -11
- minitap/mobile_use/agents/outputter/outputter.py +6 -3
- minitap/mobile_use/agents/planner/planner.md +20 -22
- minitap/mobile_use/agents/planner/planner.py +10 -7
- minitap/mobile_use/agents/planner/types.py +4 -2
- minitap/mobile_use/agents/planner/utils.py +14 -0
- minitap/mobile_use/agents/summarizer/summarizer.py +2 -2
- minitap/mobile_use/config.py +6 -1
- minitap/mobile_use/context.py +13 -3
- minitap/mobile_use/controllers/mobile_command_controller.py +1 -14
- minitap/mobile_use/graph/state.py +7 -3
- minitap/mobile_use/sdk/agent.py +188 -23
- minitap/mobile_use/sdk/examples/README.md +19 -1
- minitap/mobile_use/sdk/examples/platform_manual_task_example.py +65 -0
- minitap/mobile_use/sdk/examples/platform_minimal_example.py +46 -0
- minitap/mobile_use/sdk/services/platform.py +307 -0
- minitap/mobile_use/sdk/types/__init__.py +16 -14
- minitap/mobile_use/sdk/types/exceptions.py +27 -0
- minitap/mobile_use/sdk/types/platform.py +127 -0
- minitap/mobile_use/sdk/types/task.py +78 -17
- minitap/mobile_use/servers/device_hardware_bridge.py +1 -1
- minitap/mobile_use/servers/stop_servers.py +11 -12
- minitap/mobile_use/services/llm.py +89 -5
- minitap/mobile_use/tools/index.py +0 -6
- minitap/mobile_use/tools/mobile/back.py +3 -3
- minitap/mobile_use/tools/mobile/clear_text.py +24 -43
- minitap/mobile_use/tools/mobile/erase_one_char.py +5 -4
- minitap/mobile_use/tools/mobile/glimpse_screen.py +11 -7
- minitap/mobile_use/tools/mobile/input_text.py +21 -51
- minitap/mobile_use/tools/mobile/launch_app.py +54 -22
- minitap/mobile_use/tools/mobile/long_press_on.py +15 -8
- minitap/mobile_use/tools/mobile/open_link.py +15 -8
- minitap/mobile_use/tools/mobile/press_key.py +15 -8
- minitap/mobile_use/tools/mobile/stop_app.py +14 -8
- minitap/mobile_use/tools/mobile/swipe.py +11 -5
- minitap/mobile_use/tools/mobile/tap.py +103 -21
- minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +3 -3
- minitap/mobile_use/tools/test_utils.py +104 -78
- minitap/mobile_use/tools/types.py +35 -0
- minitap/mobile_use/tools/utils.py +51 -48
- minitap/mobile_use/utils/recorder.py +1 -1
- minitap/mobile_use/utils/ui_hierarchy.py +9 -2
- {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.5.0.dist-info}/METADATA +3 -1
- minitap_mobile_use-2.5.0.dist-info/RECORD +100 -0
- minitap/mobile_use/tools/mobile/copy_text_from.py +0 -75
- minitap/mobile_use/tools/mobile/find_packages.py +0 -69
- minitap/mobile_use/tools/mobile/paste_text.py +0 -88
- minitap_mobile_use-2.3.0.dist-info/RECORD +0 -98
- {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.5.0.dist-info}/WHEEL +0 -0
- {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.5.0.dist-info}/entry_points.txt +0 -0
|
@@ -12,6 +12,7 @@ from minitap.mobile_use.controllers.mobile_command_controller import ( # noqa:
|
|
|
12
12
|
IdSelectorRequest,
|
|
13
13
|
SelectorRequestWithCoordinates,
|
|
14
14
|
)
|
|
15
|
+
from minitap.mobile_use.tools.types import Target # noqa: E402
|
|
15
16
|
from minitap.mobile_use.tools.utils import ( # noqa: E402
|
|
16
17
|
focus_element_if_needed,
|
|
17
18
|
move_cursor_to_end_if_bounds,
|
|
@@ -54,6 +55,7 @@ def sample_rich_element():
|
|
|
54
55
|
"resource-id": "com.example:id/text_input",
|
|
55
56
|
"focused": "false",
|
|
56
57
|
"text": "Sample text",
|
|
58
|
+
"bounds": {"x": 100, "y": 200, "width": 300, "height": 50},
|
|
57
59
|
},
|
|
58
60
|
"children": [],
|
|
59
61
|
}
|
|
@@ -71,16 +73,19 @@ class TestMoveCursorToEndIfBounds:
|
|
|
71
73
|
mock_state.latest_ui_hierarchy = [sample_element]
|
|
72
74
|
mock_find_element.return_value = sample_element
|
|
73
75
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
76
|
+
target = Target(
|
|
77
|
+
resource_id="com.example:id/text_input",
|
|
78
|
+
resource_id_index=None,
|
|
79
|
+
text=None,
|
|
80
|
+
text_index=None,
|
|
81
|
+
coordinates=None,
|
|
80
82
|
)
|
|
83
|
+
result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
|
|
81
84
|
|
|
82
85
|
mock_find_element.assert_called_once_with(
|
|
83
|
-
ui_hierarchy=[sample_element],
|
|
86
|
+
ui_hierarchy=[sample_element],
|
|
87
|
+
resource_id="com.example:id/text_input",
|
|
88
|
+
index=0,
|
|
84
89
|
)
|
|
85
90
|
mock_tap.assert_called_once()
|
|
86
91
|
call_args = mock_tap.call_args[1]
|
|
@@ -98,15 +103,16 @@ class TestMoveCursorToEndIfBounds:
|
|
|
98
103
|
):
|
|
99
104
|
"""Test moving cursor when only coordinates are provided."""
|
|
100
105
|
bounds = ElementBounds(x=50, y=150, width=200, height=40)
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
text_input_text=None,
|
|
106
|
+
target = Target(
|
|
107
|
+
resource_id=None,
|
|
108
|
+
resource_id_index=None,
|
|
109
|
+
text=None,
|
|
110
|
+
text_index=None,
|
|
111
|
+
coordinates=bounds,
|
|
108
112
|
)
|
|
109
113
|
|
|
114
|
+
result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
|
|
115
|
+
|
|
110
116
|
mock_find_element.assert_not_called()
|
|
111
117
|
mock_tap.assert_called_once()
|
|
112
118
|
call_args = mock_tap.call_args[1]
|
|
@@ -125,15 +131,16 @@ class TestMoveCursorToEndIfBounds:
|
|
|
125
131
|
mock_state.latest_ui_hierarchy = [sample_element]
|
|
126
132
|
mock_find_text.return_value = sample_element
|
|
127
133
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
+
target = Target(
|
|
135
|
+
resource_id=None,
|
|
136
|
+
resource_id_index=None,
|
|
137
|
+
text="Sample text",
|
|
138
|
+
text_index=0,
|
|
139
|
+
coordinates=None,
|
|
134
140
|
)
|
|
141
|
+
result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
|
|
135
142
|
|
|
136
|
-
mock_find_text.assert_called_once_with([sample_element], "Sample text")
|
|
143
|
+
mock_find_text.assert_called_once_with([sample_element], "Sample text", index=0)
|
|
137
144
|
mock_tap.assert_called_once()
|
|
138
145
|
assert result == sample_element
|
|
139
146
|
|
|
@@ -146,13 +153,14 @@ class TestMoveCursorToEndIfBounds:
|
|
|
146
153
|
mock_state.latest_ui_hierarchy = []
|
|
147
154
|
mock_find_text.return_value = None
|
|
148
155
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
156
|
+
target = Target(
|
|
157
|
+
resource_id=None,
|
|
158
|
+
resource_id_index=None,
|
|
159
|
+
text="Nonexistent text",
|
|
160
|
+
text_index=None,
|
|
161
|
+
coordinates=None,
|
|
155
162
|
)
|
|
163
|
+
result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
|
|
156
164
|
|
|
157
165
|
mock_tap.assert_not_called()
|
|
158
166
|
assert result is None
|
|
@@ -167,13 +175,14 @@ class TestMoveCursorToEndIfBounds:
|
|
|
167
175
|
mock_state.latest_ui_hierarchy = [element_no_bounds]
|
|
168
176
|
mock_find_text.return_value = element_no_bounds
|
|
169
177
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
178
|
+
target = Target(
|
|
179
|
+
resource_id=None,
|
|
180
|
+
resource_id_index=None,
|
|
181
|
+
text="Text without bounds",
|
|
182
|
+
text_index=None,
|
|
183
|
+
coordinates=None,
|
|
176
184
|
)
|
|
185
|
+
result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
|
|
177
186
|
|
|
178
187
|
mock_tap.assert_not_called()
|
|
179
188
|
assert result is None # Should return None as no action was taken
|
|
@@ -183,13 +192,14 @@ class TestMoveCursorToEndIfBounds:
|
|
|
183
192
|
"""Test when element is not found by resource_id."""
|
|
184
193
|
mock_find_element.return_value = None
|
|
185
194
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
195
|
+
target = Target(
|
|
196
|
+
resource_id="com.example:id/nonexistent",
|
|
197
|
+
resource_id_index=None,
|
|
198
|
+
text=None,
|
|
199
|
+
text_index=None,
|
|
200
|
+
coordinates=None,
|
|
192
201
|
)
|
|
202
|
+
result = move_cursor_to_end_if_bounds(ctx=mock_context, state=mock_state, target=target)
|
|
193
203
|
|
|
194
204
|
assert result is None
|
|
195
205
|
|
|
@@ -209,12 +219,14 @@ class TestFocusElementIfNeeded:
|
|
|
209
219
|
mock_context.hw_bridge_client.get_rich_hierarchy.return_value = [focused_element]
|
|
210
220
|
mock_find_element.return_value = focused_element["attributes"]
|
|
211
221
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
222
|
+
target = Target(
|
|
223
|
+
resource_id="com.example:id/text_input",
|
|
224
|
+
resource_id_index=None,
|
|
225
|
+
text=None,
|
|
226
|
+
text_index=None,
|
|
227
|
+
coordinates=None,
|
|
217
228
|
)
|
|
229
|
+
result = focus_element_if_needed(ctx=mock_context, target=target)
|
|
218
230
|
|
|
219
231
|
mock_tap.assert_not_called()
|
|
220
232
|
assert result is True
|
|
@@ -244,16 +256,19 @@ class TestFocusElementIfNeeded:
|
|
|
244
256
|
focused_element["attributes"],
|
|
245
257
|
]
|
|
246
258
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
259
|
+
target = Target(
|
|
260
|
+
resource_id="com.example:id/text_input",
|
|
261
|
+
resource_id_index=None,
|
|
262
|
+
text=None,
|
|
263
|
+
text_index=None,
|
|
264
|
+
coordinates=None,
|
|
252
265
|
)
|
|
266
|
+
result = focus_element_if_needed(ctx=mock_context, target=target)
|
|
253
267
|
|
|
254
268
|
mock_tap.assert_called_once_with(
|
|
255
269
|
ctx=mock_context,
|
|
256
270
|
selector_request=IdSelectorRequest(id="com.example:id/text_input"),
|
|
271
|
+
index=0,
|
|
257
272
|
)
|
|
258
273
|
assert mock_context.hw_bridge_client.get_rich_hierarchy.call_count == 2
|
|
259
274
|
assert result is True
|
|
@@ -268,25 +283,30 @@ class TestFocusElementIfNeeded:
|
|
|
268
283
|
element_from_id = sample_rich_element["attributes"].copy()
|
|
269
284
|
element_from_id["text"] = "Different text"
|
|
270
285
|
|
|
271
|
-
# L'élément qui sera trouvé par le texte doit avoir des "bounds"
|
|
272
286
|
element_from_text = sample_rich_element.copy()
|
|
273
|
-
element_from_text["bounds"] = {
|
|
287
|
+
element_from_text["attributes"]["bounds"] = {
|
|
288
|
+
"x": 10,
|
|
289
|
+
"y": 20,
|
|
290
|
+
"width": 100,
|
|
291
|
+
"height": 30,
|
|
292
|
+
}
|
|
274
293
|
|
|
275
294
|
mock_context.hw_bridge_client.get_rich_hierarchy.return_value = [element_from_text]
|
|
276
295
|
mock_find_id.return_value = element_from_id
|
|
277
296
|
|
|
278
297
|
with patch("minitap.mobile_use.tools.utils.find_element_by_text") as mock_find_text:
|
|
279
|
-
mock_find_text.return_value = element_from_text
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
298
|
+
mock_find_text.return_value = element_from_text["attributes"]
|
|
299
|
+
|
|
300
|
+
target = Target(
|
|
301
|
+
resource_id="com.example:id/text_input",
|
|
302
|
+
resource_id_index=None,
|
|
303
|
+
text="Sample text",
|
|
304
|
+
text_index=None,
|
|
305
|
+
coordinates=None,
|
|
286
306
|
)
|
|
307
|
+
result = focus_element_if_needed(ctx=mock_context, target=target)
|
|
287
308
|
|
|
288
309
|
mock_logger.warning.assert_called_once()
|
|
289
|
-
# Maintenant, tap devrait être appelé car l'élément trouvé a des "bounds"
|
|
290
310
|
mock_tap.assert_called_once()
|
|
291
311
|
assert result is True
|
|
292
312
|
|
|
@@ -296,26 +316,31 @@ class TestFocusElementIfNeeded:
|
|
|
296
316
|
self, mock_find_text, mock_tap, mock_context, sample_rich_element
|
|
297
317
|
):
|
|
298
318
|
"""Test fallback to focusing using text."""
|
|
299
|
-
# L'élément doit avoir des "bounds" au premier niveau pour
|
|
300
|
-
# que get_bounds_for_element fonctionne
|
|
301
319
|
element_with_bounds = sample_rich_element.copy()
|
|
302
|
-
element_with_bounds["bounds"] = {
|
|
320
|
+
element_with_bounds["attributes"]["bounds"] = {
|
|
321
|
+
"x": 10,
|
|
322
|
+
"y": 20,
|
|
323
|
+
"width": 100,
|
|
324
|
+
"height": 30,
|
|
325
|
+
}
|
|
303
326
|
|
|
304
327
|
mock_context.hw_bridge_client.get_rich_hierarchy.return_value = [element_with_bounds]
|
|
305
|
-
mock_find_text.return_value = element_with_bounds
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
328
|
+
mock_find_text.return_value = element_with_bounds["attributes"]
|
|
329
|
+
|
|
330
|
+
target = Target(
|
|
331
|
+
resource_id=None,
|
|
332
|
+
resource_id_index=None,
|
|
333
|
+
text="Sample text",
|
|
334
|
+
text_index=None,
|
|
335
|
+
coordinates=None,
|
|
312
336
|
)
|
|
337
|
+
result = focus_element_if_needed(ctx=mock_context, target=target)
|
|
313
338
|
|
|
314
339
|
mock_find_text.assert_called_once()
|
|
315
340
|
mock_tap.assert_called_once()
|
|
316
341
|
call_args = mock_tap.call_args[1]
|
|
317
342
|
selector = call_args["selector_request"]
|
|
318
|
-
|
|
343
|
+
assert isinstance(selector, SelectorRequestWithCoordinates)
|
|
319
344
|
assert selector.coordinates.x == 60 # 10 + 100/2
|
|
320
345
|
assert selector.coordinates.y == 35 # 20 + 30/2
|
|
321
346
|
assert result is True
|
|
@@ -325,7 +350,6 @@ class TestFocusElementIfNeeded:
|
|
|
325
350
|
"""Test failure when no locator can find an element."""
|
|
326
351
|
mock_context.hw_bridge_client.get_rich_hierarchy.return_value = []
|
|
327
352
|
|
|
328
|
-
# Mock find_element functions to return None
|
|
329
353
|
with (
|
|
330
354
|
patch("minitap.mobile_use.tools.utils.find_element_by_resource_id") as mock_find_id,
|
|
331
355
|
patch("minitap.mobile_use.tools.utils.find_element_by_text") as mock_find_text,
|
|
@@ -333,16 +357,18 @@ class TestFocusElementIfNeeded:
|
|
|
333
357
|
mock_find_id.return_value = None
|
|
334
358
|
mock_find_text.return_value = None
|
|
335
359
|
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
360
|
+
target = Target(
|
|
361
|
+
resource_id="nonexistent",
|
|
362
|
+
resource_id_index=None,
|
|
363
|
+
text="nonexistent",
|
|
364
|
+
text_index=None,
|
|
365
|
+
coordinates=None,
|
|
341
366
|
)
|
|
367
|
+
result = focus_element_if_needed(ctx=mock_context, target=target)
|
|
342
368
|
|
|
343
369
|
mock_logger.error.assert_called_once_with(
|
|
344
|
-
"Failed to focus element. No valid locator"
|
|
345
|
-
+ "(resource_id, coordinates, or text) succeeded."
|
|
370
|
+
"Failed to focus element."
|
|
371
|
+
+ " No valid locator (resource_id, coordinates, or text) succeeded."
|
|
346
372
|
)
|
|
347
373
|
assert result is False
|
|
348
374
|
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field, model_validator
|
|
2
|
+
|
|
3
|
+
from minitap.mobile_use.utils.ui_hierarchy import ElementBounds
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Target(BaseModel):
|
|
7
|
+
"""
|
|
8
|
+
A comprehensive locator for a UI element, supporting a fallback mechanism.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
resource_id: str | None = Field(None, description="The resource-id of the element.")
|
|
12
|
+
resource_id_index: int | None = Field(
|
|
13
|
+
None,
|
|
14
|
+
description="The zero-based index if multiple elements share the same resource-id.",
|
|
15
|
+
)
|
|
16
|
+
text: str | None = Field(
|
|
17
|
+
None, description="The text content of the element (e.g., a label or placeholder)."
|
|
18
|
+
)
|
|
19
|
+
text_index: int | None = Field(
|
|
20
|
+
None, description="The zero-based index if multiple elements share the same text."
|
|
21
|
+
)
|
|
22
|
+
coordinates: ElementBounds | None = Field(
|
|
23
|
+
None, description="The x, y, width, and height of the element."
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
@model_validator(mode="after")
|
|
27
|
+
def _default_indices(self):
|
|
28
|
+
# Treat empty strings like “not provided”
|
|
29
|
+
if (
|
|
30
|
+
self.resource_id is not None and self.resource_id != ""
|
|
31
|
+
) and self.resource_id_index is None:
|
|
32
|
+
self.resource_id_index = 0
|
|
33
|
+
if (self.text is not None and self.text != "") and self.text_index is None:
|
|
34
|
+
self.text_index = 0
|
|
35
|
+
return self
|
|
@@ -8,6 +8,7 @@ from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
|
8
8
|
tap,
|
|
9
9
|
)
|
|
10
10
|
from minitap.mobile_use.graph.state import State
|
|
11
|
+
from minitap.mobile_use.tools.types import Target
|
|
11
12
|
from minitap.mobile_use.utils.logger import get_logger
|
|
12
13
|
from minitap.mobile_use.utils.ui_hierarchy import (
|
|
13
14
|
ElementBounds,
|
|
@@ -21,7 +22,9 @@ from minitap.mobile_use.utils.ui_hierarchy import (
|
|
|
21
22
|
logger = get_logger(__name__)
|
|
22
23
|
|
|
23
24
|
|
|
24
|
-
def find_element_by_text(
|
|
25
|
+
def find_element_by_text(
|
|
26
|
+
ui_hierarchy: list[dict], text: str, index: int | None = None
|
|
27
|
+
) -> dict | None:
|
|
25
28
|
"""
|
|
26
29
|
Find a UI element by its text content (adapted to both flat and rich hierarchy)
|
|
27
30
|
|
|
@@ -40,7 +43,11 @@ def find_element_by_text(ui_hierarchy: list[dict], text: str) -> dict | None:
|
|
|
40
43
|
if isinstance(element, dict):
|
|
41
44
|
src = element.get("attributes", element)
|
|
42
45
|
if text and text.lower() == src.get("text", "").lower():
|
|
43
|
-
|
|
46
|
+
idx = index or 0
|
|
47
|
+
if idx == 0:
|
|
48
|
+
return element
|
|
49
|
+
idx -= 1
|
|
50
|
+
continue
|
|
44
51
|
if (children := element.get("children", [])) and (
|
|
45
52
|
found := search_recursive(children)
|
|
46
53
|
):
|
|
@@ -66,23 +73,22 @@ def tap_bottom_right_of_element(bounds: ElementBounds, ctx: MobileUseContext):
|
|
|
66
73
|
def move_cursor_to_end_if_bounds(
|
|
67
74
|
ctx: MobileUseContext,
|
|
68
75
|
state: State,
|
|
69
|
-
|
|
70
|
-
text_input_coordinates: ElementBounds | None,
|
|
71
|
-
text_input_text: str | None,
|
|
76
|
+
target: Target,
|
|
72
77
|
elt: dict | None = None,
|
|
73
78
|
) -> dict | None:
|
|
74
79
|
"""
|
|
75
80
|
Best-effort move of the text cursor near the end of the input by tapping the
|
|
76
81
|
bottom-right area of the focused element (if bounds are available).
|
|
77
82
|
"""
|
|
78
|
-
if
|
|
83
|
+
if target.resource_id:
|
|
79
84
|
if not elt:
|
|
80
85
|
elt = find_element_by_resource_id(
|
|
81
86
|
ui_hierarchy=state.latest_ui_hierarchy or [],
|
|
82
|
-
resource_id=
|
|
87
|
+
resource_id=target.resource_id,
|
|
88
|
+
index=target.resource_id_index,
|
|
83
89
|
)
|
|
84
90
|
if not elt:
|
|
85
|
-
return
|
|
91
|
+
return None
|
|
86
92
|
|
|
87
93
|
bounds = get_bounds_for_element(elt)
|
|
88
94
|
if not bounds:
|
|
@@ -90,86 +96,85 @@ def move_cursor_to_end_if_bounds(
|
|
|
90
96
|
|
|
91
97
|
logger.debug("Tapping near the end of the input to move the cursor")
|
|
92
98
|
tap_bottom_right_of_element(bounds=bounds, ctx=ctx)
|
|
93
|
-
logger.debug(f"Tapped end of input {
|
|
99
|
+
logger.debug(f"Tapped end of input {target.resource_id}")
|
|
94
100
|
return elt
|
|
95
101
|
|
|
96
|
-
if
|
|
97
|
-
tap_bottom_right_of_element(
|
|
102
|
+
if target.coordinates:
|
|
103
|
+
tap_bottom_right_of_element(target.coordinates, ctx=ctx)
|
|
98
104
|
logger.debug("Tapped end of input by coordinates")
|
|
99
105
|
return elt
|
|
100
106
|
|
|
101
|
-
if
|
|
102
|
-
text_elt = find_element_by_text(
|
|
107
|
+
if target.text:
|
|
108
|
+
text_elt = find_element_by_text(
|
|
109
|
+
state.latest_ui_hierarchy or [], target.text, index=target.text_index
|
|
110
|
+
)
|
|
103
111
|
if text_elt:
|
|
104
112
|
bounds = get_bounds_for_element(text_elt)
|
|
105
113
|
if bounds:
|
|
106
114
|
tap_bottom_right_of_element(bounds=bounds, ctx=ctx)
|
|
107
|
-
logger.debug(f"Tapped end of input that had text'{
|
|
115
|
+
logger.debug(f"Tapped end of input that had text'{target.text}'")
|
|
108
116
|
return text_elt
|
|
109
117
|
return None
|
|
110
118
|
|
|
111
119
|
return None
|
|
112
120
|
|
|
113
121
|
|
|
114
|
-
def focus_element_if_needed(
|
|
115
|
-
ctx: MobileUseContext,
|
|
116
|
-
input_resource_id: str | None,
|
|
117
|
-
input_coordinates: ElementBounds | None,
|
|
118
|
-
input_text: str | None,
|
|
119
|
-
) -> bool:
|
|
122
|
+
def focus_element_if_needed(ctx: MobileUseContext, target: Target) -> bool:
|
|
120
123
|
"""
|
|
121
124
|
Ensures the element is focused, with a sanity check to prevent trusting misleading IDs.
|
|
122
125
|
"""
|
|
123
126
|
rich_hierarchy = ctx.hw_bridge_client.get_rich_hierarchy()
|
|
124
|
-
|
|
125
127
|
elt_from_id = None
|
|
126
|
-
if
|
|
128
|
+
if target.resource_id:
|
|
127
129
|
elt_from_id = find_element_by_resource_id(
|
|
128
|
-
ui_hierarchy=rich_hierarchy,
|
|
130
|
+
ui_hierarchy=rich_hierarchy,
|
|
131
|
+
resource_id=target.resource_id,
|
|
132
|
+
index=target.resource_id_index,
|
|
133
|
+
is_rich_hierarchy=True,
|
|
129
134
|
)
|
|
130
135
|
|
|
131
|
-
if elt_from_id and
|
|
136
|
+
if elt_from_id and target.text:
|
|
132
137
|
text_from_id_elt = get_element_text(elt_from_id)
|
|
133
|
-
if not text_from_id_elt or
|
|
138
|
+
if not text_from_id_elt or target.text.lower() != text_from_id_elt.lower():
|
|
134
139
|
logger.warning(
|
|
135
|
-
f"ID '{
|
|
136
|
-
|
|
137
|
-
"Ignoring the resource_id and falling back to other locators."
|
|
140
|
+
f"ID '{target.resource_id}' and text '{target.text}' seem to be on different "
|
|
141
|
+
"elements. Ignoring the resource_id and falling back to other locators."
|
|
138
142
|
)
|
|
139
143
|
elt_from_id = None
|
|
140
144
|
|
|
141
145
|
if elt_from_id:
|
|
142
146
|
if not is_element_focused(elt_from_id):
|
|
143
|
-
tap(
|
|
144
|
-
|
|
147
|
+
tap(
|
|
148
|
+
ctx=ctx,
|
|
149
|
+
selector_request=IdSelectorRequest(id=target.resource_id), # type: ignore
|
|
150
|
+
index=target.resource_id_index,
|
|
151
|
+
)
|
|
152
|
+
logger.debug(f"Focused (tap) on resource_id={target.resource_id}")
|
|
145
153
|
rich_hierarchy = ctx.hw_bridge_client.get_rich_hierarchy()
|
|
146
154
|
elt_from_id = find_element_by_resource_id(
|
|
147
155
|
ui_hierarchy=rich_hierarchy,
|
|
148
|
-
resource_id=
|
|
156
|
+
resource_id=target.resource_id, # type: ignore
|
|
157
|
+
index=target.resource_id_index,
|
|
149
158
|
is_rich_hierarchy=True,
|
|
150
159
|
)
|
|
151
160
|
if elt_from_id and is_element_focused(elt_from_id):
|
|
152
|
-
logger.debug(f"Text input is focused: {
|
|
161
|
+
logger.debug(f"Text input is focused: {target.resource_id}")
|
|
153
162
|
return True
|
|
163
|
+
logger.warning(f"Failed to focus using resource_id='{target.resource_id}'. Fallback...")
|
|
154
164
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
if input_coordinates:
|
|
158
|
-
relative_point = input_coordinates.get_center()
|
|
165
|
+
if target.coordinates:
|
|
166
|
+
relative_point = target.coordinates.get_center()
|
|
159
167
|
tap(
|
|
160
168
|
ctx=ctx,
|
|
161
169
|
selector_request=SelectorRequestWithCoordinates(
|
|
162
|
-
coordinates=CoordinatesSelectorRequest(
|
|
163
|
-
x=relative_point.x,
|
|
164
|
-
y=relative_point.y,
|
|
165
|
-
),
|
|
170
|
+
coordinates=CoordinatesSelectorRequest(x=relative_point.x, y=relative_point.y)
|
|
166
171
|
),
|
|
167
172
|
)
|
|
168
173
|
logger.debug(f"Tapped on coordinates ({relative_point.x}, {relative_point.y}) to focus.")
|
|
169
174
|
return True
|
|
170
175
|
|
|
171
|
-
if
|
|
172
|
-
text_elt = find_element_by_text(rich_hierarchy,
|
|
176
|
+
if target.text:
|
|
177
|
+
text_elt = find_element_by_text(rich_hierarchy, target.text, index=target.text_index)
|
|
173
178
|
if text_elt:
|
|
174
179
|
bounds = get_bounds_for_element(text_elt)
|
|
175
180
|
if bounds:
|
|
@@ -178,16 +183,14 @@ def focus_element_if_needed(
|
|
|
178
183
|
ctx=ctx,
|
|
179
184
|
selector_request=SelectorRequestWithCoordinates(
|
|
180
185
|
coordinates=CoordinatesSelectorRequest(
|
|
181
|
-
x=relative_point.x,
|
|
182
|
-
|
|
183
|
-
),
|
|
186
|
+
x=relative_point.x, y=relative_point.y
|
|
187
|
+
)
|
|
184
188
|
),
|
|
185
189
|
)
|
|
186
|
-
logger.debug(f"Tapped on text element '{
|
|
190
|
+
logger.debug(f"Tapped on text element '{target.text}' to focus.")
|
|
187
191
|
return True
|
|
188
192
|
|
|
189
193
|
logger.error(
|
|
190
|
-
"Failed to focus element. No valid locator"
|
|
191
|
-
+ "(resource_id, coordinates, or text) succeeded."
|
|
194
|
+
"Failed to focus element. No valid locator (resource_id, coordinates, or text) succeeded."
|
|
192
195
|
)
|
|
193
196
|
return False
|
|
@@ -25,7 +25,7 @@ def record_interaction(ctx: MobileUseContext, response: BaseMessage):
|
|
|
25
25
|
logger.error(f"Error compressing screenshot: {e}")
|
|
26
26
|
return "Could not record this interaction"
|
|
27
27
|
timestamp = time.time()
|
|
28
|
-
folder = ctx.execution_setup.traces_path.joinpath(ctx.execution_setup.
|
|
28
|
+
folder = ctx.execution_setup.traces_path.joinpath(ctx.execution_setup.trace_name).resolve()
|
|
29
29
|
folder.mkdir(parents=True, exist_ok=True)
|
|
30
30
|
try:
|
|
31
31
|
with open(
|
|
@@ -40,7 +40,10 @@ def text_input_is_empty(text: str | None, hint_text: str | None) -> bool:
|
|
|
40
40
|
|
|
41
41
|
|
|
42
42
|
def find_element_by_resource_id(
|
|
43
|
-
ui_hierarchy: list[dict],
|
|
43
|
+
ui_hierarchy: list[dict],
|
|
44
|
+
resource_id: str,
|
|
45
|
+
index: int | None = None,
|
|
46
|
+
is_rich_hierarchy: bool = False,
|
|
44
47
|
) -> dict | None:
|
|
45
48
|
"""
|
|
46
49
|
Find a UI element by its resource-id in the UI hierarchy.
|
|
@@ -60,7 +63,11 @@ def find_element_by_resource_id(
|
|
|
60
63
|
for element in elements:
|
|
61
64
|
if isinstance(element, dict):
|
|
62
65
|
if element.get("resourceId") == resource_id:
|
|
63
|
-
|
|
66
|
+
idx = index or 0
|
|
67
|
+
if idx == 0:
|
|
68
|
+
return element
|
|
69
|
+
idx -= 1
|
|
70
|
+
continue
|
|
64
71
|
|
|
65
72
|
children = element.get("children", [])
|
|
66
73
|
if children:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: minitap-mobile-use
|
|
3
|
-
Version: 2.3.0
|
|
3
|
+
Version: 2.5.0
|
|
4
4
|
Summary: AI-powered multi-agent system that automates real Android and iOS devices through low-level control using LangGraph.
|
|
5
5
|
Author: Pierre-Louis Favreau, Jean-Pierre Lo, Nicolas Dehandschoewercker
|
|
6
6
|
License: MIT License
|
|
@@ -43,9 +43,11 @@ Requires-Dist: uvicorn[standard]==0.30.1
|
|
|
43
43
|
Requires-Dist: colorama>=0.4.6
|
|
44
44
|
Requires-Dist: psutil>=5.9.0
|
|
45
45
|
Requires-Dist: langchain-google-vertexai>=2.0.28
|
|
46
|
+
Requires-Dist: httpx>=0.28.1
|
|
46
47
|
Requires-Dist: ruff==0.5.3 ; extra == 'dev'
|
|
47
48
|
Requires-Dist: pytest==8.4.1 ; extra == 'dev'
|
|
48
49
|
Requires-Dist: pytest-cov==5.0.0 ; extra == 'dev'
|
|
50
|
+
Requires-Dist: pyright==1.1.405 ; extra == 'dev'
|
|
49
51
|
Requires-Python: >=3.12
|
|
50
52
|
Project-URL: Homepage, https://minitap.ai/
|
|
51
53
|
Project-URL: Source, https://github.com/minitap-ai/mobile-use
|