minitap-mobile-use 2.1.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of minitap-mobile-use might be problematic. More details are available on the original diff page.

Files changed (36)
  1. minitap/mobile_use/agents/contextor/contextor.py +4 -2
  2. minitap/mobile_use/agents/cortex/cortex.md +72 -26
  3. minitap/mobile_use/agents/cortex/cortex.py +1 -2
  4. minitap/mobile_use/agents/executor/executor.md +6 -4
  5. minitap/mobile_use/agents/executor/executor.py +3 -1
  6. minitap/mobile_use/agents/executor/utils.py +2 -1
  7. minitap/mobile_use/agents/outputter/test_outputter.py +104 -42
  8. minitap/mobile_use/agents/planner/planner.md +1 -1
  9. minitap/mobile_use/agents/planner/planner.py +4 -2
  10. minitap/mobile_use/config.py +16 -1
  11. minitap/mobile_use/controllers/mobile_command_controller.py +4 -4
  12. minitap/mobile_use/main.py +2 -2
  13. minitap/mobile_use/sdk/agent.py +17 -8
  14. minitap/mobile_use/sdk/builders/agent_config_builder.py +2 -2
  15. minitap/mobile_use/sdk/types/exceptions.py +30 -0
  16. minitap/mobile_use/sdk/utils.py +3 -2
  17. minitap/mobile_use/servers/device_hardware_bridge.py +2 -1
  18. minitap/mobile_use/servers/utils.py +6 -9
  19. minitap/mobile_use/services/llm.py +23 -6
  20. minitap/mobile_use/tools/index.py +21 -15
  21. minitap/mobile_use/tools/mobile/clear_text.py +73 -25
  22. minitap/mobile_use/tools/mobile/copy_text_from.py +7 -5
  23. minitap/mobile_use/tools/mobile/{take_screenshot.py → glimpse_screen.py} +15 -11
  24. minitap/mobile_use/tools/mobile/input_text.py +94 -13
  25. minitap/mobile_use/tools/mobile/paste_text.py +34 -8
  26. minitap/mobile_use/tools/mobile/swipe.py +107 -9
  27. minitap/mobile_use/tools/test_utils.py +351 -0
  28. minitap/mobile_use/tools/tool_wrapper.py +5 -0
  29. minitap/mobile_use/tools/utils.py +147 -40
  30. minitap/mobile_use/utils/recorder.py +2 -9
  31. minitap/mobile_use/utils/test_ui_hierarchy.py +178 -0
  32. minitap/mobile_use/utils/ui_hierarchy.py +2 -2
  33. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/METADATA +28 -8
  34. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/RECORD +36 -34
  35. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/WHEEL +0 -0
  36. {minitap_mobile_use-2.1.0.dist-info → minitap_mobile_use-2.3.0.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Literal
3
+ from typing import Annotated, Literal
4
4
 
5
5
  from langchain_core.messages import ToolMessage
6
6
  from langchain_core.tools import tool
@@ -8,10 +8,12 @@ from langchain_core.tools.base import InjectedToolCallId
8
8
  from langgraph.prebuilt import InjectedState
9
9
  from langgraph.types import Command
10
10
  from pydantic import BaseModel
11
- from typing import Annotated
12
11
 
13
12
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
14
13
  from minitap.mobile_use.context import MobileUseContext
14
+ from minitap.mobile_use.controllers.mobile_command_controller import (
15
+ get_screen_data,
16
+ )
15
17
  from minitap.mobile_use.controllers.mobile_command_controller import (
16
18
  input_text as input_text_controller,
17
19
  )
@@ -19,6 +21,11 @@ from minitap.mobile_use.graph.state import State
19
21
  from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
20
22
  from minitap.mobile_use.tools.utils import focus_element_if_needed, move_cursor_to_end_if_bounds
21
23
  from minitap.mobile_use.utils.logger import get_logger
24
+ from minitap.mobile_use.utils.ui_hierarchy import (
25
+ ElementBounds,
26
+ find_element_by_resource_id,
27
+ get_element_text,
28
+ )
22
29
 
23
30
  logger = get_logger(__name__)
24
31
 
@@ -47,7 +54,9 @@ def get_input_text_tool(ctx: MobileUseContext):
47
54
  state: Annotated[State, InjectedState],
48
55
  agent_thought: str,
49
56
  text: str,
50
- text_input_resource_id: str,
57
+ text_input_resource_id: str | None,
58
+ text_input_coordinates: ElementBounds | None,
59
+ text_input_text: str | None,
51
60
  ):
52
61
  """
53
62
  Focus a text field and type text into it.
@@ -55,23 +64,83 @@ def get_input_text_tool(ctx: MobileUseContext):
55
64
  - Ensure the corresponding element is focused (tap if necessary).
56
65
  - If bounds are available, tap near the end to place the cursor at the end.
57
66
  - Type the provided `text` using the controller.
67
+
68
+ Args:
69
+ tool_call_id: The ID of the tool call.
70
+ state: The state of the agent.
71
+ agent_thought: The thought of the agent.
72
+ text: The text to type.
73
+ text_input_resource_id: The resource ID of the text input (if available).
74
+ text_input_coordinates: The bounds (ElementBounds) of the text input (if available).
75
+ text_input_text: The current text content of the text input (if available).
58
76
  """
59
- focused = focus_element_if_needed(ctx=ctx, resource_id=text_input_resource_id)
60
- if focused:
61
- move_cursor_to_end_if_bounds(ctx=ctx, state=state, resource_id=text_input_resource_id)
77
+
78
+ focused = focus_element_if_needed(
79
+ ctx=ctx,
80
+ input_resource_id=text_input_resource_id,
81
+ input_coordinates=text_input_coordinates,
82
+ input_text=text_input_text,
83
+ )
84
+ if not focused:
85
+ error_message = "Failed to focus the text input element before typing."
86
+ tool_message = ToolMessage(
87
+ tool_call_id=tool_call_id,
88
+ content=input_text_wrapper.on_failure_fn(text, error_message),
89
+ additional_kwargs={"error": error_message},
90
+ status="error",
91
+ )
92
+ return Command(
93
+ update=state.sanitize_update(
94
+ ctx=ctx,
95
+ update={
96
+ "agents_thoughts": [agent_thought, error_message],
97
+ EXECUTOR_MESSAGES_KEY: [tool_message],
98
+ },
99
+ agent="executor",
100
+ ),
101
+ )
102
+
103
+ move_cursor_to_end_if_bounds(
104
+ ctx=ctx,
105
+ state=state,
106
+ text_input_resource_id=text_input_resource_id,
107
+ text_input_coordinates=text_input_coordinates,
108
+ text_input_text=text_input_text,
109
+ )
62
110
 
63
111
  result = _controller_input_text(ctx=ctx, text=text)
64
112
 
65
113
  status: Literal["success", "error"] = "success" if result.ok else "error"
66
- content_msg = (
67
- input_text_wrapper.on_success_fn(text)
114
+
115
+ text_input_content = ""
116
+ if status == "success":
117
+ if text_input_resource_id is not None:
118
+ # Verification phase for elements with resource_id
119
+ screen_data = get_screen_data(screen_api_client=ctx.screen_api_client)
120
+ state.latest_ui_hierarchy = screen_data.elements
121
+
122
+ element = find_element_by_resource_id(
123
+ ui_hierarchy=state.latest_ui_hierarchy, resource_id=text_input_resource_id
124
+ )
125
+
126
+ if not element:
127
+ result = InputResult(ok=False, error="Element not found")
128
+
129
+ if element:
130
+ text_input_content = get_element_text(element)
131
+ else:
132
+ # For elements without resource_id, skip verification and use direct message
133
+ pass
134
+
135
+ agent_outcome = (
136
+ input_text_wrapper.on_success_fn(text, text_input_content, text_input_resource_id)
68
137
  if result.ok
69
- else input_text_wrapper.on_failure_fn(text)
138
+ else input_text_wrapper.on_failure_fn(text, result.error)
70
139
  )
71
140
 
72
141
  tool_message = ToolMessage(
73
142
  tool_call_id=tool_call_id,
74
- content=content_msg,
143
+ content=agent_outcome,
75
144
  additional_kwargs={"error": result.error} if not result.ok else {},
76
145
  status=status,
77
146
  )
@@ -80,7 +149,7 @@ def get_input_text_tool(ctx: MobileUseContext):
80
149
  update=state.sanitize_update(
81
150
  ctx=ctx,
82
151
  update={
83
- "agents_thoughts": [agent_thought],
152
+ "agents_thoughts": [agent_thought, agent_outcome],
84
153
  EXECUTOR_MESSAGES_KEY: [tool_message],
85
154
  },
86
155
  agent="executor",
@@ -90,8 +159,20 @@ def get_input_text_tool(ctx: MobileUseContext):
90
159
  return input_text
91
160
 
92
161
 
162
+ def _on_input_success(text, text_input_content, text_input_resource_id):
163
+ """Success message handler for input text operations."""
164
+ if text_input_resource_id is not None:
165
+ return (
166
+ f"Typed {repr(text)}.\n"
167
+ f"Here is the whole content of input with id {repr(text_input_resource_id)}: "
168
+ f"{repr(text_input_content)}"
169
+ )
170
+ else:
171
+ return "Typed text, should now verify before moving forward"
172
+
173
+
93
174
  input_text_wrapper = ToolWrapper(
94
175
  tool_fn_getter=get_input_text_tool,
95
- on_success_fn=lambda text: f"Successfully typed {text}",
96
- on_failure_fn=lambda text: f"Failed to input text {text}",
176
+ on_success_fn=_on_input_success,
177
+ on_failure_fn=lambda text, error: f"Failed to input text {repr(text)}. Reason: {error}",
97
178
  )
@@ -1,16 +1,22 @@
1
+ from typing import Annotated
2
+
1
3
  from langchain_core.messages import ToolMessage
2
4
  from langchain_core.tools import tool
3
5
  from langchain_core.tools.base import InjectedToolCallId
6
+ from langgraph.prebuilt import InjectedState
4
7
  from langgraph.types import Command
8
+
5
9
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
6
10
  from minitap.mobile_use.context import MobileUseContext
11
+ from minitap.mobile_use.controllers.mobile_command_controller import (
12
+ get_screen_data,
13
+ )
7
14
  from minitap.mobile_use.controllers.mobile_command_controller import (
8
15
  paste_text as paste_text_controller,
9
16
  )
10
17
  from minitap.mobile_use.graph.state import State
11
- from langgraph.prebuilt import InjectedState
12
18
  from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
13
- from typing import Annotated
19
+ from minitap.mobile_use.utils.ui_hierarchy import find_element_by_resource_id, get_element_text
14
20
 
15
21
 
16
22
  def get_paste_text_tool(ctx: MobileUseContext):
@@ -19,6 +25,7 @@ def get_paste_text_tool(ctx: MobileUseContext):
19
25
  tool_call_id: Annotated[str, InjectedToolCallId],
20
26
  state: Annotated[State, InjectedState],
21
27
  agent_thought: str,
28
+ focused_element_resource_id: str,
22
29
  ):
23
30
  """
24
31
  Pastes text previously copied via `copyTextFrom` into the currently focused field.
@@ -32,12 +39,29 @@ def get_paste_text_tool(ctx: MobileUseContext):
32
39
  - pasteText
33
40
  """
34
41
  output = paste_text_controller(ctx=ctx)
42
+
43
+ text_input_content = ""
44
+ screen_data = get_screen_data(screen_api_client=ctx.screen_api_client)
45
+ state.latest_ui_hierarchy = screen_data.elements
46
+
47
+ element = find_element_by_resource_id(
48
+ ui_hierarchy=state.latest_ui_hierarchy, resource_id=focused_element_resource_id
49
+ )
50
+
51
+ if element:
52
+ text_input_content = get_element_text(element)
53
+
35
54
  has_failed = output is not None
55
+
56
+ agent_outcome = (
57
+ paste_text_wrapper.on_success_fn(text_input_content)
58
+ if not has_failed
59
+ else paste_text_wrapper.on_failure_fn(text_input_content)
60
+ )
61
+
36
62
  tool_message = ToolMessage(
37
63
  tool_call_id=tool_call_id,
38
- content=paste_text_wrapper.on_failure_fn()
39
- if has_failed
40
- else paste_text_wrapper.on_success_fn(),
64
+ content=agent_outcome,
41
65
  additional_kwargs={"error": output} if has_failed else {},
42
66
  status="error" if has_failed else "success",
43
67
  )
@@ -45,7 +69,7 @@ def get_paste_text_tool(ctx: MobileUseContext):
45
69
  update=state.sanitize_update(
46
70
  ctx=ctx,
47
71
  update={
48
- "agents_thoughts": [agent_thought],
72
+ "agents_thoughts": [agent_thought, agent_outcome],
49
73
  EXECUTOR_MESSAGES_KEY: [tool_message],
50
74
  },
51
75
  agent="executor",
@@ -57,6 +81,8 @@ def get_paste_text_tool(ctx: MobileUseContext):
57
81
 
58
82
  paste_text_wrapper = ToolWrapper(
59
83
  tool_fn_getter=get_paste_text_tool,
60
- on_success_fn=lambda: "Text pasted successfully.",
61
- on_failure_fn=lambda: "Failed to paste text.",
84
+ on_success_fn=lambda input_content: "Text pasted successfully. Here is the actual"
85
+ + f"content of the text field : {repr(input_content)}",
86
+ on_failure_fn=lambda input_content: "Failed to paste text."
87
+ + f"Here is the actual content of the text field : {repr(input_content)}",
62
88
  )
@@ -1,18 +1,28 @@
1
+ from typing import Annotated
2
+
1
3
  from langchain_core.messages import ToolMessage
2
4
  from langchain_core.tools import tool
3
- from langchain_core.tools.base import InjectedToolCallId
5
+ from langchain_core.tools.base import BaseTool, InjectedToolCallId
4
6
  from langgraph.prebuilt import InjectedState
5
7
  from langgraph.types import Command
8
+ from pydantic import Field
9
+
6
10
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
7
11
  from minitap.mobile_use.context import MobileUseContext
8
- from minitap.mobile_use.controllers.mobile_command_controller import SwipeRequest
12
+ from minitap.mobile_use.controllers.mobile_command_controller import (
13
+ CoordinatesSelectorRequest,
14
+ PercentagesSelectorRequest,
15
+ SwipeDirection,
16
+ SwipeRequest,
17
+ SwipeStartEndCoordinatesRequest,
18
+ SwipeStartEndPercentagesRequest,
19
+ )
9
20
  from minitap.mobile_use.controllers.mobile_command_controller import swipe as swipe_controller
10
21
  from minitap.mobile_use.graph.state import State
11
- from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
12
- from typing import Annotated
22
+ from minitap.mobile_use.tools.tool_wrapper import CompositeToolWrapper
13
23
 
14
24
 
15
- def get_swipe_tool(ctx: MobileUseContext):
25
+ def get_swipe_tool(ctx: MobileUseContext) -> BaseTool:
16
26
  @tool
17
27
  def swipe(
18
28
  tool_call_id: Annotated[str, InjectedToolCallId],
@@ -20,9 +30,7 @@ def get_swipe_tool(ctx: MobileUseContext):
20
30
  agent_thought: str,
21
31
  swipe_request: SwipeRequest,
22
32
  ):
23
- """
24
- Swipes on the screen.
25
- """
33
+ """Swipes on the screen."""
26
34
  output = swipe_controller(ctx=ctx, swipe_request=swipe_request)
27
35
  has_failed = output is not None
28
36
  tool_message = ToolMessage(
@@ -45,8 +53,98 @@ def get_swipe_tool(ctx: MobileUseContext):
45
53
  return swipe
46
54
 
47
55
 
48
- swipe_wrapper = ToolWrapper(
56
+ def get_composite_swipe_tools(ctx: MobileUseContext) -> list[BaseTool]:
57
+ """
58
+ Returns composite swipe tools for use with Vertex AI LLMs.
59
+ Each tool handles a specific swipe mode to avoid complex Union type issues.
60
+ """
61
+
62
+ @tool
63
+ def swipe_coordinates(
64
+ agent_thought: str,
65
+ tool_call_id: Annotated[str, InjectedToolCallId],
66
+ state: Annotated[State, InjectedState],
67
+ start_x: int = Field(description="Start X coordinate in pixels"),
68
+ start_y: int = Field(description="Start Y coordinate in pixels"),
69
+ end_x: int = Field(description="End X coordinate in pixels"),
70
+ end_y: int = Field(description="End Y coordinate in pixels"),
71
+ duration: int = Field(description="Duration in ms", ge=1, le=10000, default=400),
72
+ ):
73
+ """Swipe using pixel coordinates from start position to end position."""
74
+ swipe_request = SwipeRequest(
75
+ swipe_mode=SwipeStartEndCoordinatesRequest(
76
+ start=CoordinatesSelectorRequest(x=start_x, y=start_y),
77
+ end=CoordinatesSelectorRequest(x=end_x, y=end_y),
78
+ ),
79
+ duration=duration,
80
+ )
81
+ return get_swipe_tool(ctx=ctx).invoke(
82
+ input={
83
+ "tool_call_id": tool_call_id,
84
+ "state": state,
85
+ "agent_thought": agent_thought,
86
+ "swipe_request": swipe_request,
87
+ }
88
+ )
89
+
90
+ @tool
91
+ def swipe_percentages(
92
+ agent_thought: str,
93
+ tool_call_id: Annotated[str, InjectedToolCallId],
94
+ state: Annotated[State, InjectedState],
95
+ start_x_percent: int = Field(description="Start X percent (0-100)", ge=0, le=100),
96
+ start_y_percent: int = Field(description="Start Y percent (0-100)", ge=0, le=100),
97
+ end_x_percent: int = Field(description="End X percent (0-100)", ge=0, le=100),
98
+ end_y_percent: int = Field(description="End Y percent (0-100)", ge=0, le=100),
99
+ duration: int = Field(description="Duration in ms", ge=1, le=10000, default=400),
100
+ ):
101
+ """Swipe using percentage coordinates from start position to end position."""
102
+ swipe_request = SwipeRequest(
103
+ swipe_mode=SwipeStartEndPercentagesRequest(
104
+ start=PercentagesSelectorRequest(
105
+ x_percent=start_x_percent, y_percent=start_y_percent
106
+ ),
107
+ end=PercentagesSelectorRequest(x_percent=end_x_percent, y_percent=end_y_percent),
108
+ ),
109
+ duration=duration,
110
+ )
111
+ return get_swipe_tool(ctx=ctx).invoke(
112
+ input={
113
+ "tool_call_id": tool_call_id,
114
+ "state": state,
115
+ "agent_thought": agent_thought,
116
+ "swipe_request": swipe_request,
117
+ }
118
+ )
119
+
120
+ @tool
121
+ def swipe_direction(
122
+ agent_thought: str,
123
+ tool_call_id: Annotated[str, InjectedToolCallId],
124
+ state: Annotated[State, InjectedState],
125
+ direction: SwipeDirection,
126
+ duration: int = Field(description="Duration in ms", ge=1, le=10000, default=400),
127
+ ):
128
+ """Swipe in a specific direction across the screen."""
129
+ swipe_request = SwipeRequest(
130
+ swipe_mode=direction,
131
+ duration=duration,
132
+ )
133
+ return get_swipe_tool(ctx=ctx).invoke(
134
+ input={
135
+ "tool_call_id": tool_call_id,
136
+ "state": state,
137
+ "agent_thought": agent_thought,
138
+ "swipe_request": swipe_request,
139
+ }
140
+ )
141
+
142
+ return [swipe_coordinates, swipe_percentages, swipe_direction]
143
+
144
+
145
+ swipe_wrapper = CompositeToolWrapper(
49
146
  tool_fn_getter=get_swipe_tool,
147
+ composite_tools_fn_getter=get_composite_swipe_tools,
50
148
  on_success_fn=lambda: "Swipe is successful.",
51
149
  on_failure_fn=lambda: "Failed to swipe.",
52
150
  )