droidrun 0.3.9__py3-none-any.whl → 0.3.10.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- droidrun/__init__.py +2 -3
- droidrun/__main__.py +1 -1
- droidrun/agent/__init__.py +1 -1
- droidrun/agent/codeact/__init__.py +1 -4
- droidrun/agent/codeact/codeact_agent.py +66 -40
- droidrun/agent/codeact/events.py +6 -3
- droidrun/agent/codeact/prompts.py +2 -2
- droidrun/agent/common/events.py +4 -2
- droidrun/agent/context/__init__.py +1 -3
- droidrun/agent/context/agent_persona.py +2 -1
- droidrun/agent/context/context_injection_manager.py +6 -6
- droidrun/agent/context/episodic_memory.py +5 -3
- droidrun/agent/context/personas/__init__.py +3 -3
- droidrun/agent/context/personas/app_starter.py +3 -3
- droidrun/agent/context/personas/big_agent.py +3 -3
- droidrun/agent/context/personas/default.py +3 -3
- droidrun/agent/context/personas/ui_expert.py +5 -5
- droidrun/agent/context/task_manager.py +15 -17
- droidrun/agent/droid/__init__.py +1 -1
- droidrun/agent/droid/droid_agent.py +327 -180
- droidrun/agent/droid/events.py +91 -9
- droidrun/agent/executor/__init__.py +13 -0
- droidrun/agent/executor/events.py +24 -0
- droidrun/agent/executor/executor_agent.py +327 -0
- droidrun/agent/executor/prompts.py +136 -0
- droidrun/agent/manager/__init__.py +18 -0
- droidrun/agent/manager/events.py +20 -0
- droidrun/agent/manager/manager_agent.py +459 -0
- droidrun/agent/manager/prompts.py +223 -0
- droidrun/agent/oneflows/app_starter_workflow.py +118 -0
- droidrun/agent/oneflows/text_manipulator.py +204 -0
- droidrun/agent/planner/__init__.py +3 -3
- droidrun/agent/planner/events.py +6 -3
- droidrun/agent/planner/planner_agent.py +27 -42
- droidrun/agent/planner/prompts.py +2 -2
- droidrun/agent/usage.py +11 -11
- droidrun/agent/utils/__init__.py +11 -1
- droidrun/agent/utils/async_utils.py +2 -1
- droidrun/agent/utils/chat_utils.py +48 -60
- droidrun/agent/utils/device_state_formatter.py +177 -0
- droidrun/agent/utils/executer.py +12 -11
- droidrun/agent/utils/inference.py +114 -0
- droidrun/agent/utils/llm_picker.py +2 -0
- droidrun/agent/utils/message_utils.py +85 -0
- droidrun/agent/utils/tools.py +220 -0
- droidrun/agent/utils/trajectory.py +8 -7
- droidrun/cli/__init__.py +1 -1
- droidrun/cli/logs.py +29 -28
- droidrun/cli/main.py +279 -143
- droidrun/config_manager/__init__.py +25 -0
- droidrun/config_manager/config_manager.py +583 -0
- droidrun/macro/__init__.py +2 -2
- droidrun/macro/__main__.py +1 -1
- droidrun/macro/cli.py +36 -34
- droidrun/macro/replay.py +7 -9
- droidrun/portal.py +1 -1
- droidrun/telemetry/__init__.py +2 -2
- droidrun/telemetry/events.py +3 -4
- droidrun/telemetry/phoenix.py +173 -0
- droidrun/telemetry/tracker.py +7 -5
- droidrun/tools/__init__.py +1 -1
- droidrun/tools/adb.py +210 -82
- droidrun/tools/ios.py +7 -5
- droidrun/tools/tools.py +25 -8
- {droidrun-0.3.9.dist-info → droidrun-0.3.10.dev3.dist-info}/METADATA +5 -3
- droidrun-0.3.10.dev3.dist-info/RECORD +70 -0
- droidrun/agent/common/default.py +0 -5
- droidrun/agent/context/reflection.py +0 -20
- droidrun/agent/oneflows/reflector.py +0 -265
- droidrun-0.3.9.dist-info/RECORD +0 -56
- {droidrun-0.3.9.dist-info → droidrun-0.3.10.dev3.dist-info}/WHEEL +0 -0
- {droidrun-0.3.9.dist-info → droidrun-0.3.10.dev3.dist-info}/entry_points.txt +0 -0
- {droidrun-0.3.9.dist-info → droidrun-0.3.10.dev3.dist-info}/licenses/LICENSE +0 -0
droidrun/agent/droid/events.py
CHANGED
@@ -1,19 +1,19 @@
|
|
1
|
+
from typing import Dict, List
|
2
|
+
|
1
3
|
from llama_index.core.workflow import Event
|
2
|
-
from
|
3
|
-
|
4
|
+
from pydantic import BaseModel, Field
|
5
|
+
|
6
|
+
from droidrun.agent.context import Task
|
7
|
+
|
4
8
|
|
5
9
|
class CodeActExecuteEvent(Event):
    """Event whose only payload is the ``Task`` to be executed."""

    # The task handed over for execution.
    task: Task
|
7
|
-
reflection: Optional[Reflection]
|
8
11
|
|
9
12
|
class CodeActResultEvent(Event):
    """Result event reporting a success flag, a reason and a step count."""

    # True when the run succeeded.
    success: bool
    # Free-text explanation accompanying the result.
    reason: str
    # Number of steps reported for the run.
    steps: int
|
13
16
|
|
14
|
-
class ReasoningLogicEvent(Event):
|
15
|
-
reflection: Optional[Reflection] = None
|
16
|
-
force_planning: bool = False
|
17
17
|
|
18
18
|
class FinalizeEvent(Event):
|
19
19
|
success: bool
|
@@ -28,6 +28,88 @@ class FinalizeEvent(Event):
|
|
28
28
|
class TaskRunnerEvent(Event):
    """Payload-free marker event; it declares no fields of its own."""
|
30
30
|
|
31
|
-
|
32
|
-
|
33
|
-
|
31
|
+
|
32
|
+
|
33
|
+
# ============================================================================
|
34
|
+
# DroidAgentState - State model for llama-index Context
|
35
|
+
# ============================================================================
|
36
|
+
|
37
|
+
class DroidAgentState(BaseModel):
    """
    State container for the DroidAgent workflow.

    One instance is intended to be shared by the parent workflow and its
    child workflows. Every field declares a default, so ``DroidAgentState()``
    yields a valid, empty state.
    """

    # --- Task context -------------------------------------------------------
    # The user's request.
    instruction: str = ""

    # --- UI state -----------------------------------------------------------
    ui_elements_list_before: str = ""
    ui_elements_list_after: str = ""
    focused_text: str = ""
    device_state_text: str = ""
    width: int = 0
    height: int = 0
    # None until a screenshot has been stored.
    screenshot: str | bytes | None = None
    has_text_to_modify: bool = False

    # --- Action tracking ----------------------------------------------------
    # default_factory gives each instance its own list instead of a shared one.
    action_pool: List[Dict] = Field(default_factory=list)
    action_history: List[Dict] = Field(default_factory=list)
    summary_history: List[str] = Field(default_factory=list)
    # Outcome codes: "A", "B", "C"
    action_outcomes: List[str] = Field(default_factory=list)
    error_descriptions: List[str] = Field(default_factory=list)

    # --- Most recent action -------------------------------------------------
    last_action: Dict = Field(default_factory=dict)
    last_summary: str = ""
    last_action_thought: str = ""

    # --- Memory -------------------------------------------------------------
    memory: str = ""
    message_history: List[Dict] = Field(default_factory=list)

    # --- Planning -----------------------------------------------------------
    plan: str = ""
    completed_plan: str = ""
    current_subgoal: str = ""
    finish_thought: str = ""
    progress_status: str = ""
    # Used for answer-type tasks.
    manager_answer: str = ""

    # --- Error handling -----------------------------------------------------
    error_flag_plan: bool = False
    # NOTE(review): presumably the number of errors after which control goes
    # back to the manager - confirm against the code that reads it.
    err_to_manager_thresh: int = 2

    # --- Output -------------------------------------------------------------
    output_dir: str = ""
|
85
|
+
|
86
|
+
|
87
|
+
# ============================================================================
|
88
|
+
# Manager/Executor coordination events
|
89
|
+
# ============================================================================
|
90
|
+
|
91
|
+
class ManagerInputEvent(Event):
    """Payload-free trigger that starts the Manager workflow for planning."""
|
94
|
+
|
95
|
+
|
96
|
+
class ManagerPlanEvent(Event):
    """Plan emitted by the Manager."""

    # The overall plan text.
    plan: str
    # The subgoal to work on next.
    current_subgoal: str
    # The part of the plan reported as completed.
    completed_plan: str
    # The Manager's reasoning text.
    thought: str
    # Optional answer text; empty when none was given.
    manager_answer: str = ""
|
103
|
+
|
104
|
+
|
105
|
+
class ExecutorInputEvent(Event):
    """Trigger that starts the Executor workflow for one subgoal."""

    # The subgoal the Executor should act on.
    current_subgoal: str
|
108
|
+
|
109
|
+
|
110
|
+
class ExecutorResultEvent(Event):
    """Outcome of a single Executor action."""

    # The action that was attempted, as a dictionary.
    action: Dict
    # True when the action succeeded.
    outcome: bool
    # Error text for the attempt.
    error: str
    # Short human-readable summary of what happened.
    summary: str
|
@@ -0,0 +1,13 @@
|
|
1
|
+
"""
|
2
|
+
Executor Agent - Action execution workflow.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from droidrun.agent.executor.events import ExecutorActionEvent, ExecutorResultEvent
|
6
|
+
from droidrun.agent.executor.executor_agent import ExecutorAgent
|
7
|
+
|
8
|
+
# Public API of the executor package. Every name listed here must be bound in
# this module: a star-import raises AttributeError for any entry that is not.
# "ExecutorThinkingEvent" was listed previously but is neither imported above
# nor defined in droidrun.agent.executor.events, so it has been dropped.
__all__ = [
    "ExecutorAgent",
    "ExecutorActionEvent",
    "ExecutorResultEvent",
]
|
@@ -0,0 +1,24 @@
|
|
1
|
+
"""
|
2
|
+
Events for the ExecutorAgent workflow.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from typing import Dict
|
6
|
+
|
7
|
+
from llama_index.core.workflow.events import Event
|
8
|
+
|
9
|
+
|
10
|
+
class ExecutorActionEvent(Event):
    """Action the Executor has chosen and is about to run."""

    # The chosen action, still serialised as a JSON string.
    action_json: str
    # The reasoning text that accompanied the choice.
    thought: str
    # Short human-readable description of the action.
    description: str
|
15
|
+
|
16
|
+
|
17
|
+
class ExecutorResultEvent(Event):
    """Outcome of a single Executor action."""

    # The action that was attempted, as a dictionary.
    action: Dict
    # True when the action succeeded.
    outcome: bool
    # Error text for the attempt.
    error: str
    # Short human-readable summary of what happened.
    summary: str
    # Reasoning text carried over from the action event; empty by default.
    thought: str = ""
    # Raw JSON string of the action as it was received; empty by default.
    action_json: str = ""
|
@@ -0,0 +1,327 @@
|
|
1
|
+
"""
|
2
|
+
ExecutorAgent - Action execution workflow.
|
3
|
+
|
4
|
+
This agent is responsible for:
|
5
|
+
- Taking a specific subgoal from the Manager
|
6
|
+
- Analyzing the current UI state
|
7
|
+
- Selecting and executing appropriate actions
|
8
|
+
"""
|
9
|
+
|
10
|
+
from __future__ import annotations
|
11
|
+
|
12
|
+
import json
|
13
|
+
import logging
|
14
|
+
from typing import TYPE_CHECKING
|
15
|
+
|
16
|
+
from llama_index.core.llms import ChatMessage, ImageBlock, TextBlock
|
17
|
+
from llama_index.core.llms.llm import LLM
|
18
|
+
from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step
|
19
|
+
|
20
|
+
from droidrun.agent.executor.events import ExecutorActionEvent, ExecutorResultEvent
|
21
|
+
from droidrun.agent.executor.prompts import build_executor_system_prompt, parse_executor_response
|
22
|
+
from droidrun.agent.utils.tools import click, long_press, open_app, swipe, system_button, type
|
23
|
+
from droidrun.agent.utils.inference import acall_with_retries
|
24
|
+
from droidrun.config_manager import config
|
25
|
+
import asyncio
|
26
|
+
|
27
|
+
if TYPE_CHECKING:
|
28
|
+
from droidrun.agent.droid.events import DroidAgentState
|
29
|
+
|
30
|
+
logger = logging.getLogger("droidrun")
|
31
|
+
|
32
|
+
|
33
|
+
class ExecutorAgent(Workflow):
    """
    Workflow that turns one subgoal into one executed device action.

    Steps, in order:
    1. ``think``    - ask the LLM which action fulfils the subgoal.
    2. ``execute``  - decode the chosen action and run it through the tools.
    3. ``finalize`` - return the outcome to the caller as a ``StopEvent``.
    """

    def __init__(
        self,
        llm: LLM,
        vision: bool,
        tools_instance,
        shared_state: "DroidAgentState",
        persona=None,
        custom_tools: dict | None = None,
        debug: bool = False,
        **kwargs,
    ):
        """
        Args:
            llm: Language model queried in ``think``.
            vision: When true, the screenshot held in ``shared_state`` is
                attached to the LLM request (if one is present).
            tools_instance: Object passed as the first argument to every
                atomic action and every custom tool.
            shared_state: Agent state used to build the executor prompt.
            persona: Stored on the instance; not read by this class.
            custom_tools: Mapping of action name to a spec dict holding the
                callable under the ``"function"`` key. ``None`` means no
                custom tools.
            debug: Stored on the instance; not read by this class.
            **kwargs: Forwarded unchanged to ``Workflow.__init__``.
        """
        super().__init__(**kwargs)
        self.llm = llm
        self.vision = vision
        self.tools_instance = tools_instance
        self.shared_state = shared_state
        self.persona = persona
        self.custom_tools = custom_tools or {}
        self.debug = debug

        logger.info("✅ ExecutorAgent initialized successfully.")

    @step
    async def think(
        self,
        ctx: Context,
        ev: StartEvent
    ) -> ExecutorActionEvent:
        """
        Ask the LLM which action to take for the subgoal in ``ev``.

        The step builds the executor prompt, optionally attaches the current
        screenshot, calls the LLM and splits the reply into thought, action
        and description. The action text is forwarded as-is; it is decoded
        and checked later, in ``execute``.

        Raises:
            RuntimeError: If the LLM call fails (the original exception is
                chained as the cause).
        """
        subgoal = ev.get("subgoal", "")
        logger.info(f"🧠 Executor thinking about action for: {subgoal}")

        app_card = ""  # TODO: Implement app card retrieval

        system_prompt = build_executor_system_prompt(
            state=self.shared_state,
            subgoal=subgoal,
            app_card=app_card
        )

        blocks = [TextBlock(text=system_prompt)]
        if self.vision:
            screenshot = self.shared_state.screenshot
            if screenshot is not None:
                blocks.append(ImageBlock(image=screenshot))
                logger.debug("📸 Using screenshot for Executor")
            else:
                logger.warning("⚠️ Vision enabled but no screenshot available")
        messages = [ChatMessage(role="user", blocks=blocks)]

        try:
            response = await acall_with_retries(self.llm, messages)
            response_text = str(response)
        except Exception as e:
            raise RuntimeError(f"Error calling LLM in executor: {e}") from e

        # Parse response
        try:
            parsed = parse_executor_response(response_text)
        except Exception as e:
            # Emit a placeholder action instead of aborting the workflow; no
            # branch in _execute_action matches "invalid", so it is reported
            # back as a failed action.
            logger.error(f"❌ Failed to parse executor response: {e}")
            return ExecutorActionEvent(
                action_json=json.dumps({"action": "invalid"}),
                thought=f"Failed to parse response: {str(e)}",
                description="Invalid response format from LLM"
            )

        logger.info(f"💡 Thought: {parsed['thought']}")
        logger.info(f"🎯 Action: {parsed['action']}")
        logger.debug(f" - Description: {parsed['description']}")

        return ExecutorActionEvent(
            action_json=parsed["action"],
            thought=parsed["thought"],
            description=parsed["description"]
        )

    @step
    async def execute(
        self,
        ctx: Context,
        ev: ExecutorActionEvent
    ) -> ExecutorResultEvent:
        """
        Decode the action JSON from ``ev`` and run it.

        Text that is not valid JSON, or JSON that is not an object, produces a
        failed result event instead of an exception. After a successful action
        the step sleeps for ``config.agent.after_sleep_action`` before
        returning.
        """
        logger.info(f"⚡ Executing action: {ev.description}")

        # Parse action JSON
        try:
            action_dict = json.loads(ev.action_json)
        except json.JSONDecodeError as e:
            logger.error(f"❌ Failed to parse action JSON: {e}")
            return ExecutorResultEvent(
                action={"action": "invalid"},
                outcome=False,
                error=f"Invalid action JSON: {str(e)}",
                summary="Failed to parse action",
                thought=ev.thought,
                action_json=ev.action_json
            )

        # json.loads also accepts lists, strings, numbers, ... but everything
        # downstream needs a mapping, so reject other shapes here.
        if not isinstance(action_dict, dict):
            # The builtin ``type`` is shadowed in this module by the ``type``
            # tool imported from droidrun.agent.utils.tools, hence __class__.
            found = action_dict.__class__.__name__
            logger.error(f"❌ Action JSON must be an object, got {found}")
            return ExecutorResultEvent(
                action={"action": "invalid"},
                outcome=False,
                error=f"Invalid action JSON: expected an object, got {found}",
                summary="Failed to parse action",
                thought=ev.thought,
                action_json=ev.action_json
            )

        # Execute the action
        outcome, error, summary = await self._execute_action(action_dict, ev.description)

        if outcome:
            await asyncio.sleep(config.agent.after_sleep_action)

        logger.info(f"{'✅' if outcome else '❌'} Execution complete: {summary}")

        return ExecutorResultEvent(
            action=action_dict,
            outcome=outcome,
            error=error,
            summary=summary,
            thought=ev.thought,
            action_json=ev.action_json
        )

    async def _execute_action(self, action_dict: dict, description: str) -> tuple[bool, str, str]:
        """
        Execute a single action described by ``action_dict``.

        Custom tools are looked up first; otherwise the action is matched
        against the built-in atomic actions.

        Args:
            action_dict: Dictionary holding the action name under ``"action"``
                plus that action's parameters.
            description: Human-readable description of the action (currently
                unused here).

        Returns:
            Tuple of (outcome: bool, error: str, summary: str). On success the
            error element is the string ``"None"``.
        """
        action_type = action_dict.get("action", "unknown")

        # A non-string name (e.g. a list) cannot match any action and may not
        # even be hashable for the custom_tools lookup below.
        if not isinstance(action_type, str):
            return False, f"Unknown action type: {action_type}", f"Failed: unknown action '{action_type}'"

        # Check custom_tools first (before atomic actions)
        if action_type in self.custom_tools:
            return await self._execute_custom_tool(action_type, action_dict)

        try:
            if action_type == "click":
                index = action_dict.get("index")
                if index is None:
                    return False, "Missing 'index' parameter", "Failed: click requires index"

                click(self.tools_instance, index)
                return True, "None", f"Clicked element at index {index}"

            elif action_type == "long_press":
                index = action_dict.get("index")
                if index is None:
                    return False, "Missing 'index' parameter", "Failed: long_press requires index"

                if long_press(self.tools_instance, index):
                    return True, "None", f"Long pressed element at index {index}"
                return False, "Long press failed", f"Failed to long press at index {index}"

            elif action_type == "type":
                text = action_dict.get("text")
                index = action_dict.get("index", -1)

                if text is None:
                    return False, "Missing 'text' parameter", "Failed: type requires text"

                # ``type`` here is the tool imported at module level, not the builtin.
                type(self.tools_instance, text, index)
                return True, "None", f"Typed '{text}' into element at index {index}"

            elif action_type == "system_button":
                button = action_dict.get("button")
                if button is None:
                    return False, "Missing 'button' parameter", "Failed: system_button requires button"

                result = system_button(self.tools_instance, button)
                # A result containing "Error" is treated as a failure.
                if "Error" in result:
                    return False, result, f"Failed to press {button} button"
                return True, "None", f"Pressed {button} button"

            elif action_type == "swipe":
                coordinate = action_dict.get("coordinate")
                coordinate2 = action_dict.get("coordinate2")

                if coordinate is None or coordinate2 is None:
                    return False, "Missing coordinate parameters", "Failed: swipe requires coordinate and coordinate2"

                # Validate coordinate format before calling swipe
                if not isinstance(coordinate, list) or len(coordinate) != 2:
                    return False, f"Invalid coordinate format: {coordinate}", "Failed: coordinate must be [x, y]"
                if not isinstance(coordinate2, list) or len(coordinate2) != 2:
                    return False, f"Invalid coordinate2 format: {coordinate2}", "Failed: coordinate2 must be [x, y]"

                if swipe(self.tools_instance, coordinate, coordinate2):
                    return True, "None", f"Swiped from {coordinate} to {coordinate2}"
                return False, "Swipe failed", f"Failed to swipe from {coordinate} to {coordinate2}"

            elif action_type == "open_app":
                text = action_dict.get("text")
                if text is None:
                    return False, "Missing 'text' parameter", "Failed: open_app requires text"

                open_app(self.tools_instance, text)
                return True, "None", f"Opened app: {text}"

            else:
                return False, f"Unknown action type: {action_type}", f"Failed: unknown action '{action_type}'"

        except Exception as e:
            # Report tool failures as a failed action rather than crashing the
            # workflow.
            logger.error(f"❌ Exception during action execution: {e}", exc_info=True)
            return False, f"Exception: {str(e)}", f"Failed to execute {action_type}: {str(e)}"

    async def _execute_custom_tool(self, action_type: str, action_dict: dict) -> tuple[bool, str, str]:
        """
        Execute the custom tool registered under ``action_type``.

        Args:
            action_type: The custom tool name (a key of ``self.custom_tools``).
            action_dict: Action dictionary; every entry except ``"action"`` is
                passed to the tool as a keyword argument.

        Returns:
            Tuple of (outcome: bool, error: str, summary: str). On success the
            error element is the string ``"None"``.
        """
        try:
            tool_spec = self.custom_tools[action_type]
            tool_func = tool_spec["function"]

            # Extract arguments (exclude 'action' key)
            tool_args = {k: v for k, v in action_dict.items() if k != "action"}

            # Execute the custom tool function
            # First argument is always tools_instance (same pattern as atomic actions)
            if asyncio.iscoroutinefunction(tool_func):
                result = await tool_func(self.tools_instance, **tool_args)
            else:
                result = tool_func(self.tools_instance, **tool_args)

            # Success case
            summary = f"Executed custom tool '{action_type}'"
            if result is not None:
                summary += f": {str(result)}"

            return True, "None", summary

        except TypeError as e:
            # Likely missing or wrong arguments
            error_msg = f"Invalid arguments for custom tool '{action_type}': {str(e)}"
            logger.error(f"❌ {error_msg}")
            return False, error_msg, f"Failed: {action_type}"

        except Exception as e:
            # General execution error
            error_msg = f"Error executing custom tool '{action_type}': {str(e)}"
            logger.error(f"❌ {error_msg}", exc_info=True)
            return False, error_msg, f"Failed: {action_type}"

    @step
    async def finalize(
        self,
        ctx: Context,
        ev: ExecutorResultEvent
    ) -> StopEvent:
        """Wrap the fields of ``ev`` in a dict and return it as the workflow result."""
        logger.debug("✅ Executor execution complete")

        return StopEvent(result={
            "action": ev.action,
            "outcome": ev.outcome,
            "error": ev.error,
            "summary": ev.summary,
            "thought": ev.thought,
            "action_json": ev.action_json
        })
|
@@ -0,0 +1,136 @@
|
|
1
|
+
"""
|
2
|
+
Prompts for the ExecutorAgent.
|
3
|
+
"""
|
4
|
+
|
5
|
+
|
6
|
+
from droidrun.agent.droid.events import DroidAgentState
|
7
|
+
from droidrun.agent.utils.tools import ATOMIC_ACTION_SIGNATURES
|
8
|
+
|
9
|
+
|
10
|
+
def _format_recent_actions(state: "DroidAgentState") -> str:
    """Render the 'Latest Action History' section body for the executor prompt."""
    if not state.action_history:
        return "No actions have been taken yet.\n\n"

    # All four lists are cut to the same window; strict=True makes zip raise
    # ValueError if the resulting slices still differ in length.
    window = min(5, len(state.action_history))
    entries = [
        (
            f"Action: {act} | Description: {summ} | Outcome: Successful"
            if outcome == "A"
            else f"Action: {act} | Description: {summ} | Outcome: Failed | Feedback: {err_des}"
        )
        for act, summ, outcome, err_des in zip(
            state.action_history[-window:],
            state.summary_history[-window:],
            state.action_outcomes[-window:],
            state.error_descriptions[-window:],
            strict=True,
        )
    ]
    return (
        "Recent actions you took previously and whether they were successful:\n"
        + "\n".join(entries)
        + "\n\n"
    )


def build_executor_system_prompt(
    state: "DroidAgentState",
    subgoal: str,
    app_card: str = "",
) -> str:
    """
    Build the complete Executor system prompt with all context.

    The variable fragments are assembled before the template. Replacement
    fields that contain backslash escapes are a SyntaxError on interpreters
    older than Python 3.12, so the template only interpolates plain names.

    Args:
        state: Current DroidAgentState with all context
        subgoal: Current subgoal to execute
        app_card: Optional app-specific instructions; the section is omitted
            when this is empty or whitespace only

    Returns:
        Complete system prompt for the Executor

    Raises:
        ValueError: If the recent slices of the action, summary, outcome and
            error lists in ``state`` do not all have the same length.
    """
    app_card_text = app_card.strip()
    app_card_section = (
        "App card gives information on how to operate the app and perform actions.\n"
        + "### App Card ###\n"
        + app_card_text
        + "\n\n"
    ) if app_card_text else ""

    device_state_text = state.device_state_text.strip()
    device_state_section = (
        "### Device State ###\n" + device_state_text + "\n\n"
    ) if device_state_text else ""

    progress_section = (
        (state.progress_status + "\n\n")
        if state.progress_status != ""
        else "No progress yet.\n\n"
    )

    atomic_actions = "\n".join(
        f"- {action_name}({', '.join(action_info['arguments'])}): {action_info['description']}"
        for action_name, action_info in ATOMIC_ACTION_SIGNATURES.items()
    )

    action_history_section = _format_recent_actions(state)

    prompt = f"""You are a LOW-LEVEL ACTION EXECUTOR for an Android phone. You do NOT answer questions or provide results. You ONLY perform individual atomic actions as specified in the current subgoal. You are part of a larger system - your job is to execute actions, not to think about or answer the user's original question.

### User Request ###
{state.instruction}

{app_card_section}{device_state_section}### Overall Plan ###
{state.plan}

### Current Subgoal ###
EXECUTE THIS SUBGOAL: {subgoal}

EXECUTION MODE: You are a dumb robot. Find the exact text/element mentioned in the subgoal above and perform the specified action on it. Do not read anything below this line until after you execute the subgoal.

### SUBGOAL PARSING MODE ###
Read the current subgoal exactly as written. Look for:
- Action words: "tap", "click", "swipe", "type", "press", "open" etc.
- Target elements: specific text, buttons, fields, coordinates mentioned
- Locations: "header", "bottom", "left", "right", specific coordinates
Convert directly to atomic action:
- "tap/click" → click action
- "swipe" → swipe action
- "type" → type action
- "press [system button]" → system_button action
- "open [app]" → open_app action
Execute the atomic action for the exact target mentioned. Ignore everything else.

### Progress Status ###
{progress_section}

### Guidelines ###
General:
- For any pop-up window, such as a permission request, you need to close it (e.g., by clicking `Don't Allow` or `Accept & continue`) before proceeding. Never choose to add any account or log in.
Action Related:
- Use the `open_app` action whenever you want to open an app (nothing will happen if the app is not installed), do not use the app drawer to open an app.
- Consider exploring the screen by using the `swipe` action with different directions to reveal additional content. Or use search to quickly find a specific entry, if applicable.
- If you cannot change the page content by swiping in the same direction continuously, the page may have been swiped to the bottom. Please try another operation to display more content.
- For some horizontally distributed tags, you can swipe horizontally to view more.
Text Related Operations:
- Activated input box: If an input box is activated, it may have a cursor inside it and the keyboard is visible. If there is no cursor on the screen but the keyboard is visible, it may be because the cursor is blinking. The color of the activated input box will be highlighted. If you are not sure whether the input box is activated, click it before typing.
- To input some text: first click the input box that you want to input, make sure the correct input box is activated and the keyboard is visible, then use `type` action to enter the specified text.
- To clear the text: long press the backspace button in the keyboard.
- To copy some text: first long press the text you want to copy, then click the `copy` button in bar.
- To paste text into a text box: first long press the text box, then click the `paste` button in bar.

---
Execute the current subgoal mechanically. Do NOT examine the screen content or make decisions about what you see. Parse the current subgoal text to identify the required action and execute it exactly as written. You must choose your action from one of the atomic actions.

#### Atomic Actions ####
The atomic action functions are listed in the format of `action(arguments): description` as follows:
{atomic_actions}
\n
### Latest Action History ###
{action_history_section}

---
### LITERAL EXECUTION RULE ###
Whatever the current subgoal says to do, do that EXACTLY. Do not substitute with what you think is better. Do not optimize. Do not consider screen state. Parse the subgoal text literally and execute the matching atomic action.

IMPORTANT:
1. Do NOT repeat previously failed actions multiple times. Try changing to another action.
2. Must do the current subgoal.

Provide your output in the following format, which contains three parts:

### Thought ###
Break down the current subgoal into: (1) What atomic action is required? (2) What target/location is specified? (3) What parameters do I need? Do NOT reason about whether this makes sense - just mechanically convert the subgoal text into the appropriate action format.

### Action ###
Choose only one action or shortcut from the options provided.
You must provide your decision using a valid JSON format specifying the `action` and the arguments of the action. For example, if you want to open an App, you should write {{ "action":"open_app", "text": "app name" }}.

### Description ###
A brief description of the chosen action. Do not describe expected outcome.
"""

    return prompt
|
111
|
+
|
112
|
+
|
113
|
+
def parse_executor_response(response: str) -> dict:
    """
    Parse the Executor LLM response.

    Extracts:
    - thought: Content between "### Thought" and "### Action"
    - action: Content between "### Action" and "### Description"
    - description: Content after "### Description"

    Newlines become spaces and any remaining "###" markers are removed in all
    three parts. Thought and description additionally have runs of whitespace
    collapsed to a single space. The action text is only stripped at both
    ends, so spacing inside JSON string arguments (for example text to be
    typed) is kept exactly as the model wrote it.

    A header that is missing from the response is not an error: the
    corresponding split simply leaves the text uncut.

    Args:
        response: Raw LLM response string

    Returns:
        Dictionary with 'thought', 'action', 'description' keys
    """

    def _section(start: str, end: str = "") -> str:
        # Text after the last `start` header, cut at the first `end` header,
        # flattened to one line with leftover "###" markers removed.
        text = response.split(start)[-1]
        if end:
            text = text.split(end)[0]
        return text.replace("\n", " ").replace("###", "")

    thought = " ".join(_section("### Thought", "### Action").split())
    action = _section("### Action", "### Description").strip()
    description = " ".join(_section("### Description").split())

    return {
        "thought": thought,
        "action": action,
        "description": description,
    }
|
@@ -0,0 +1,18 @@
|
|
1
|
+
"""
|
2
|
+
Manager Agent - Planning and reasoning workflow.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from droidrun.agent.manager.events import ManagerPlanEvent, ManagerThinkingEvent
|
6
|
+
from droidrun.agent.manager.manager_agent import ManagerAgent
|
7
|
+
from droidrun.agent.manager.prompts import (
|
8
|
+
build_manager_system_prompt,
|
9
|
+
parse_manager_response,
|
10
|
+
)
|
11
|
+
|
12
|
+
__all__ = [
|
13
|
+
"ManagerAgent",
|
14
|
+
"ManagerThinkingEvent",
|
15
|
+
"ManagerPlanEvent",
|
16
|
+
"build_manager_system_prompt",
|
17
|
+
"parse_manager_response",
|
18
|
+
]
|
@@ -0,0 +1,20 @@
|
|
1
|
+
"""
|
2
|
+
Events for the ManagerAgent workflow.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from llama_index.core.workflow.events import Event
|
6
|
+
|
7
|
+
|
8
|
+
class ManagerThinkingEvent(Event):
    """Payload-free event signalling that the Manager is working on the plan."""
|
11
|
+
|
12
|
+
|
13
|
+
class ManagerPlanEvent(Event):
    """Plan emitted by the Manager."""

    # The overall plan text.
    plan: str
    # The subgoal to work on next.
    current_subgoal: str
    # The part of the plan reported as completed.
    completed_plan: str
    # The Manager's reasoning text.
    thought: str
    # Optional answer text; empty when none was given.
    manager_answer: str = ""
    # Optional memory update text; empty when none was given.
    memory_update: str = ""
|