droidrun 0.3.9__py3-none-any.whl → 0.3.10.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. droidrun/__init__.py +2 -3
  2. droidrun/__main__.py +1 -1
  3. droidrun/agent/__init__.py +1 -1
  4. droidrun/agent/codeact/__init__.py +1 -4
  5. droidrun/agent/codeact/codeact_agent.py +66 -40
  6. droidrun/agent/codeact/events.py +6 -3
  7. droidrun/agent/codeact/prompts.py +2 -2
  8. droidrun/agent/common/events.py +4 -2
  9. droidrun/agent/context/__init__.py +1 -3
  10. droidrun/agent/context/agent_persona.py +2 -1
  11. droidrun/agent/context/context_injection_manager.py +6 -6
  12. droidrun/agent/context/episodic_memory.py +5 -3
  13. droidrun/agent/context/personas/__init__.py +3 -3
  14. droidrun/agent/context/personas/app_starter.py +3 -3
  15. droidrun/agent/context/personas/big_agent.py +3 -3
  16. droidrun/agent/context/personas/default.py +3 -3
  17. droidrun/agent/context/personas/ui_expert.py +5 -5
  18. droidrun/agent/context/task_manager.py +15 -17
  19. droidrun/agent/droid/__init__.py +1 -1
  20. droidrun/agent/droid/droid_agent.py +327 -180
  21. droidrun/agent/droid/events.py +91 -9
  22. droidrun/agent/executor/__init__.py +13 -0
  23. droidrun/agent/executor/events.py +24 -0
  24. droidrun/agent/executor/executor_agent.py +327 -0
  25. droidrun/agent/executor/prompts.py +136 -0
  26. droidrun/agent/manager/__init__.py +18 -0
  27. droidrun/agent/manager/events.py +20 -0
  28. droidrun/agent/manager/manager_agent.py +459 -0
  29. droidrun/agent/manager/prompts.py +223 -0
  30. droidrun/agent/oneflows/app_starter_workflow.py +118 -0
  31. droidrun/agent/oneflows/text_manipulator.py +204 -0
  32. droidrun/agent/planner/__init__.py +3 -3
  33. droidrun/agent/planner/events.py +6 -3
  34. droidrun/agent/planner/planner_agent.py +27 -42
  35. droidrun/agent/planner/prompts.py +2 -2
  36. droidrun/agent/usage.py +11 -11
  37. droidrun/agent/utils/__init__.py +11 -1
  38. droidrun/agent/utils/async_utils.py +2 -1
  39. droidrun/agent/utils/chat_utils.py +48 -60
  40. droidrun/agent/utils/device_state_formatter.py +177 -0
  41. droidrun/agent/utils/executer.py +12 -11
  42. droidrun/agent/utils/inference.py +114 -0
  43. droidrun/agent/utils/llm_picker.py +2 -0
  44. droidrun/agent/utils/message_utils.py +85 -0
  45. droidrun/agent/utils/tools.py +220 -0
  46. droidrun/agent/utils/trajectory.py +8 -7
  47. droidrun/cli/__init__.py +1 -1
  48. droidrun/cli/logs.py +29 -28
  49. droidrun/cli/main.py +279 -143
  50. droidrun/config_manager/__init__.py +25 -0
  51. droidrun/config_manager/config_manager.py +583 -0
  52. droidrun/macro/__init__.py +2 -2
  53. droidrun/macro/__main__.py +1 -1
  54. droidrun/macro/cli.py +36 -34
  55. droidrun/macro/replay.py +7 -9
  56. droidrun/portal.py +1 -1
  57. droidrun/telemetry/__init__.py +2 -2
  58. droidrun/telemetry/events.py +3 -4
  59. droidrun/telemetry/phoenix.py +173 -0
  60. droidrun/telemetry/tracker.py +7 -5
  61. droidrun/tools/__init__.py +1 -1
  62. droidrun/tools/adb.py +210 -82
  63. droidrun/tools/ios.py +7 -5
  64. droidrun/tools/tools.py +25 -8
  65. {droidrun-0.3.9.dist-info → droidrun-0.3.10.dev3.dist-info}/METADATA +5 -3
  66. droidrun-0.3.10.dev3.dist-info/RECORD +70 -0
  67. droidrun/agent/common/default.py +0 -5
  68. droidrun/agent/context/reflection.py +0 -20
  69. droidrun/agent/oneflows/reflector.py +0 -265
  70. droidrun-0.3.9.dist-info/RECORD +0 -56
  71. {droidrun-0.3.9.dist-info → droidrun-0.3.10.dev3.dist-info}/WHEEL +0 -0
  72. {droidrun-0.3.9.dist-info → droidrun-0.3.10.dev3.dist-info}/entry_points.txt +0 -0
  73. {droidrun-0.3.9.dist-info → droidrun-0.3.10.dev3.dist-info}/licenses/LICENSE +0 -0
@@ -1,19 +1,19 @@
1
+ from typing import Dict, List
2
+
1
3
  from llama_index.core.workflow import Event
2
- from droidrun.agent.context import Reflection, Task
3
- from typing import List, Optional
4
+ from pydantic import BaseModel, Field
5
+
6
+ from droidrun.agent.context import Task
7
+
4
8
 
5
9
  class CodeActExecuteEvent(Event):
6
10
  task: Task
7
- reflection: Optional[Reflection]
8
11
 
9
12
  class CodeActResultEvent(Event):
10
13
  success: bool
11
14
  reason: str
12
15
  steps: int
13
16
 
14
- class ReasoningLogicEvent(Event):
15
- reflection: Optional[Reflection] = None
16
- force_planning: bool = False
17
17
 
18
18
  class FinalizeEvent(Event):
19
19
  success: bool
@@ -28,6 +28,88 @@ class FinalizeEvent(Event):
28
28
  class TaskRunnerEvent(Event):
29
29
  pass
30
30
 
31
- class ReflectionEvent(Event):
32
- task: Task
33
- pass
31
+
32
+
33
+ # ============================================================================
34
+ # DroidAgentState - State model for llama-index Context
35
+ # ============================================================================
36
+
37
+ class DroidAgentState(BaseModel):
38
+ """
39
+ State model for DroidAgent workflow - shared across parent and child workflows.
40
+ """
41
+
42
+ # Task context
43
+ instruction: str = ""
44
+
45
+ # UI State
46
+ ui_elements_list_before: str = ""
47
+ ui_elements_list_after: str = ""
48
+ focused_text: str = ""
49
+ device_state_text: str = ""
50
+ width: int = 0
51
+ height: int = 0
52
+ screenshot: str | bytes | None = None
53
+ has_text_to_modify: bool = False
54
+
55
+ # Action tracking
56
+ action_pool: List[Dict] = Field(default_factory=list)
57
+ action_history: List[Dict] = Field(default_factory=list)
58
+ summary_history: List[str] = Field(default_factory=list)
59
+ action_outcomes: List[str] = Field(default_factory=list) # "A", "B", "C"
60
+ error_descriptions: List[str] = Field(default_factory=list)
61
+
62
+ # Last action info
63
+ last_action: Dict = Field(default_factory=dict)
64
+ last_summary: str = ""
65
+ last_action_thought: str = ""
66
+
67
+ # Memory
68
+ memory: str = ""
69
+ message_history: List[Dict] = Field(default_factory=list)
70
+
71
+ # Planning
72
+ plan: str = ""
73
+ completed_plan: str = ""
74
+ current_subgoal: str = ""
75
+ finish_thought: str = ""
76
+ progress_status: str = ""
77
+ manager_answer: str = "" # For answer-type tasks
78
+
79
+ # Error handling
80
+ error_flag_plan: bool = False
81
+ err_to_manager_thresh: int = 2
82
+
83
+ # Output
84
+ output_dir: str = ""
85
+
86
+
87
+ # ============================================================================
88
+ # Manager/Executor coordination events
89
+ # ============================================================================
90
+
91
+ class ManagerInputEvent(Event):
92
+ """Trigger Manager workflow for planning"""
93
+ pass
94
+
95
+
96
+ class ManagerPlanEvent(Event):
97
+ """Manager has created a plan"""
98
+ plan: str
99
+ current_subgoal: str
100
+ completed_plan: str
101
+ thought: str
102
+ manager_answer: str = ""
103
+
104
+
105
+ class ExecutorInputEvent(Event):
106
+ """Trigger Executor workflow for action execution"""
107
+ current_subgoal: str
108
+
109
+
110
+ class ExecutorResultEvent(Event):
111
+ """Executor action result"""
112
+ action: Dict
113
+ outcome: bool
114
+ error: str
115
+ summary: str
@@ -0,0 +1,13 @@
1
+ """
2
+ Executor Agent - Action execution workflow.
3
+ """
4
+
5
+ from droidrun.agent.executor.events import ExecutorActionEvent, ExecutorResultEvent
6
+ from droidrun.agent.executor.executor_agent import ExecutorAgent
7
+
8
# Public names re-exported by ``from droidrun.agent.executor import *``.
# Every entry must be a name this module actually imports: a stale entry
# (previously "ExecutorThinkingEvent", which no executor module defines)
# makes the star-import fail with AttributeError.
__all__ = [
    "ExecutorAgent",
    "ExecutorActionEvent",
    "ExecutorResultEvent",
]
@@ -0,0 +1,24 @@
1
+ """
2
+ Events for the ExecutorAgent workflow.
3
+ """
4
+
5
+ from typing import Dict
6
+
7
+ from llama_index.core.workflow.events import Event
8
+
9
+
10
+ class ExecutorActionEvent(Event):
11
+ """Executor has selected an action to execute"""
12
+ action_json: str
13
+ thought: str
14
+ description: str
15
+
16
+
17
+ class ExecutorResultEvent(Event):
18
+ """Executor action result"""
19
+ action: Dict
20
+ outcome: bool
21
+ error: str
22
+ summary: str
23
+ thought: str = ""
24
+ action_json: str = ""
@@ -0,0 +1,327 @@
1
+ """
2
+ ExecutorAgent - Action execution workflow.
3
+
4
+ This agent is responsible for:
5
+ - Taking a specific subgoal from the Manager
6
+ - Analyzing the current UI state
7
+ - Selecting and executing appropriate actions
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import logging
14
+ from typing import TYPE_CHECKING
15
+
16
+ from llama_index.core.llms import ChatMessage, ImageBlock, TextBlock
17
+ from llama_index.core.llms.llm import LLM
18
+ from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step
19
+
20
+ from droidrun.agent.executor.events import ExecutorActionEvent, ExecutorResultEvent
21
+ from droidrun.agent.executor.prompts import build_executor_system_prompt, parse_executor_response
22
+ from droidrun.agent.utils.tools import click, long_press, open_app, swipe, system_button, type
23
+ from droidrun.agent.utils.inference import acall_with_retries
24
+ from droidrun.config_manager import config
25
+ import asyncio
26
+
27
+ if TYPE_CHECKING:
28
+ from droidrun.agent.droid.events import DroidAgentState
29
+
30
+ logger = logging.getLogger("droidrun")
31
+
32
+
33
class ExecutorAgent(Workflow):
    """
    Action execution agent that performs specific actions.

    The Executor:
    1. Receives a subgoal from the Manager
    2. Analyzes current UI state and context
    3. Selects an appropriate action to take
    4. Executes the action on the device
    5. Reports the outcome

    The workflow is made of three steps that run in sequence:
    ``think`` (ask the LLM for one action), ``execute`` (run that action
    through the tools instance) and ``finalize`` (return the result to the
    caller as a ``StopEvent``).
    """

    def __init__(
        self,
        llm: LLM,
        vision: bool,
        tools_instance,
        shared_state: "DroidAgentState",
        persona=None,
        custom_tools: dict | None = None,
        debug: bool = False,
        **kwargs
    ):
        """
        Store the collaborators used by the workflow steps.

        Args:
            llm: Language model queried in ``think``.
            vision: When true, the screenshot held in ``shared_state`` is
                attached to the LLM request (if one is available).
            tools_instance: Object passed as first argument to every atomic
                action and custom tool.
            shared_state: State object used to build the executor prompt.
            persona: Stored on the instance; not read by this class.
            custom_tools: Mapping of action name to a spec dict whose
                ``"function"`` entry is the callable to run. ``None`` means
                no custom tools.
            debug: Stored on the instance; not read by this class.
            **kwargs: Forwarded unchanged to ``Workflow.__init__``.
        """
        super().__init__(**kwargs)
        self.llm = llm
        self.vision = vision
        self.tools_instance = tools_instance
        self.shared_state = shared_state
        self.persona = persona
        self.custom_tools = custom_tools or {}
        self.debug = debug

        logger.info("✅ ExecutorAgent initialized successfully.")

    @step
    async def think(
        self,
        ctx: Context,
        ev: StartEvent
    ) -> ExecutorActionEvent:
        """
        Executor decides which action to take.

        This step:
        1. Calls LLM with executor prompt and context
        2. Parses the response for action, thought, description
        3. Returns the action event (an ``{"action": "invalid"}`` placeholder
           when the response cannot be parsed)

        Raises:
            RuntimeError: If the LLM call fails.
        """
        subgoal = ev.get("subgoal", "")
        logger.info(f"🧠 Executor thinking about action for: {subgoal}")

        app_card = ""  # TODO: Implement app card retrieval

        system_prompt = build_executor_system_prompt(
            state=self.shared_state,
            subgoal=subgoal,
            app_card=app_card
        )

        blocks = [TextBlock(text=system_prompt)]
        if self.vision:
            screenshot = self.shared_state.screenshot
            if screenshot is not None:
                blocks.append(ImageBlock(image=screenshot))
                logger.debug("📸 Using screenshot for Executor")
            else:
                logger.warning("⚠️ Vision enabled but no screenshot available")
        messages = [ChatMessage(role="user", blocks=blocks)]

        try:
            response = await acall_with_retries(self.llm, messages)
            response_text = str(response)
        except Exception as e:
            raise RuntimeError(f"Error calling LLM in executor: {e}") from e

        # Parse response
        try:
            parsed = parse_executor_response(response_text)
        except Exception as e:
            logger.error(f"❌ Failed to parse executor response: {e}")
            return ExecutorActionEvent(
                action_json=json.dumps({"action": "invalid"}),
                thought=f"Failed to parse response: {str(e)}",
                description="Invalid response format from LLM"
            )

        logger.info(f"💡 Thought: {parsed['thought']}")
        logger.info(f"🎯 Action: {parsed['action']}")
        logger.debug(f" - Description: {parsed['description']}")

        return ExecutorActionEvent(
            action_json=parsed["action"],
            thought=parsed["thought"],
            description=parsed["description"]
        )

    def _invalid_action_result(self, ev: ExecutorActionEvent, error: str) -> ExecutorResultEvent:
        """Build the failure result reported when ``ev.action_json`` is unusable."""
        return ExecutorResultEvent(
            action={"action": "invalid"},
            outcome=False,
            error=error,
            summary="Failed to parse action",
            thought=ev.thought,
            action_json=ev.action_json
        )

    @step
    async def execute(
        self,
        ctx: Context,
        ev: ExecutorActionEvent
    ) -> ExecutorResultEvent:
        """
        Execute the selected action using the tools instance.

        Decodes ``ev.action_json``, runs the action, and sleeps for
        ``config.agent.after_sleep_action`` seconds after a successful action.
        Malformed JSON, or JSON that is not an object, yields a failed result
        instead of raising.
        """
        logger.info(f"⚡ Executing action: {ev.description}")

        # Parse action JSON
        try:
            action_dict = json.loads(ev.action_json)
        except json.JSONDecodeError as e:
            logger.error(f"❌ Failed to parse action JSON: {e}")
            return self._invalid_action_result(ev, f"Invalid action JSON: {str(e)}")

        # json.loads accepts any JSON value (list, string, number, ...); only
        # an object can carry the "action" key and its parameters.
        # NOTE: ``type`` is shadowed in this module by the imported tool, so
        # the class name is read via ``__class__`` instead.
        if not isinstance(action_dict, dict):
            kind = action_dict.__class__.__name__
            logger.error(f"❌ Failed to parse action JSON: expected an object, got {kind}")
            return self._invalid_action_result(
                ev, f"Invalid action JSON: expected an object, got {kind}"
            )

        # Execute the action
        outcome, error, summary = await self._execute_action(action_dict, ev.description)

        if outcome:
            await asyncio.sleep(config.agent.after_sleep_action)

        logger.info(f"{'✅' if outcome else '❌'} Execution complete: {summary}")

        return ExecutorResultEvent(
            action=action_dict,
            outcome=outcome,
            error=error,
            summary=summary,
            thought=ev.thought,
            action_json=ev.action_json
        )

    async def _execute_action(self, action_dict: dict, description: str) -> tuple[bool, str, str]:
        """
        Execute a single action based on the action dictionary.

        Custom tools take precedence over the built-in atomic actions.
        Exceptions raised by an atomic action are caught and reported as a
        failed outcome.

        Args:
            action_dict: Dictionary containing action type and parameters
            description: Human-readable description of the action

        Returns:
            Tuple of (outcome: bool, error: str, summary: str); ``error`` is
            the string "None" on success.
        """
        action_type = action_dict.get("action", "unknown")

        # Check custom_tools first (before atomic actions). The isinstance
        # guard keeps an unhashable "action" value (e.g. a JSON list) from
        # raising on the dict lookup; it falls through to "unknown action".
        if isinstance(action_type, str) and action_type in self.custom_tools:
            return await self._execute_custom_tool(action_type, action_dict)

        try:
            if action_type == "click":
                index = action_dict.get("index")
                if index is None:
                    return False, "Missing 'index' parameter", "Failed: click requires index"

                click(self.tools_instance, index)
                return True, "None", f"Clicked element at index {index}"

            elif action_type == "long_press":
                index = action_dict.get("index")
                if index is None:
                    return False, "Missing 'index' parameter", "Failed: long_press requires index"

                if long_press(self.tools_instance, index):
                    return True, "None", f"Long pressed element at index {index}"
                return False, "Long press failed", f"Failed to long press at index {index}"

            elif action_type == "type":
                text = action_dict.get("text")
                index = action_dict.get("index", -1)

                if text is None:
                    return False, "Missing 'text' parameter", "Failed: type requires text"

                # ``type`` here is the tool imported at module level, not the builtin.
                type(self.tools_instance, text, index)
                return True, "None", f"Typed '{text}' into element at index {index}"

            elif action_type == "system_button":
                button = action_dict.get("button")
                if button is None:
                    return False, "Missing 'button' parameter", "Failed: system_button requires button"

                result = system_button(self.tools_instance, button)
                if "Error" in result:
                    return False, result, f"Failed to press {button} button"
                return True, "None", f"Pressed {button} button"

            elif action_type == "swipe":
                coordinate = action_dict.get("coordinate")
                coordinate2 = action_dict.get("coordinate2")

                if coordinate is None or coordinate2 is None:
                    return False, "Missing coordinate parameters", "Failed: swipe requires coordinate and coordinate2"

                # Validate coordinate format before calling swipe
                if not isinstance(coordinate, list) or len(coordinate) != 2:
                    return False, f"Invalid coordinate format: {coordinate}", "Failed: coordinate must be [x, y]"
                if not isinstance(coordinate2, list) or len(coordinate2) != 2:
                    return False, f"Invalid coordinate2 format: {coordinate2}", "Failed: coordinate2 must be [x, y]"

                if swipe(self.tools_instance, coordinate, coordinate2):
                    return True, "None", f"Swiped from {coordinate} to {coordinate2}"
                return False, "Swipe failed", f"Failed to swipe from {coordinate} to {coordinate2}"

            elif action_type == "open_app":
                text = action_dict.get("text")
                if text is None:
                    return False, "Missing 'text' parameter", "Failed: open_app requires text"

                open_app(self.tools_instance, text)
                return True, "None", f"Opened app: {text}"

            else:
                return False, f"Unknown action type: {action_type}", f"Failed: unknown action '{action_type}'"

        except Exception as e:
            logger.error(f"❌ Exception during action execution: {e}", exc_info=True)
            return False, f"Exception: {str(e)}", f"Failed to execute {action_type}: {str(e)}"

    async def _execute_custom_tool(self, action_type: str, action_dict: dict) -> tuple[bool, str, str]:
        """
        Execute a custom tool based on the action dictionary.

        The tool callable is taken from ``self.custom_tools[action_type]["function"]``
        and is awaited when it is a coroutine function.

        Args:
            action_type: The custom tool name
            action_dict: Dictionary containing action parameters

        Returns:
            Tuple of (outcome: bool, error: str, summary: str)
        """
        try:
            tool_spec = self.custom_tools[action_type]
            tool_func = tool_spec["function"]

            # Extract arguments (exclude 'action' key)
            tool_args = {k: v for k, v in action_dict.items() if k != "action"}

            # First argument is always tools_instance (same pattern as atomic actions)
            if asyncio.iscoroutinefunction(tool_func):
                result = await tool_func(self.tools_instance, **tool_args)
            else:
                result = tool_func(self.tools_instance, **tool_args)

            # Success case
            summary = f"Executed custom tool '{action_type}'"
            if result is not None:
                summary += f": {str(result)}"

            return True, "None", summary

        except TypeError as e:
            # Likely missing or wrong arguments
            error_msg = f"Invalid arguments for custom tool '{action_type}': {str(e)}"
            logger.error(f"❌ {error_msg}")
            return False, error_msg, f"Failed: {action_type}"

        except Exception as e:
            # General execution error
            error_msg = f"Error executing custom tool '{action_type}': {str(e)}"
            logger.error(f"❌ {error_msg}", exc_info=True)
            return False, error_msg, f"Failed: {action_type}"

    @step
    async def finalize(
        self,
        ctx: Context,
        ev: ExecutorResultEvent
    ) -> StopEvent:
        """Return executor results to parent workflow as a plain dict."""
        logger.debug("✅ Executor execution complete")

        return StopEvent(result={
            "action": ev.action,
            "outcome": ev.outcome,
            "error": ev.error,
            "summary": ev.summary,
            "thought": ev.thought,
            "action_json": ev.action_json
        })
@@ -0,0 +1,136 @@
1
+ """
2
+ Prompts for the ExecutorAgent.
3
+ """
4
+
5
+
6
+ from droidrun.agent.droid.events import DroidAgentState
7
+ from droidrun.agent.utils.tools import ATOMIC_ACTION_SIGNATURES
8
+
9
+
10
def build_executor_system_prompt(
    state: "DroidAgentState",
    subgoal: str,
    app_card: str = "",
) -> str:
    """
    Build the complete Executor system prompt with all context.

    The optional sections (app card, device state, progress, action history)
    are assembled as plain strings before the main template is formatted.
    Keeping backslash escapes out of the f-string replacement fields lets the
    module import on Python versions older than 3.12.

    Args:
        state: Current DroidAgentState with all context. Reads
            ``instruction``, ``device_state_text``, ``plan``,
            ``progress_status``, ``action_history``, ``summary_history``,
            ``action_outcomes`` and ``error_descriptions``.
        subgoal: Current subgoal to execute
        app_card: Optional app-specific instructions

    Returns:
        Complete system prompt for the Executor

    Raises:
        ValueError: If the recent slices of the four history lists differ in
            length (they are zipped with ``strict=True``).
    """
    # Optional "App Card" section, omitted when the card is blank.
    app_card_text = app_card.strip()
    if app_card_text:
        app_card_section = (
            "App card gives information on how to operate the app and perform actions.\n"
            "### App Card ###\n" + app_card_text + "\n\n"
        )
    else:
        app_card_section = ""

    # Optional "Device State" section, omitted when there is no state text.
    device_state_text = state.device_state_text.strip()
    if device_state_text:
        device_state_section = "### Device State ###\n" + device_state_text + "\n\n"
    else:
        device_state_section = ""

    if state.progress_status != "":
        progress_section = state.progress_status + "\n\n"
    else:
        progress_section = "No progress yet.\n\n"

    atomic_actions = "\n".join(
        f"- {action_name}({', '.join(action_info['arguments'])}): {action_info['description']}"
        for action_name, action_info in ATOMIC_ACTION_SIGNATURES.items()
    )

    # Only the five most recent actions are shown; outcome "A" marks success.
    if state.action_history:
        window = min(5, len(state.action_history))
        history_lines = [
            (
                f"Action: {act} | Description: {summ} | Outcome: Successful"
                if outcome == "A"
                else f"Action: {act} | Description: {summ} | Outcome: Failed | Feedback: {err_des}"
            )
            for act, summ, outcome, err_des in zip(
                state.action_history[-window:],
                state.summary_history[-window:],
                state.action_outcomes[-window:],
                state.error_descriptions[-window:],
                strict=True,
            )
        ]
        history_section = (
            "Recent actions you took previously and whether they were successful:\n"
            + "\n".join(history_lines)
            + "\n\n"
        )
    else:
        history_section = "No actions have been taken yet.\n\n"

    prompt = f"""You are a LOW-LEVEL ACTION EXECUTOR for an Android phone. You do NOT answer questions or provide results. You ONLY perform individual atomic actions as specified in the current subgoal. You are part of a larger system - your job is to execute actions, not to think about or answer the user's original question.

### User Request ###
{state.instruction}

{app_card_section}{device_state_section}### Overall Plan ###
{state.plan}

### Current Subgoal ###
EXECUTE THIS SUBGOAL: {subgoal}

EXECUTION MODE: You are a dumb robot. Find the exact text/element mentioned in the subgoal above and perform the specified action on it. Do not read anything below this line until after you execute the subgoal.

### SUBGOAL PARSING MODE ###
Read the current subgoal exactly as written. Look for:
- Action words: "tap", "click", "swipe", "type", "press", "open" etc.
- Target elements: specific text, buttons, fields, coordinates mentioned
- Locations: "header", "bottom", "left", "right", specific coordinates
Convert directly to atomic action:
- "tap/click" → click action
- "swipe" → swipe action
- "type" → type action
- "press [system button]" → system_button action
- "open [app]" → open_app action
Execute the atomic action for the exact target mentioned. Ignore everything else.

### Progress Status ###
{progress_section}

### Guidelines ###
General:
- For any pop-up window, such as a permission request, you need to close it (e.g., by clicking `Don't Allow` or `Accept & continue`) before proceeding. Never choose to add any account or log in.
Action Related:
- Use the `open_app` action whenever you want to open an app (nothing will happen if the app is not installed), do not use the app drawer to open an app.
- Consider exploring the screen by using the `swipe` action with different directions to reveal additional content. Or use search to quickly find a specific entry, if applicable.
- If you cannot change the page content by swiping in the same direction continuously, the page may have been swiped to the bottom. Please try another operation to display more content.
- For some horizontally distributed tags, you can swipe horizontally to view more.
Text Related Operations:
- Activated input box: If an input box is activated, it may have a cursor inside it and the keyboard is visible. If there is no cursor on the screen but the keyboard is visible, it may be because the cursor is blinking. The color of the activated input box will be highlighted. If you are not sure whether the input box is activated, click it before typing.
- To input some text: first click the input box that you want to input, make sure the correct input box is activated and the keyboard is visible, then use `type` action to enter the specified text.
- To clear the text: long press the backspace button in the keyboard.
- To copy some text: first long press the text you want to copy, then click the `copy` button in bar.
- To paste text into a text box: first long press the text box, then click the `paste` button in bar.

---
Execute the current subgoal mechanically. Do NOT examine the screen content or make decisions about what you see. Parse the current subgoal text to identify the required action and execute it exactly as written. You must choose your action from one of the atomic actions.

#### Atomic Actions ####
The atomic action functions are listed in the format of `action(arguments): description` as follows:
{atomic_actions}
\n
### Latest Action History ###
{history_section}

---
### LITERAL EXECUTION RULE ###
Whatever the current subgoal says to do, do that EXACTLY. Do not substitute with what you think is better. Do not optimize. Do not consider screen state. Parse the subgoal text literally and execute the matching atomic action.

IMPORTANT:
1. Do NOT repeat previously failed actions multiple times. Try changing to another action.
2. Must do the current subgoal.

Provide your output in the following format, which contains three parts:

### Thought ###
Break down the current subgoal into: (1) What atomic action is required? (2) What target/location is specified? (3) What parameters do I need? Do NOT reason about whether this makes sense - just mechanically convert the subgoal text into the appropriate action format.

### Action ###
Choose only one action or shortcut from the options provided.
You must provide your decision using a valid JSON format specifying the `action` and the arguments of the action. For example, if you want to open an App, you should write {{ "action":"open_app", "text": "app name" }}.

### Description ###
A brief description of the chosen action. Do not describe expected outcome.
"""

    return prompt
111
+
112
+
113
def parse_executor_response(response: str) -> dict:
    """
    Parse the Executor LLM response.

    Extracts:
    - thought: Content between "### Thought" and "### Action"
    - action: Content between "### Action" and "### Description"
    - description: Content after "### Description"

    Each section has leftover "###" markers removed and every run of
    whitespace (spaces, tabs, newlines) collapsed to a single space, so the
    values are single-line strings without leading or trailing blanks.
    When a header is missing, the corresponding split leaves the text
    unsplit, so that section falls back to the surrounding text.

    Args:
        response: Raw LLM response string

    Returns:
        Dictionary with 'thought', 'action', 'description' keys
    """

    def _clean(section: str) -> str:
        # Drop header residue such as the trailing " ###" of "### Thought ###",
        # then normalise whitespace; str.split() with no argument handles runs
        # of any length, unlike a single two-space -> one-space replace.
        return " ".join(section.replace("###", "").split())

    after_thought = response.split("### Thought")[-1]
    after_action = response.split("### Action")[-1]
    after_description = response.split("### Description")[-1]

    return {
        "thought": _clean(after_thought.split("### Action")[0]),
        "action": _clean(after_action.split("### Description")[0]),
        "description": _clean(after_description),
    }
@@ -0,0 +1,18 @@
1
+ """
2
+ Manager Agent - Planning and reasoning workflow.
3
+ """
4
+
5
+ from droidrun.agent.manager.events import ManagerPlanEvent, ManagerThinkingEvent
6
+ from droidrun.agent.manager.manager_agent import ManagerAgent
7
+ from droidrun.agent.manager.prompts import (
8
+ build_manager_system_prompt,
9
+ parse_manager_response,
10
+ )
11
+
12
+ __all__ = [
13
+ "ManagerAgent",
14
+ "ManagerThinkingEvent",
15
+ "ManagerPlanEvent",
16
+ "build_manager_system_prompt",
17
+ "parse_manager_response",
18
+ ]
@@ -0,0 +1,20 @@
1
+ """
2
+ Events for the ManagerAgent workflow.
3
+ """
4
+
5
+ from llama_index.core.workflow.events import Event
6
+
7
+
8
+ class ManagerThinkingEvent(Event):
9
+ """Manager is thinking about the plan"""
10
+ pass
11
+
12
+
13
+ class ManagerPlanEvent(Event):
14
+ """Manager has created a plan"""
15
+ plan: str
16
+ current_subgoal: str
17
+ completed_plan: str
18
+ thought: str
19
+ manager_answer: str = ""
20
+ memory_update: str = ""