droidrun 0.3.8__py3-none-any.whl → 0.3.10.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- droidrun/__init__.py +2 -3
- droidrun/__main__.py +1 -1
- droidrun/agent/__init__.py +1 -1
- droidrun/agent/codeact/__init__.py +1 -4
- droidrun/agent/codeact/codeact_agent.py +112 -48
- droidrun/agent/codeact/events.py +6 -3
- droidrun/agent/codeact/prompts.py +2 -2
- droidrun/agent/common/constants.py +2 -0
- droidrun/agent/common/events.py +5 -3
- droidrun/agent/context/__init__.py +1 -3
- droidrun/agent/context/agent_persona.py +2 -1
- droidrun/agent/context/context_injection_manager.py +6 -6
- droidrun/agent/context/episodic_memory.py +5 -3
- droidrun/agent/context/personas/__init__.py +3 -3
- droidrun/agent/context/personas/app_starter.py +3 -3
- droidrun/agent/context/personas/big_agent.py +3 -3
- droidrun/agent/context/personas/default.py +3 -3
- droidrun/agent/context/personas/ui_expert.py +5 -5
- droidrun/agent/context/task_manager.py +15 -17
- droidrun/agent/droid/__init__.py +1 -1
- droidrun/agent/droid/droid_agent.py +327 -182
- droidrun/agent/droid/events.py +91 -9
- droidrun/agent/executor/__init__.py +13 -0
- droidrun/agent/executor/events.py +24 -0
- droidrun/agent/executor/executor_agent.py +327 -0
- droidrun/agent/executor/prompts.py +136 -0
- droidrun/agent/manager/__init__.py +18 -0
- droidrun/agent/manager/events.py +20 -0
- droidrun/agent/manager/manager_agent.py +459 -0
- droidrun/agent/manager/prompts.py +223 -0
- droidrun/agent/oneflows/app_starter_workflow.py +118 -0
- droidrun/agent/oneflows/text_manipulator.py +204 -0
- droidrun/agent/planner/__init__.py +3 -3
- droidrun/agent/planner/events.py +6 -3
- droidrun/agent/planner/planner_agent.py +60 -53
- droidrun/agent/planner/prompts.py +2 -2
- droidrun/agent/usage.py +15 -13
- droidrun/agent/utils/__init__.py +11 -1
- droidrun/agent/utils/async_utils.py +2 -1
- droidrun/agent/utils/chat_utils.py +48 -60
- droidrun/agent/utils/device_state_formatter.py +177 -0
- droidrun/agent/utils/executer.py +13 -12
- droidrun/agent/utils/inference.py +114 -0
- droidrun/agent/utils/llm_picker.py +2 -0
- droidrun/agent/utils/message_utils.py +85 -0
- droidrun/agent/utils/tools.py +220 -0
- droidrun/agent/utils/trajectory.py +8 -7
- droidrun/cli/__init__.py +1 -1
- droidrun/cli/logs.py +29 -28
- droidrun/cli/main.py +279 -143
- droidrun/config_manager/__init__.py +25 -0
- droidrun/config_manager/config_manager.py +583 -0
- droidrun/macro/__init__.py +2 -2
- droidrun/macro/__main__.py +1 -1
- droidrun/macro/cli.py +36 -34
- droidrun/macro/replay.py +7 -9
- droidrun/portal.py +1 -1
- droidrun/telemetry/__init__.py +2 -2
- droidrun/telemetry/events.py +3 -4
- droidrun/telemetry/phoenix.py +173 -0
- droidrun/telemetry/tracker.py +7 -5
- droidrun/tools/__init__.py +1 -1
- droidrun/tools/adb.py +210 -82
- droidrun/tools/ios.py +7 -5
- droidrun/tools/tools.py +25 -8
- {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/METADATA +13 -7
- droidrun-0.3.10.dev2.dist-info/RECORD +70 -0
- droidrun/agent/common/default.py +0 -5
- droidrun/agent/context/reflection.py +0 -20
- droidrun/agent/oneflows/reflector.py +0 -265
- droidrun-0.3.8.dist-info/RECORD +0 -55
- {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/WHEEL +0 -0
- {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/entry_points.txt +0 -0
- {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/licenses/LICENSE +0 -0
droidrun/agent/manager/manager_agent.py
@@ -0,0 +1,459 @@
+"""
+ManagerAgent - Planning and reasoning workflow.
+
+This agent is responsible for:
+- Analyzing the current state
+- Creating plans and subgoals
+- Tracking progress
+- Deciding when tasks are complete
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, List
+
+from llama_index.core.llms.llm import LLM
+from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step
+
+from droidrun.agent.manager.events import ManagerPlanEvent, ManagerThinkingEvent
+from droidrun.agent.manager.prompts import build_manager_system_prompt, parse_manager_response
+from droidrun.agent.utils import convert_messages_to_chatmessages
+from droidrun.agent.utils.chat_utils import remove_empty_messages
+from droidrun.agent.utils.device_state_formatter import get_device_state_exact_format
+from droidrun.agent.utils.inference import acall_with_retries
+from droidrun.agent.utils.tools import build_custom_tool_descriptions
+
+if TYPE_CHECKING:
+    from droidrun.agent.droid.events import DroidAgentState
+    from droidrun.tools import Tools
+
+logger = logging.getLogger("droidrun")
+
+
+class ManagerAgent(Workflow):
+    """
+    Planning and reasoning agent that decides what to do next.
+
+    The Manager:
+    1. Analyzes current device state and action history
+    2. Creates plans with specific subgoals
+    3. Tracks progress and completed steps
+    4. Decides when tasks are complete or need to provide answers
+    """
+
+    def __init__(
+        self,
+        llm: LLM,
+        vision: bool,
+        personas: List,
+        tools_instance: "Tools",
+        shared_state: "DroidAgentState",
+        custom_tools: dict = None,
+        debug: bool = False,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.llm = llm
+        self.vision = vision
+        self.personas = personas
+        self.tools_instance = tools_instance
+        self.shared_state = shared_state
+        self.custom_tools = custom_tools or {}
+        self.debug = debug
+
+        logger.info("✅ ManagerAgent initialized successfully.")
+
+    # ========================================================================
+    # Helper Methods
+    # ========================================================================
+
+    def _build_system_prompt(
+        self,
+        has_text_to_modify: bool
+    ) -> str:
+        """
+        Build system prompt with all context.
+
+        Args:
+            has_text_to_modify: Whether text manipulation mode is enabled
+
+        Returns:
+            Complete system prompt
+        """
+
+        # Get error history if error_flag_plan is set
+        error_history = []
+        if self.shared_state.error_flag_plan:
+            k = self.shared_state.err_to_manager_thresh
+            error_history = [
+                {
+                    "action": act,
+                    "summary": summ,
+                    "error": err_des
+                }
+                for act, summ, err_des in zip(
+                    self.shared_state.action_history[-k:],
+                    self.shared_state.summary_history[-k:],
+                    self.shared_state.error_descriptions[-k:], strict=True
+                )
+            ]
+
+        # Build custom tools descriptions
+        custom_tools_descriptions = build_custom_tool_descriptions(self.custom_tools)
+
+        return build_manager_system_prompt(
+            instruction=self.shared_state.instruction,
+            has_text_to_modify=has_text_to_modify,
+            app_card="",  # TODO: implement app card retrieval system
+            device_date=self.tools_instance.get_date(),
+            important_notes="",  # TODO: expose important_notes in DroidAgentState if needed
+            error_flag=self.shared_state.error_flag_plan,
+            error_history=error_history,
+            custom_tools_descriptions=custom_tools_descriptions
+        )
+
+    def _build_messages_with_context(
+        self,
+        system_prompt: str,
+        screenshot: str = None
+    ) -> list[dict]:
+        """
+        Build messages from history and inject current context.
+
+        Args:
+            system_prompt: System prompt to use
+            screenshot: Path to current screenshot (if vision enabled)
+
+        Returns:
+            List of message dicts ready for conversion
+        """
+        import copy
+
+        # Start with system message
+        messages = [
+            {"role": "system", "content": [{"text": system_prompt}]}
+        ]
+
+        # Add accumulated message history (deep copy to avoid mutation)
+        messages.extend(copy.deepcopy(self.shared_state.message_history))
+
+        # ====================================================================
+        # Inject memory, device state, screenshot to LAST user message
+        # ====================================================================
+        # Find last user message index
+        user_indices = [i for i, msg in enumerate(messages) if msg['role'] == 'user']
+
+        if user_indices:
+            last_user_idx = user_indices[-1]
+
+            # Add memory to last user message
+            current_memory = (self.shared_state.memory or "").strip()
+            if current_memory:
+                if messages[last_user_idx]['content'] and 'text' in messages[last_user_idx]['content'][0]:
+                    messages[last_user_idx]['content'][0]['text'] += f"\n<memory>\n{current_memory}\n</memory>\n"
+                else:
+                    messages[last_user_idx]['content'].insert(0, {"text": f"<memory>\n{current_memory}\n</memory>\n"})
+
+            # Add device state to last user message
+            current_a11y = (self.shared_state.ui_elements_list_after or self.shared_state.device_state_text or "").strip()
+            if current_a11y:
+                if messages[last_user_idx]['content'] and 'text' in messages[last_user_idx]['content'][0]:
+                    messages[last_user_idx]['content'][0]['text'] += f"\n<device_state>\n{current_a11y}\n</device_state>\n"
+                else:
+                    messages[last_user_idx]['content'].insert(0, {"text": f"<device_state>\n{current_a11y}\n</device_state>\n"})
+
+            # Add screenshot to last user message
+            if screenshot and self.vision:
+                messages[last_user_idx]['content'].append({"image": screenshot})
+
+            # Add previous device state to SECOND-TO-LAST user message (if exists)
+            if len(user_indices) >= 2:
+                second_last_user_idx = user_indices[-2]
+                prev_a11y = (self.shared_state.ui_elements_list_before or "").strip()
+
+                if prev_a11y:
+                    if messages[second_last_user_idx]['content'] and 'text' in messages[second_last_user_idx]['content'][0]:
+                        messages[second_last_user_idx]['content'][0]['text'] += f"\n<device_state>\n{prev_a11y}\n</device_state>\n"
+                    else:
+                        messages[second_last_user_idx]['content'].insert(0, {"text": f"<device_state>\n{prev_a11y}\n</device_state>\n"})
+        messages = remove_empty_messages(messages)
+        return messages
+
+    async def _validate_and_retry_llm_call(
+        self,
+        ctx: Context,
+        initial_messages: list[dict],
+        initial_response: str
+    ) -> str:
+        """
+        Validate LLM response and retry if needed.
+
+        Args:
+            ctx: Workflow context
+            initial_messages: Messages sent to LLM
+            initial_response: Initial LLM response
+
+        Returns:
+            Final validated response (may be same as initial or from retry)
+        """
+
+        output_planning = initial_response
+        parsed = parse_manager_response(output_planning)
+
+        max_retries = 3
+        retry_count = 0
+
+        while retry_count < max_retries:
+            # Validation rules
+            error_message = None
+
+            if parsed["answer"] and not parsed["plan"]:
+                # Valid: answer without plan (task complete)
+                break
+            elif parsed["plan"] and parsed["answer"]:
+                error_message = "You cannot use both request_accomplished tag while the plan is not finished. If you want to use request_accomplished tag, please make sure the plan is finished.\nRetry again."
+            elif not parsed["plan"]:
+                error_message = "You must provide a plan to complete the task. Please provide a plan with the correct format."
+            else:
+                # Valid: plan without answer
+                break
+
+            if error_message:
+                retry_count += 1
+                logger.warning(f"Manager response invalid (retry {retry_count}/{max_retries}): {error_message}")
+
+                # Retry with error message
+                retry_messages = initial_messages + [
+                    {"role": "assistant", "content": [{"text": output_planning}]},
+                    {"role": "user", "content": [{"text": error_message}]}
+                ]
+
+                chat_messages = convert_messages_to_chatmessages(retry_messages)
+
+                try:
+                    response = await acall_with_retries(self.llm, chat_messages)
+                    output_planning = response.message.content
+                    parsed = parse_manager_response(output_planning)
+                except Exception as e:
+                    logger.error(f"LLM retry failed: {e}")
+                    break  # Give up retrying
+
+        return output_planning
+
+    # ========================================================================
+    # Workflow Steps
+    # ========================================================================
+
+    @step
+    async def prepare_input(
+        self,
+        ctx: Context,
+        ev: StartEvent
+    ) -> ManagerThinkingEvent:
+        """
+        Gather context and prepare manager prompt.
+
+        This step:
+        1. Gets current device state (UI elements, screenshot)
+        2. Detects text manipulation mode
+        3. Builds message history entry with last action
+        4. Stores context for think() step
+        """
+        logger.info("💬 Preparing manager input...")
+
+        # ====================================================================
+        # Step 1: Get device state (UI elements accessibility tree)
+        # ====================================================================
+        device_state_text, focused_text = get_device_state_exact_format(self.tools_instance.get_state())
+
+        # ====================================================================
+        # Step 2: Capture screenshot if vision enabled
+        # ====================================================================
+        screenshot = None
+        if self.vision:
+            try:
+                result = self.tools_instance.take_screenshot()
+                if isinstance(result, tuple):
+                    success, screenshot = result
+                    if not success:
+                        screenshot = None
+                else:
+                    screenshot = result
+                logger.debug("📸 Screenshot captured for Manager")
+            except Exception as e:
+                logger.warning(f"Failed to capture screenshot: {e}")
+                screenshot = None
+
+        # ====================================================================
+        # Step 3: Detect text manipulation mode
+        # ====================================================================
+        focused_text = focused_text or ""
+        focused_text_clean = focused_text.replace("'", "").strip()
+
+        # Check if focused text differs from last typed text
+        # last_typed_text = ""
+        # if self.shared_state.action_history:
+        #     recent_actions = self.shared_state.action_history[-1:] if len(self.shared_state.action_history) >= 1 else []
+        #     for action in reversed(recent_actions):
+        #         if isinstance(action, dict) and action.get('action') == 'type':
+        #             last_typed_text = action.get('text', '')
+        #             break
+
+        has_text_to_modify = (focused_text_clean != "")
+
+        # ====================================================================
+        # Step 4: Update state with device info
+        # ====================================================================
+        self.shared_state.device_state_text = device_state_text
+        self.shared_state.focused_text = focused_text
+        # Shift UI elements: before ← after, after ← current
+        self.shared_state.ui_elements_list_before = self.shared_state.ui_elements_list_after
+        self.shared_state.ui_elements_list_after = device_state_text
+
+        # ====================================================================
+        # Step 5: Build user message entry
+        # ====================================================================
+        parts = []
+
+        # Add context from last action
+        if self.shared_state.finish_thought:
+            parts.append(f"<thought>\n{self.shared_state.finish_thought}\n</thought>\n")
+
+        if self.shared_state.last_action:
+            import json
+            action_str = json.dumps(self.shared_state.last_action)
+            parts.append(f"<last_action>\n{action_str}\n</last_action>\n")
+
+        if self.shared_state.last_summary:
+            parts.append(f"<last_action_description>\n{self.shared_state.last_summary}\n</last_action_description>\n")
+
+
+        self.shared_state.message_history.append({
+            "role": "user",
+            "content": [{"text": "".join(parts)}]
+        })
+
+        # Store has_text_to_modify and screenshot for next step
+        self.shared_state.has_text_to_modify = has_text_to_modify
+        self.shared_state.screenshot = screenshot
+
+        logger.debug(f"  - Device state prepared (text_modify={has_text_to_modify}, screenshot={screenshot is not None})")
+        return ManagerThinkingEvent()
+
+    @step
+    async def think(
+        self,
+        ctx: Context,
+        ev: ManagerThinkingEvent
+    ) -> ManagerPlanEvent:
+        """
+        Manager reasons and creates plan.
+
+        This step:
+        1. Builds system prompt with all context
+        2. Builds messages from history with injected context
+        3. Calls LLM
+        4. Validates and retries if needed
+        5. Parses response
+        6. Updates state (memory, message history)
+        """
+        logger.info("🧠 Manager thinking about the plan...")
+
+        has_text_to_modify = self.shared_state.has_text_to_modify
+        screenshot = self.shared_state.screenshot
+
+        # ====================================================================
+        # Step 1: Build system prompt
+        # ====================================================================
+        system_prompt = self._build_system_prompt(has_text_to_modify)
+
+        # ====================================================================
+        # Step 2: Build messages with context
+        # ====================================================================
+        messages = self._build_messages_with_context(
+            system_prompt=system_prompt,
+            screenshot=screenshot
+        )
+
+        # ====================================================================
+        # Step 3: Convert messages and call LLM
+        # ====================================================================
+        chat_messages = convert_messages_to_chatmessages(messages)
+
+        try:
+            response = await acall_with_retries(self.llm, chat_messages)
+            output_planning = response.message.content
+        except Exception as e:
+            logger.error(f"LLM call failed: {e}")
+            raise RuntimeError(f"Error calling LLM in manager: {e}") from e
+
+        # ====================================================================
+        # Step 4: Validate and retry if needed
+        # ====================================================================
+        output_planning = await self._validate_and_retry_llm_call(
+            ctx=ctx,
+            initial_messages=messages,
+            initial_response=output_planning
+        )
+
+        # ====================================================================
+        # Step 5: Parse response
+        # ====================================================================
+        parsed = parse_manager_response(output_planning)
+
+        # ====================================================================
+        # Step 6: Update state
+        # ====================================================================
+        memory_update = parsed.get("memory", "").strip()
+
+        # Update memory (append, not replace)
+        if memory_update:
+            if self.shared_state.memory:
+                self.shared_state.memory += "\n" + memory_update
+            else:
+                self.shared_state.memory = memory_update
+
+        # Append assistant response to message history
+        self.shared_state.message_history.append({
+            "role": "assistant",
+            "content": [{"text": output_planning}]
+        })
+
+        # Update planning fields
+        self.shared_state.plan = parsed["plan"]
+        self.shared_state.current_subgoal = parsed["current_subgoal"]
+        self.shared_state.completed_plan = parsed.get("completed_subgoal", "No completed subgoal.")
+        self.shared_state.finish_thought = parsed["thought"]
+        self.shared_state.manager_answer = parsed["answer"]
+
+        logger.info(f"📝 Plan: {parsed['plan'][:100]}...")
+        logger.debug(f"  - Current subgoal: {parsed['current_subgoal']}")
+        logger.debug(f"  - Manager answer: {parsed['answer'][:50] if parsed['answer'] else 'None'}")
+
+        return ManagerPlanEvent(
+            plan=parsed["plan"],
+            current_subgoal=parsed["current_subgoal"],
+            completed_plan=parsed.get("completed_subgoal", "No completed subgoal."),
+            thought=parsed["thought"],
+            manager_answer=parsed["answer"],
+            memory_update=memory_update
+        )
+
+    @step
+    async def finalize(
+        self,
+        ctx: Context,
+        ev: ManagerPlanEvent
+    ) -> StopEvent:
+        """Return manager results to parent workflow."""
+        logger.debug("✅ Manager planning complete")
+
+        return StopEvent(result={
+            "plan": ev.plan,
+            "current_subgoal": ev.current_subgoal,
+            "completed_plan": ev.completed_plan,
+            "thought": ev.thought,
+            "manager_answer": ev.manager_answer,
+            "memory_update": ev.memory_update
+        })
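For orientation, here is a minimal, hedged sketch of how the new `ManagerAgent` could be driven on its own, based only on the constructor and `StopEvent` result visible in the hunk above. The concrete `llm`, `tools`, and `shared_state` objects, and the `timeout` keyword forwarded to the llama_index `Workflow` base class, are assumptions for illustration; the real wiring lives in `droidrun/agent/droid/droid_agent.py`.

```python
# Hedged sketch: exercising ManagerAgent outside of DroidAgent (assumed setup).
import asyncio

from droidrun.agent.manager.manager_agent import ManagerAgent


async def run_manager_once(llm, tools, shared_state):
    # llm, tools, and shared_state are assumed to be a configured llama_index LLM,
    # a Tools implementation, and a populated DroidAgentState, respectively.
    manager = ManagerAgent(
        llm=llm,
        vision=True,
        personas=[],
        tools_instance=tools,
        shared_state=shared_state,
        custom_tools={},   # optional extra actions surfaced in the system prompt
        timeout=120,       # forwarded to the Workflow base via **kwargs (assumption)
    )
    # Workflow.run() resolves to the dict passed to StopEvent(result=...) in finalize().
    result = await manager.run()
    print(result["plan"])
    print(result["current_subgoal"])
    print(result["manager_answer"])  # non-empty once <request_accomplished> is emitted


# asyncio.run(run_manager_once(llm, tools, shared_state))
```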
droidrun/agent/manager/prompts.py
@@ -0,0 +1,223 @@
+"""
+Prompts for the ManagerAgent.
+"""
+
+import re
+
+
+def build_manager_system_prompt(
+    instruction: str,
+    has_text_to_modify: bool = False,
+    app_card: str = "",
+    device_date: str = "",
+    important_notes: str = "",
+    error_flag: bool = False,
+    error_history: list = [],  # noqa: B006
+    custom_tools_descriptions: str = ""
+) -> str:
+    """
+    Build the manager system prompt with all context.
+
+    Args:
+        instruction: User's goal/task
+        has_text_to_modify: Whether focused text field has editable content
+        app_card: App-specific instructions (TODO: implement app card system)
+        device_date: Current device date (TODO: implement via adb shell date)
+        important_notes: Additional important information
+        error_flag: Whether consecutive errors occurred
+        error_history: List of recent errors if error_flag=True
+        custom_tools_descriptions: Formatted descriptions of custom tools available to executor
+
+    Returns:
+        Complete system prompt for Manager
+    """
+    prompt = (
+        "You are an agent who can operate an Android phone on behalf of a user. "
+        "Your goal is to track progress and devise high-level plans to achieve the user's requests.\n\n"
+        "<user_request>\n"
+        f"{instruction}\n"
+        "</user_request>\n\n"
+    )
+
+
+    if device_date.strip():
+        prompt += f"<device_date>\n{device_date}\n</device_date>\n\n"
+
+
+
+
+    if app_card.strip():
+        prompt += "App card gives information on how to operate the app and perform actions.\n"
+        prompt += f"<app_card>\n{app_card.strip()}\n</app_card>\n\n"
+
+    # Important notes
+    if important_notes:
+        prompt += "<important_notes>\n"
+        prompt += f"{important_notes}\n"
+        prompt += "</important_notes>\n\n"
+
+    # Error escalation
+    if error_flag and error_history:
+        prompt += (
+            "<potentially_stuck>\n"
+            "You have encountered several failed attempts. Here are some logs:\n"
+        )
+        for error in error_history:
+            prompt += (
+                f"- Attempt: Action: {error['action']} | "
+                f"Description: {error['summary']} | "
+                f"Outcome: Failed | "
+                f"Feedback: {error['error']}\n"
+            )
+        prompt += "</potentially_stuck>\n\n"
+
+    # Guidelines
+    prompt += """<guidelines>
+The following guidelines will help you plan this request.
+General:
+1. Use the `open_app` action whenever you want to open an app, do not use the app drawer to open an app.
+2. Use search to quickly find a file or entry with a specific name, if search function is applicable.
+3. Only use copy to clipboard actions when the task specifically requires copying text to clipboard. Do not copy text just to use it later - use the Memory section instead.
+4. When you need to remember information for later use, store it in the Memory section (using <add_memory> tags) with step context (e.g., "At step X, I obtained [information] from [source]").
+5. File names in the user request must always match the exact file name you are working with, make that reflect in the plan too.
+6. Make sure names and titles are not cutoff. If the request is to check who sent a message, make sure to check the message sender's full name not just what appears in the notification because it might be cut off.
+7. Dates and file names must match the user query exactly.
+8. Don't do more than what the user asks for."""
+
+    # Text manipulation guidelines (conditional)
+    if has_text_to_modify:
+        prompt += """
+
+<text_manipulation>
+1. Use **TEXT_TASK:** prefix in your plan when you need to modify text in the currently focused text input field
+2. TEXT_TASK is for editing, formatting, or transforming existing text content in text boxes using Python code
+3. Do not use TEXT_TASK for extracting text from messages, typing new text, or composing messages
+4. The focused text field contains editable text that you can modify
+5. Example plan item: 'TEXT_TASK: Add "Hello World" at the beginning of the text'
+6. Always use TEXT_TASK for modifying text, do not try to select the text to copy/cut/paste or adjust the text
+</text_manipulation>"""
+
+    prompt += """
+
+Memory Usage:
+- Always include step context: "At step [number], I obtained [actual content] from [source]"
+- Store the actual content you observe, not just references (e.g., store full recipe text, not "found recipes")
+- Use memory instead of copying text unless specifically requested
+- Memory is append-only: whatever you put in <add_memory> tags gets added to existing memory, not replaced
+- Update memory to track progress on multi-step tasks
+
+</guidelines>"""
+
+    # Add custom tools section if custom tools are provided
+    if custom_tools_descriptions.strip():
+        prompt += """
+
+<custom_actions>
+The executor has access to these additional custom actions beyond the standard actions (click, type, swipe, etc.):
+""" + custom_tools_descriptions + """
+
+You can reference these custom actions or tell the Executer agent to use them in your plan when they help achieve the user's goal.
+</custom_actions>"""
+
+    prompt += """
+---
+Carefully assess the current status and the provided screenshot. Check if the current plan needs to be revised.
+Determine if the user request has been fully completed. If you are confident that no further actions are required, use the request_accomplished tag with a message in it. If the user request is not finished, update the plan and don't use it. If you are stuck with errors, think step by step about whether the overall plan needs to be revised to address the error.
+NOTE: 1. If the current situation prevents proceeding with the original plan or requires clarification from the user, make reasonable assumptions and revise the plan accordingly. Act as though you are the user in such cases. 2. Please refer to the helpful information and steps in the Guidelines first for planning. 3. If the first subgoal in plan has been completed, please update the plan in time according to the screenshot and progress to ensure that the next subgoal is always the first item in the plan. 4. If the first subgoal is not completed, please copy the previous round's plan or update the plan based on the completion of the subgoal.
+Provide your output in the following format, which contains four or five parts:
+
+<thought>
+An explanation of your rationale for the updated plan and current subgoal.
+</thought>
+
+<add_memory>
+Store important information here with step context for later reference. Always include "At step X, I obtained [actual content] from [source]".
+Examples:
+- At step 5, I obtained recipe details from recipes.jpg: Recipe 1 "Chicken Pasta" - ingredients: chicken, pasta, cream. Instructions: Cook pasta, sauté chicken, add cream.
+or
+- At step 12, I successfully added Recipe 1 to Broccoli app. Still need to add Recipe 2 and Recipe 3 from memory.
+Store important information here with step context for later reference.
+</add_memory>
+
+<plan>
+Please update or copy the existing plan according to the current page and progress. Please pay close attention to the historical operations. Please do not repeat the plan of completed content unless you can judge from the screen status that a subgoal is indeed not completed.
+</plan>
+
+<request_accomplished>
+Use this tag ONLY after actually completing the user's request through concrete actions, not at the beginning or for planning.
+
+1. Always include a message inside this tag confirming what you accomplished
+2. Ensure both opening and closing tags are present
+3. Use exclusively for signaling completed user requests
+</request_accomplished>"""
+
+    return prompt
+
+
+def parse_manager_response(response: str) -> dict:
+    """
+    Parse manager LLM response into structured dict.
+
+    Extracts XML-style tags from the response:
+    - <thought>...</thought>
+    - <add_memory>...</add_memory>
+    - <plan>...</plan>
+    - <request_accomplished>...</request_accomplished> (answer)
+    - <historical_operations>...</historical_operations> (optional, for completed plan)
+
+    Also derives:
+    - current_subgoal: first line of plan (with list markers removed)
+
+    Args:
+        response: Raw LLM response text
+
+    Returns:
+        Dict with keys:
+        - thought: str
+        - memory: str
+        - plan: str
+        - current_subgoal: str (first line of plan, cleaned)
+        - completed_subgoal: str
+        - answer: str (from request_accomplished tag)
+    """
+    def extract(tag: str) -> str:
+        """Extract content between XML-style tags."""
+        if f"<{tag}>" in response and f"</{tag}>" in response:
+            return response.split(f"<{tag}>", 1)[-1].split(f"</{tag}>", 1)[0].strip()
+        return ""
+
+    thought = extract("thought")
+    memory_section = extract("add_memory")
+    plan = extract("plan")
+    answer = extract("request_accomplished")
+
+    # Extract completed subgoal (optional historical_operations tag)
+    if "<historical_operations>" in response:
+        completed_subgoal = extract("historical_operations")
+    else:
+        completed_subgoal = "No completed subgoal."
+
+    # Parse current subgoal from first line of plan
+    current_goal_text = plan
+    # Prefer newline-separated plans; take the first non-empty line
+    plan_lines = [line.strip() for line in current_goal_text.splitlines() if line.strip()]
+    if plan_lines:
+        first_line = plan_lines[0]
+    else:
+        first_line = current_goal_text.strip()
+
+    # Remove common list markers like "1.", "-", "*", or bullet characters
+    first_line = re.sub(r"^\s*\d+\.\s*", "", first_line)  # Remove "1. ", "2. ", etc.
+    first_line = re.sub(r"^\s*[-*]\s*", "", first_line)  # Remove "- " or "* "
+    first_line = re.sub(r"^\s*•\s*", "", first_line)  # Remove bullet "• "
+
+    current_subgoal = first_line.strip()
+
+    return {
+        "thought": thought,
+        "completed_subgoal": completed_subgoal,
+        "plan": plan,
+        "memory": memory_section,
+        "current_subgoal": current_subgoal,
+        "answer": answer,
+    }
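As a quick sanity check on the parser added above, here is a small usage sketch. The sample response text is invented for illustration; the expected values follow directly from the tag-extraction and list-marker-stripping logic shown in the hunk.

```python
# Hedged sketch: exercising parse_manager_response on a made-up Manager reply.
from droidrun.agent.manager.prompts import parse_manager_response

sample = """<thought>
The Settings app is open; next I need the Wi-Fi page.
</thought>
<add_memory>
At step 3, I obtained the device name "Pixel 7" from Settings > About phone.
</add_memory>
<plan>
1. Open the Wi-Fi settings page.
2. Connect to the "HomeNet" network.
</plan>"""

parsed = parse_manager_response(sample)
assert parsed["current_subgoal"] == "Open the Wi-Fi settings page."  # "1. " marker stripped
assert parsed["completed_subgoal"] == "No completed subgoal."        # no <historical_operations> tag
assert parsed["answer"] == ""                                        # no <request_accomplished> tag
print(parsed["memory"])  # the <add_memory> content, appended to shared state by ManagerAgent.think()
```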