droidrun 0.3.8__py3-none-any.whl → 0.3.10.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. droidrun/__init__.py +2 -3
  2. droidrun/__main__.py +1 -1
  3. droidrun/agent/__init__.py +1 -1
  4. droidrun/agent/codeact/__init__.py +1 -4
  5. droidrun/agent/codeact/codeact_agent.py +112 -48
  6. droidrun/agent/codeact/events.py +6 -3
  7. droidrun/agent/codeact/prompts.py +2 -2
  8. droidrun/agent/common/constants.py +2 -0
  9. droidrun/agent/common/events.py +5 -3
  10. droidrun/agent/context/__init__.py +1 -3
  11. droidrun/agent/context/agent_persona.py +2 -1
  12. droidrun/agent/context/context_injection_manager.py +6 -6
  13. droidrun/agent/context/episodic_memory.py +5 -3
  14. droidrun/agent/context/personas/__init__.py +3 -3
  15. droidrun/agent/context/personas/app_starter.py +3 -3
  16. droidrun/agent/context/personas/big_agent.py +3 -3
  17. droidrun/agent/context/personas/default.py +3 -3
  18. droidrun/agent/context/personas/ui_expert.py +5 -5
  19. droidrun/agent/context/task_manager.py +15 -17
  20. droidrun/agent/droid/__init__.py +1 -1
  21. droidrun/agent/droid/droid_agent.py +327 -182
  22. droidrun/agent/droid/events.py +91 -9
  23. droidrun/agent/executor/__init__.py +13 -0
  24. droidrun/agent/executor/events.py +24 -0
  25. droidrun/agent/executor/executor_agent.py +327 -0
  26. droidrun/agent/executor/prompts.py +136 -0
  27. droidrun/agent/manager/__init__.py +18 -0
  28. droidrun/agent/manager/events.py +20 -0
  29. droidrun/agent/manager/manager_agent.py +459 -0
  30. droidrun/agent/manager/prompts.py +223 -0
  31. droidrun/agent/oneflows/app_starter_workflow.py +118 -0
  32. droidrun/agent/oneflows/text_manipulator.py +204 -0
  33. droidrun/agent/planner/__init__.py +3 -3
  34. droidrun/agent/planner/events.py +6 -3
  35. droidrun/agent/planner/planner_agent.py +60 -53
  36. droidrun/agent/planner/prompts.py +2 -2
  37. droidrun/agent/usage.py +15 -13
  38. droidrun/agent/utils/__init__.py +11 -1
  39. droidrun/agent/utils/async_utils.py +2 -1
  40. droidrun/agent/utils/chat_utils.py +48 -60
  41. droidrun/agent/utils/device_state_formatter.py +177 -0
  42. droidrun/agent/utils/executer.py +13 -12
  43. droidrun/agent/utils/inference.py +114 -0
  44. droidrun/agent/utils/llm_picker.py +2 -0
  45. droidrun/agent/utils/message_utils.py +85 -0
  46. droidrun/agent/utils/tools.py +220 -0
  47. droidrun/agent/utils/trajectory.py +8 -7
  48. droidrun/cli/__init__.py +1 -1
  49. droidrun/cli/logs.py +29 -28
  50. droidrun/cli/main.py +279 -143
  51. droidrun/config_manager/__init__.py +25 -0
  52. droidrun/config_manager/config_manager.py +583 -0
  53. droidrun/macro/__init__.py +2 -2
  54. droidrun/macro/__main__.py +1 -1
  55. droidrun/macro/cli.py +36 -34
  56. droidrun/macro/replay.py +7 -9
  57. droidrun/portal.py +1 -1
  58. droidrun/telemetry/__init__.py +2 -2
  59. droidrun/telemetry/events.py +3 -4
  60. droidrun/telemetry/phoenix.py +173 -0
  61. droidrun/telemetry/tracker.py +7 -5
  62. droidrun/tools/__init__.py +1 -1
  63. droidrun/tools/adb.py +210 -82
  64. droidrun/tools/ios.py +7 -5
  65. droidrun/tools/tools.py +25 -8
  66. {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/METADATA +13 -7
  67. droidrun-0.3.10.dev2.dist-info/RECORD +70 -0
  68. droidrun/agent/common/default.py +0 -5
  69. droidrun/agent/context/reflection.py +0 -20
  70. droidrun/agent/oneflows/reflector.py +0 -265
  71. droidrun-0.3.8.dist-info/RECORD +0 -55
  72. {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/WHEEL +0 -0
  73. {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/entry_points.txt +0 -0
  74. {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,459 @@
1
+ """
2
+ ManagerAgent - Planning and reasoning workflow.
3
+
4
+ This agent is responsible for:
5
+ - Analyzing the current state
6
+ - Creating plans and subgoals
7
+ - Tracking progress
8
+ - Deciding when tasks are complete
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ from typing import TYPE_CHECKING, List
15
+
16
+ from llama_index.core.llms.llm import LLM
17
+ from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step
18
+
19
+ from droidrun.agent.manager.events import ManagerPlanEvent, ManagerThinkingEvent
20
+ from droidrun.agent.manager.prompts import build_manager_system_prompt, parse_manager_response
21
+ from droidrun.agent.utils import convert_messages_to_chatmessages
22
+ from droidrun.agent.utils.chat_utils import remove_empty_messages
23
+ from droidrun.agent.utils.device_state_formatter import get_device_state_exact_format
24
+ from droidrun.agent.utils.inference import acall_with_retries
25
+ from droidrun.agent.utils.tools import build_custom_tool_descriptions
26
+
27
+ if TYPE_CHECKING:
28
+ from droidrun.agent.droid.events import DroidAgentState
29
+ from droidrun.tools import Tools
30
+
31
+ logger = logging.getLogger("droidrun")
32
+
33
+
34
class ManagerAgent(Workflow):
    """
    Planning and reasoning agent that decides what to do next.

    The Manager:
    1. Analyzes current device state and action history
    2. Creates plans with specific subgoals
    3. Tracks progress and completed steps
    4. Decides when tasks are complete or need to provide answers

    The workflow runs three steps in order: ``prepare_input`` ->
    ``think`` -> ``finalize``. All cross-step data is exchanged through
    ``self.shared_state``, which this agent both reads and mutates.
    """

    def __init__(
        self,
        llm: LLM,
        vision: bool,
        personas: List,
        tools_instance: "Tools",
        shared_state: "DroidAgentState",
        custom_tools: dict | None = None,
        debug: bool = False,
        **kwargs
    ):
        """
        Store the collaborators used by the workflow steps.

        Args:
            llm: Language model used for planning calls
            vision: Whether screenshots are captured and sent to the LLM
            personas: Stored as-is; not read by this class
            tools_instance: Device tools (state, screenshot, date)
            shared_state: Mutable state shared with the parent workflow
            custom_tools: Optional custom tools; ``None`` becomes ``{}``
            debug: Stored as-is; not read by this class
            **kwargs: Forwarded to ``Workflow.__init__``
        """
        super().__init__(**kwargs)
        self.llm = llm
        self.vision = vision
        self.personas = personas
        self.tools_instance = tools_instance
        self.shared_state = shared_state
        self.custom_tools = custom_tools or {}
        self.debug = debug

        logger.info("✅ ManagerAgent initialized successfully.")

    # ========================================================================
    # Helper Methods
    # ========================================================================

    @staticmethod
    def _inject_text(message: dict, tagged_text: str) -> None:
        """
        Add ``tagged_text`` to a message dict in place.

        If the first content part is a text part, the text is appended to it
        (separated by a newline); otherwise a new text part is inserted at
        the front of the content list.
        """
        content = message['content']
        if content and 'text' in content[0]:
            content[0]['text'] += f"\n{tagged_text}"
        else:
            content.insert(0, {"text": tagged_text})

    @staticmethod
    def _validation_error(parsed: dict) -> str | None:
        """
        Return the correction message for an invalid parsed response.

        A response is valid when it has exactly one of ``plan`` and
        ``answer``. Returns ``None`` for a valid response.
        """
        has_plan = bool(parsed["plan"])
        has_answer = bool(parsed["answer"])

        if has_plan and has_answer:
            return "You cannot use both request_accomplished tag while the plan is not finished. If you want to use request_accomplished tag, please make sure the plan is finished.\nRetry again."
        if not has_plan and not has_answer:
            return "You must provide a plan to complete the task. Please provide a plan with the correct format."
        return None

    def _build_system_prompt(
        self,
        has_text_to_modify: bool
    ) -> str:
        """
        Build system prompt with all context.

        Args:
            has_text_to_modify: Whether text manipulation mode is enabled

        Returns:
            Complete system prompt

        Raises:
            ValueError: If the last ``k`` entries of the action, summary and
                error histories differ in length (``zip(..., strict=True)``).
        """
        state = self.shared_state

        # Error history is only collected when the error flag is raised.
        error_history = []
        if state.error_flag_plan:
            k = state.err_to_manager_thresh
            error_history = [
                {"action": act, "summary": summ, "error": err_des}
                for act, summ, err_des in zip(
                    state.action_history[-k:],
                    state.summary_history[-k:],
                    state.error_descriptions[-k:],
                    strict=True,
                )
            ]

        custom_tools_descriptions = build_custom_tool_descriptions(self.custom_tools)

        return build_manager_system_prompt(
            instruction=state.instruction,
            has_text_to_modify=has_text_to_modify,
            app_card="",  # TODO: implement app card retrieval system
            device_date=self.tools_instance.get_date(),
            important_notes="",  # TODO: expose important_notes in DroidAgentState if needed
            error_flag=state.error_flag_plan,
            error_history=error_history,
            custom_tools_descriptions=custom_tools_descriptions
        )

    def _build_messages_with_context(
        self,
        system_prompt: str,
        screenshot: str = None
    ) -> list[dict]:
        """
        Build messages from history and inject current context.

        The stored message history is deep-copied, so the injected memory,
        device state and screenshot never end up in
        ``shared_state.message_history`` itself.

        Args:
            system_prompt: System prompt to use
            screenshot: Screenshot value obtained in ``prepare_input``;
                attached only when vision is enabled

        Returns:
            List of message dicts ready for conversion
        """
        import copy

        state = self.shared_state

        messages = [
            {"role": "system", "content": [{"text": system_prompt}]}
        ]
        messages.extend(copy.deepcopy(state.message_history))

        user_indices = [i for i, msg in enumerate(messages) if msg['role'] == 'user']

        # Memory, current device state and screenshot go to the LAST user message.
        if user_indices:
            last_user_msg = messages[user_indices[-1]]

            current_memory = (state.memory or "").strip()
            if current_memory:
                self._inject_text(last_user_msg, f"<memory>\n{current_memory}\n</memory>\n")

            current_a11y = (state.ui_elements_list_after or state.device_state_text or "").strip()
            if current_a11y:
                self._inject_text(last_user_msg, f"<device_state>\n{current_a11y}\n</device_state>\n")

            if screenshot and self.vision:
                last_user_msg['content'].append({"image": screenshot})

        # The previous device state goes to the SECOND-TO-LAST user message.
        if len(user_indices) >= 2:
            prev_a11y = (state.ui_elements_list_before or "").strip()
            if prev_a11y:
                self._inject_text(
                    messages[user_indices[-2]],
                    f"<device_state>\n{prev_a11y}\n</device_state>\n",
                )

        return remove_empty_messages(messages)

    async def _validate_and_retry_llm_call(
        self,
        ctx: Context,
        initial_messages: list[dict],
        initial_response: str
    ) -> str:
        """
        Validate LLM response and retry if needed.

        Each retry re-sends the initial messages plus the latest invalid
        response and a correction message. At most three retries are made;
        a failing retry call ends the loop early.

        Args:
            ctx: Workflow context (unused here)
            initial_messages: Messages sent to LLM
            initial_response: Initial LLM response

        Returns:
            The last response obtained. It may still be invalid if every
            retry was used up or a retry call failed.
        """
        max_retries = 3
        output_planning = initial_response
        parsed = parse_manager_response(output_planning)

        for attempt in range(1, max_retries + 1):
            error_message = self._validation_error(parsed)
            if error_message is None:
                break

            logger.warning(f"Manager response invalid (retry {attempt}/{max_retries}): {error_message}")

            retry_messages = initial_messages + [
                {"role": "assistant", "content": [{"text": output_planning}]},
                {"role": "user", "content": [{"text": error_message}]}
            ]
            chat_messages = convert_messages_to_chatmessages(retry_messages)

            try:
                response = await acall_with_retries(self.llm, chat_messages)
                # A completion without text is treated as an empty response.
                output_planning = response.message.content or ""
                parsed = parse_manager_response(output_planning)
            except Exception as e:
                logger.error(f"LLM retry failed: {e}")
                break  # Give up retrying

        return output_planning

    # ========================================================================
    # Workflow Steps
    # ========================================================================

    @step
    async def prepare_input(
        self,
        ctx: Context,
        ev: StartEvent
    ) -> ManagerThinkingEvent:
        """
        Gather context and prepare manager prompt.

        This step:
        1. Gets current device state (UI elements, screenshot)
        2. Detects text manipulation mode
        3. Builds message history entry with last action
        4. Stores context for think() step
        """
        logger.info("💬 Preparing manager input...")
        state = self.shared_state

        # Step 1: device state (UI elements accessibility tree)
        device_state_text, focused_text = get_device_state_exact_format(self.tools_instance.get_state())

        # Step 2: screenshot, only when vision is enabled
        screenshot = None
        if self.vision:
            try:
                result = self.tools_instance.take_screenshot()
                if isinstance(result, tuple):
                    success, screenshot = result
                    if not success:
                        screenshot = None
                else:
                    screenshot = result
                # Only report success when a screenshot was actually obtained.
                if screenshot is not None:
                    logger.debug("📸 Screenshot captured for Manager")
            except Exception as e:
                logger.warning(f"Failed to capture screenshot: {e}")
                screenshot = None

        # Step 3: text manipulation mode is on when the focused field holds
        # anything besides quotes and whitespace.
        focused_text = focused_text or ""
        has_text_to_modify = bool(focused_text.replace("'", "").strip())

        # Step 4: update state with device info
        state.device_state_text = device_state_text
        state.focused_text = focused_text
        # Shift UI elements: before <- after, after <- current
        state.ui_elements_list_before = state.ui_elements_list_after
        state.ui_elements_list_after = device_state_text

        # Step 5: build the user message entry from the last action's context
        parts = []

        if state.finish_thought:
            parts.append(f"<thought>\n{state.finish_thought}\n</thought>\n")

        if state.last_action:
            import json
            action_str = json.dumps(state.last_action)
            parts.append(f"<last_action>\n{action_str}\n</last_action>\n")

        if state.last_summary:
            parts.append(f"<last_action_description>\n{state.last_summary}\n</last_action_description>\n")

        state.message_history.append({
            "role": "user",
            "content": [{"text": "".join(parts)}]
        })

        # Handed over to think() through the shared state.
        state.has_text_to_modify = has_text_to_modify
        state.screenshot = screenshot

        logger.debug(f"  - Device state prepared (text_modify={has_text_to_modify}, screenshot={screenshot is not None})")
        return ManagerThinkingEvent()

    @step
    async def think(
        self,
        ctx: Context,
        ev: ManagerThinkingEvent
    ) -> ManagerPlanEvent:
        """
        Manager reasons and creates plan.

        This step:
        1. Builds system prompt with all context
        2. Builds messages from history with injected context
        3. Calls LLM
        4. Validates and retries if needed
        5. Parses response
        6. Updates state (memory, message history)

        Raises:
            RuntimeError: If the initial LLM call fails.
        """
        logger.info("🧠 Manager thinking about the plan...")
        state = self.shared_state

        # Steps 1-2: system prompt and context-enriched messages
        system_prompt = self._build_system_prompt(state.has_text_to_modify)
        messages = self._build_messages_with_context(
            system_prompt=system_prompt,
            screenshot=state.screenshot
        )

        # Step 3: convert messages and call LLM
        chat_messages = convert_messages_to_chatmessages(messages)

        try:
            response = await acall_with_retries(self.llm, chat_messages)
            # A completion without text is treated as an empty response so
            # that validation can request a retry instead of crashing.
            output_planning = response.message.content or ""
        except Exception as e:
            logger.error(f"LLM call failed: {e}")
            raise RuntimeError(f"Error calling LLM in manager: {e}") from e

        # Step 4: validate and retry if needed
        output_planning = await self._validate_and_retry_llm_call(
            ctx=ctx,
            initial_messages=messages,
            initial_response=output_planning
        )

        # Step 5: parse response
        parsed = parse_manager_response(output_planning)
        memory_update = parsed.get("memory", "").strip()
        completed_plan = parsed.get("completed_subgoal", "No completed subgoal.")

        # Step 6: update state. Memory is appended, never replaced.
        if memory_update:
            if state.memory:
                state.memory += "\n" + memory_update
            else:
                state.memory = memory_update

        state.message_history.append({
            "role": "assistant",
            "content": [{"text": output_planning}]
        })

        state.plan = parsed["plan"]
        state.current_subgoal = parsed["current_subgoal"]
        state.completed_plan = completed_plan
        state.finish_thought = parsed["thought"]
        state.manager_answer = parsed["answer"]

        logger.info(f"📝 Plan: {parsed['plan'][:100]}...")
        logger.debug(f"  - Current subgoal: {parsed['current_subgoal']}")
        logger.debug(f"  - Manager answer: {parsed['answer'][:50] if parsed['answer'] else 'None'}")

        return ManagerPlanEvent(
            plan=parsed["plan"],
            current_subgoal=parsed["current_subgoal"],
            completed_plan=completed_plan,
            thought=parsed["thought"],
            manager_answer=parsed["answer"],
            memory_update=memory_update
        )

    @step
    async def finalize(
        self,
        ctx: Context,
        ev: ManagerPlanEvent
    ) -> StopEvent:
        """Return manager results to parent workflow."""
        logger.debug("✅ Manager planning complete")

        result = {
            "plan": ev.plan,
            "current_subgoal": ev.current_subgoal,
            "completed_plan": ev.completed_plan,
            "thought": ev.thought,
            "manager_answer": ev.manager_answer,
            "memory_update": ev.memory_update
        }
        return StopEvent(result=result)
@@ -0,0 +1,223 @@
1
+ """
2
+ Prompts for the ManagerAgent.
3
+ """
4
+
5
+ import re
6
+
7
+
8
def build_manager_system_prompt(
    instruction: str,
    has_text_to_modify: bool = False,
    app_card: str = "",
    device_date: str = "",
    important_notes: str = "",
    error_flag: bool = False,
    error_history: list = None,
    custom_tools_descriptions: str = ""
) -> str:
    """
    Build the manager system prompt with all context.

    Optional sections are included only when their input contains
    non-whitespace text; ``None`` is treated the same as an empty value.

    Args:
        instruction: User's goal/task
        has_text_to_modify: Whether to include the text manipulation guidelines
        app_card: App-specific instructions; stripped before insertion
        device_date: Current device date
        important_notes: Additional important information
        error_flag: Whether consecutive errors occurred
        error_history: Recent errors, used only when error_flag is True. Each
            item is a mapping with "action", "summary" and "error" keys.
        custom_tools_descriptions: Formatted descriptions of custom tools available to executor

    Returns:
        Complete system prompt for Manager
    """
    # Normalise optional inputs so a missing value never raises on .strip().
    app_card = app_card or ""
    device_date = device_date or ""
    important_notes = important_notes or ""
    custom_tools_descriptions = custom_tools_descriptions or ""
    error_history = error_history or []

    # Collect the prompt pieces and join once at the end.
    sections = [
        "You are an agent who can operate an Android phone on behalf of a user. "
        "Your goal is to track progress and devise high-level plans to achieve the user's requests.\n\n"
        "<user_request>\n"
        f"{instruction}\n"
        "</user_request>\n\n"
    ]

    if device_date.strip():
        sections.append(f"<device_date>\n{device_date}\n</device_date>\n\n")

    if app_card.strip():
        sections.append("App card gives information on how to operate the app and perform actions.\n")
        sections.append(f"<app_card>\n{app_card.strip()}\n</app_card>\n\n")

    # Important notes
    if important_notes.strip():
        sections.append(f"<important_notes>\n{important_notes}\n</important_notes>\n\n")

    # Error escalation
    if error_flag and error_history:
        sections.append(
            "<potentially_stuck>\n"
            "You have encountered several failed attempts. Here are some logs:\n"
        )
        sections.extend(
            f"- Attempt: Action: {error['action']} | "
            f"Description: {error['summary']} | "
            f"Outcome: Failed | "
            f"Feedback: {error['error']}\n"
            for error in error_history
        )
        sections.append("</potentially_stuck>\n\n")

    # Guidelines
    sections.append("""<guidelines>
The following guidelines will help you plan this request.
General:
1. Use the `open_app` action whenever you want to open an app, do not use the app drawer to open an app.
2. Use search to quickly find a file or entry with a specific name, if search function is applicable.
3. Only use copy to clipboard actions when the task specifically requires copying text to clipboard. Do not copy text just to use it later - use the Memory section instead.
4. When you need to remember information for later use, store it in the Memory section (using <add_memory> tags) with step context (e.g., "At step X, I obtained [information] from [source]").
5. File names in the user request must always match the exact file name you are working with, make that reflect in the plan too.
6. Make sure names and titles are not cutoff. If the request is to check who sent a message, make sure to check the message sender's full name not just what appears in the notification because it might be cut off.
7. Dates and file names must match the user query exactly.
8. Don't do more than what the user asks for.""")

    # Text manipulation guidelines (conditional)
    if has_text_to_modify:
        sections.append("""

<text_manipulation>
1. Use **TEXT_TASK:** prefix in your plan when you need to modify text in the currently focused text input field
2. TEXT_TASK is for editing, formatting, or transforming existing text content in text boxes using Python code
3. Do not use TEXT_TASK for extracting text from messages, typing new text, or composing messages
4. The focused text field contains editable text that you can modify
5. Example plan item: 'TEXT_TASK: Add "Hello World" at the beginning of the text'
6. Always use TEXT_TASK for modifying text, do not try to select the text to copy/cut/paste or adjust the text
</text_manipulation>""")

    sections.append("""

Memory Usage:
- Always include step context: "At step [number], I obtained [actual content] from [source]"
- Store the actual content you observe, not just references (e.g., store full recipe text, not "found recipes")
- Use memory instead of copying text unless specifically requested
- Memory is append-only: whatever you put in <add_memory> tags gets added to existing memory, not replaced
- Update memory to track progress on multi-step tasks

</guidelines>""")

    # Add custom tools section if custom tools are provided
    if custom_tools_descriptions.strip():
        sections.append("""

<custom_actions>
The executor has access to these additional custom actions beyond the standard actions (click, type, swipe, etc.):
""")
        sections.append(custom_tools_descriptions)
        sections.append("""

You can reference these custom actions or tell the Executer agent to use them in your plan when they help achieve the user's goal.
</custom_actions>""")

    # Task framing and the required output format
    sections.append("""
---
Carefully assess the current status and the provided screenshot. Check if the current plan needs to be revised.
Determine if the user request has been fully completed. If you are confident that no further actions are required, use the request_accomplished tag with a message in it. If the user request is not finished, update the plan and don't use it. If you are stuck with errors, think step by step about whether the overall plan needs to be revised to address the error.
NOTE: 1. If the current situation prevents proceeding with the original plan or requires clarification from the user, make reasonable assumptions and revise the plan accordingly. Act as though you are the user in such cases. 2. Please refer to the helpful information and steps in the Guidelines first for planning. 3. If the first subgoal in plan has been completed, please update the plan in time according to the screenshot and progress to ensure that the next subgoal is always the first item in the plan. 4. If the first subgoal is not completed, please copy the previous round's plan or update the plan based on the completion of the subgoal.
Provide your output in the following format, which contains four or five parts:

<thought>
An explanation of your rationale for the updated plan and current subgoal.
</thought>

<add_memory>
Store important information here with step context for later reference. Always include "At step X, I obtained [actual content] from [source]".
Examples:
- At step 5, I obtained recipe details from recipes.jpg: Recipe 1 "Chicken Pasta" - ingredients: chicken, pasta, cream. Instructions: Cook pasta, sauté chicken, add cream.
or
- At step 12, I successfully added Recipe 1 to Broccoli app. Still need to add Recipe 2 and Recipe 3 from memory.
Store important information here with step context for later reference.
</add_memory>

<plan>
Please update or copy the existing plan according to the current page and progress. Please pay close attention to the historical operations. Please do not repeat the plan of completed content unless you can judge from the screen status that a subgoal is indeed not completed.
</plan>

<request_accomplished>
Use this tag ONLY after actually completing the user's request through concrete actions, not at the beginning or for planning.

1. Always include a message inside this tag confirming what you accomplished
2. Ensure both opening and closing tags are present
3. Use exclusively for signaling completed user requests
</request_accomplished>""")

    return "".join(sections)
155
+
156
+
157
def parse_manager_response(response: str) -> dict:
    """
    Parse manager LLM response into structured dict.

    Extracts XML-style tags from the response:
    - <thought>...</thought>
    - <add_memory>...</add_memory>
    - <plan>...</plan>
    - <request_accomplished>...</request_accomplished> (answer)
    - <historical_operations>...</historical_operations> (optional, for completed plan)

    Also derives:
    - current_subgoal: first non-empty line of plan (with list markers removed)

    A tag counts as present only when both its opening and closing forms
    occur in the response; otherwise its value is the empty string.

    Args:
        response: Raw LLM response text; ``None`` is treated as empty

    Returns:
        Dict with keys:
        - thought: str
        - memory: str
        - plan: str
        - current_subgoal: str (first line of plan, cleaned)
        - completed_subgoal: str ("No completed subgoal." when the
          historical_operations tag is absent)
        - answer: str (from request_accomplished tag)
    """
    # A completion without text must parse to empty fields, not raise.
    response = response or ""

    def extract(tag: str) -> str:
        """Extract content between XML-style tags."""
        open_tag, close_tag = f"<{tag}>", f"</{tag}>"
        if open_tag in response and close_tag in response:
            return response.split(open_tag, 1)[-1].split(close_tag, 1)[0].strip()
        return ""

    thought = extract("thought")
    memory_section = extract("add_memory")
    plan = extract("plan")
    answer = extract("request_accomplished")

    # Extract completed subgoal (optional historical_operations tag)
    if "<historical_operations>" in response:
        completed_subgoal = extract("historical_operations")
    else:
        completed_subgoal = "No completed subgoal."

    # Current subgoal is the first non-empty line of the plan.
    plan_lines = [line.strip() for line in plan.splitlines() if line.strip()]
    first_line = plan_lines[0] if plan_lines else ""

    # Remove common list markers like "1.", "- ", "* ", or bullet characters.
    first_line = re.sub(r"^\s*\d+\.\s*", "", first_line)  # Remove "1. ", "2. ", etc.
    # "-" / "*" only count as a bullet when followed by whitespace (or the
    # end of the line), so the bold marker in "**TEXT_TASK:**" stays intact.
    first_line = re.sub(r"^\s*[-*](?:\s+|$)", "", first_line)
    first_line = re.sub(r"^\s*•\s*", "", first_line)  # Remove bullet "• "

    current_subgoal = first_line.strip()

    return {
        "thought": thought,
        "completed_subgoal": completed_subgoal,
        "plan": plan,
        "memory": memory_section,
        "current_subgoal": current_subgoal,
        "answer": answer,
    }