droidrun 0.3.10.dev2__py3-none-any.whl → 0.3.10.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54):
  1. droidrun/agent/codeact/__init__.py +1 -4
  2. droidrun/agent/codeact/codeact_agent.py +95 -86
  3. droidrun/agent/codeact/events.py +1 -2
  4. droidrun/agent/context/__init__.py +5 -9
  5. droidrun/agent/context/episodic_memory.py +1 -3
  6. droidrun/agent/context/task_manager.py +8 -2
  7. droidrun/agent/droid/droid_agent.py +102 -141
  8. droidrun/agent/droid/events.py +45 -14
  9. droidrun/agent/executor/__init__.py +6 -4
  10. droidrun/agent/executor/events.py +29 -9
  11. droidrun/agent/executor/executor_agent.py +86 -28
  12. droidrun/agent/executor/prompts.py +8 -2
  13. droidrun/agent/manager/__init__.py +6 -7
  14. droidrun/agent/manager/events.py +16 -4
  15. droidrun/agent/manager/manager_agent.py +130 -69
  16. droidrun/agent/manager/prompts.py +1 -159
  17. droidrun/agent/utils/chat_utils.py +64 -2
  18. droidrun/agent/utils/device_state_formatter.py +54 -26
  19. droidrun/agent/utils/executer.py +66 -80
  20. droidrun/agent/utils/inference.py +11 -10
  21. droidrun/agent/utils/tools.py +58 -6
  22. droidrun/agent/utils/trajectory.py +18 -12
  23. droidrun/cli/logs.py +118 -56
  24. droidrun/cli/main.py +154 -136
  25. droidrun/config_manager/__init__.py +9 -7
  26. droidrun/config_manager/app_card_loader.py +148 -0
  27. droidrun/config_manager/config_manager.py +200 -102
  28. droidrun/config_manager/path_resolver.py +104 -0
  29. droidrun/config_manager/prompt_loader.py +75 -0
  30. droidrun/macro/__init__.py +1 -1
  31. droidrun/macro/cli.py +23 -18
  32. droidrun/telemetry/__init__.py +2 -2
  33. droidrun/telemetry/events.py +3 -3
  34. droidrun/telemetry/tracker.py +1 -1
  35. droidrun/tools/adb.py +1 -1
  36. droidrun/tools/ios.py +3 -2
  37. {droidrun-0.3.10.dev2.dist-info → droidrun-0.3.10.dev4.dist-info}/METADATA +10 -3
  38. droidrun-0.3.10.dev4.dist-info/RECORD +61 -0
  39. droidrun/agent/codeact/prompts.py +0 -26
  40. droidrun/agent/context/agent_persona.py +0 -16
  41. droidrun/agent/context/context_injection_manager.py +0 -66
  42. droidrun/agent/context/personas/__init__.py +0 -11
  43. droidrun/agent/context/personas/app_starter.py +0 -44
  44. droidrun/agent/context/personas/big_agent.py +0 -96
  45. droidrun/agent/context/personas/default.py +0 -95
  46. droidrun/agent/context/personas/ui_expert.py +0 -108
  47. droidrun/agent/planner/__init__.py +0 -13
  48. droidrun/agent/planner/events.py +0 -21
  49. droidrun/agent/planner/planner_agent.py +0 -311
  50. droidrun/agent/planner/prompts.py +0 -124
  51. droidrun-0.3.10.dev2.dist-info/RECORD +0 -70
  52. {droidrun-0.3.10.dev2.dist-info → droidrun-0.3.10.dev4.dist-info}/WHEEL +0 -0
  53. {droidrun-0.3.10.dev2.dist-info → droidrun-0.3.10.dev4.dist-info}/entry_points.txt +0 -0
  54. {droidrun-0.3.10.dev2.dist-info → droidrun-0.3.10.dev4.dist-info}/licenses/LICENSE +0 -0
@@ -9,6 +9,7 @@ This agent is responsible for:
9
9
 
10
10
  from __future__ import annotations
11
11
 
12
+ import asyncio
12
13
  import json
13
14
  import logging
14
15
  from typing import TYPE_CHECKING
@@ -17,12 +18,20 @@ from llama_index.core.llms import ChatMessage, ImageBlock, TextBlock
17
18
  from llama_index.core.llms.llm import LLM
18
19
  from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step
19
20
 
20
- from droidrun.agent.executor.events import ExecutorActionEvent, ExecutorResultEvent
21
- from droidrun.agent.executor.prompts import build_executor_system_prompt, parse_executor_response
22
- from droidrun.agent.utils.tools import click, long_press, open_app, swipe, system_button, type
21
+ from droidrun.agent.executor.events import ExecutorInternalActionEvent, ExecutorInternalResultEvent
22
+ from droidrun.agent.executor.prompts import parse_executor_response
23
23
  from droidrun.agent.utils.inference import acall_with_retries
24
- from droidrun.config_manager import config
25
- import asyncio
24
+ from droidrun.agent.utils.tools import (
25
+ ATOMIC_ACTION_SIGNATURES,
26
+ click,
27
+ long_press,
28
+ open_app,
29
+ swipe,
30
+ system_button,
31
+ type,
32
+ )
33
+ from droidrun.config_manager.config_manager import AgentConfig
34
+ from droidrun.config_manager.prompt_loader import PromptLoader
26
35
 
27
36
  if TYPE_CHECKING:
28
37
  from droidrun.agent.droid.events import DroidAgentState
@@ -30,7 +39,7 @@ if TYPE_CHECKING:
30
39
  logger = logging.getLogger("droidrun")
31
40
 
32
41
 
33
- class ExecutorAgent(Workflow):
42
+ class ExecutorAgent(Workflow): # TODO: Fix a bug in bad prompt
34
43
  """
35
44
  Action execution agent that performs specific actions.
36
45
 
@@ -45,22 +54,20 @@ class ExecutorAgent(Workflow):
45
54
  def __init__(
46
55
  self,
47
56
  llm: LLM,
48
- vision: bool,
49
57
  tools_instance,
50
58
  shared_state: "DroidAgentState",
51
- persona=None,
59
+ agent_config: AgentConfig,
52
60
  custom_tools: dict = None,
53
- debug: bool = False,
54
61
  **kwargs
55
62
  ):
56
63
  super().__init__(**kwargs)
57
64
  self.llm = llm
58
- self.vision = vision
65
+ self.agent_config = agent_config
66
+ self.config = agent_config.executor
67
+ self.vision = agent_config.executor.vision
59
68
  self.tools_instance = tools_instance
60
69
  self.shared_state = shared_state
61
- self.persona = persona
62
70
  self.custom_tools = custom_tools or {}
63
- self.debug = debug
64
71
 
65
72
  logger.info("✅ ExecutorAgent initialized successfully.")
66
73
 
@@ -70,7 +77,7 @@ class ExecutorAgent(Workflow):
70
77
  self,
71
78
  ctx: Context,
72
79
  ev: StartEvent
73
- ) -> ExecutorActionEvent:
80
+ ) -> ExecutorInternalActionEvent:
74
81
  """
75
82
  Executor decides which action to take.
76
83
 
@@ -83,12 +90,53 @@ class ExecutorAgent(Workflow):
83
90
  subgoal = ev.get("subgoal", "")
84
91
  logger.info(f"🧠 Executor thinking about action for: {subgoal}")
85
92
 
93
+ # Format app card (include tags in variable value or empty string)
86
94
  app_card = "" # TODO: Implement app card retrieval
87
-
88
- system_prompt = build_executor_system_prompt(
89
- state=self.shared_state,
90
- subgoal=subgoal,
91
- app_card=app_card
95
+ app_card_text = ""
96
+ if app_card.strip():
97
+ app_card_text = "App card gives information on how to operate the app and perform actions.\n### App Card ###\n" + app_card.strip() + "\n\n"
98
+
99
+ # Format device state (use unified state)
100
+ device_state_text = ""
101
+ if self.shared_state.formatted_device_state and self.shared_state.formatted_device_state.strip():
102
+ device_state_text = "### Device State ###\n" + self.shared_state.formatted_device_state.strip() + "\n\n"
103
+
104
+ # Format progress status
105
+ progress_status_text = self.shared_state.progress_status + "\n\n" if self.shared_state.progress_status else "No progress yet.\n\n"
106
+
107
+ # Format atomic actions
108
+ atomic_actions_text = chr(10).join(
109
+ f"- {action_name}({', '.join(action_info['arguments'])}): {action_info['description']}"
110
+ for action_name, action_info in ATOMIC_ACTION_SIGNATURES.items()
111
+ ) + "\n"
112
+
113
+ # Format action history
114
+ if self.shared_state.action_history:
115
+ action_history_text = "Recent actions you took previously and whether they were successful:\n" + "\n".join(
116
+ (f"Action: {act} | Description: {summ} | Outcome: Successful" if outcome
117
+ else f"Action: {act} | Description: {summ} | Outcome: Failed | Feedback: {err_des}")
118
+ for act, summ, outcome, err_des in zip(
119
+ self.shared_state.action_history[-min(5, len(self.shared_state.action_history)):],
120
+ self.shared_state.summary_history[-min(5, len(self.shared_state.action_history)):],
121
+ self.shared_state.action_outcomes[-min(5, len(self.shared_state.action_history)):],
122
+ self.shared_state.error_descriptions[-min(5, len(self.shared_state.action_history)):], strict=True)
123
+ ) + "\n\n"
124
+ else:
125
+ action_history_text = "No actions have been taken yet.\n\n"
126
+
127
+ # Load and format prompt
128
+ system_prompt = PromptLoader.load_prompt(
129
+ self.agent_config.get_executor_system_prompt_path(),
130
+ {
131
+ "instruction": self.shared_state.instruction,
132
+ "app_card": app_card_text,
133
+ "device_state_text": device_state_text,
134
+ "plan": self.shared_state.plan,
135
+ "subgoal": subgoal,
136
+ "progress_status": progress_status_text,
137
+ "atomic_actions": atomic_actions_text,
138
+ "action_history": action_history_text
139
+ }
92
140
  )
93
141
 
94
142
  blocks = [TextBlock(text=system_prompt)]
@@ -112,7 +160,7 @@ class ExecutorAgent(Workflow):
112
160
  parsed = parse_executor_response(response_text)
113
161
  except Exception as e:
114
162
  logger.error(f"❌ Failed to parse executor response: {e}")
115
- return ExecutorActionEvent(
163
+ return ExecutorInternalActionEvent(
116
164
  action_json=json.dumps({"action": "invalid"}),
117
165
  thought=f"Failed to parse response: {str(e)}",
118
166
  description="Invalid response format from LLM"
@@ -122,18 +170,23 @@ class ExecutorAgent(Workflow):
122
170
  logger.info(f"🎯 Action: {parsed['action']}")
123
171
  logger.debug(f" - Description: {parsed['description']}")
124
172
 
125
- return ExecutorActionEvent(
173
+ event = ExecutorInternalActionEvent(
126
174
  action_json=parsed["action"],
127
175
  thought=parsed["thought"],
128
176
  description=parsed["description"]
129
177
  )
130
178
 
179
+ # Write event to stream for web interface
180
+ ctx.write_event_to_stream(event)
181
+
182
+ return event
183
+
131
184
  @step
132
185
  async def execute(
133
186
  self,
134
187
  ctx: Context,
135
- ev: ExecutorActionEvent
136
- ) -> ExecutorResultEvent:
188
+ ev: ExecutorInternalActionEvent
189
+ ) -> ExecutorInternalResultEvent:
137
190
  """
138
191
  Execute the selected action using the tools instance.
139
192
 
@@ -146,7 +199,7 @@ class ExecutorAgent(Workflow):
146
199
  action_dict = json.loads(ev.action_json)
147
200
  except json.JSONDecodeError as e:
148
201
  logger.error(f"❌ Failed to parse action JSON: {e}")
149
- return ExecutorResultEvent(
202
+ return ExecutorInternalResultEvent(
150
203
  action={"action": "invalid"},
151
204
  outcome=False,
152
205
  error=f"Invalid action JSON: {str(e)}",
@@ -155,15 +208,15 @@ class ExecutorAgent(Workflow):
155
208
  action_json=ev.action_json
156
209
  )
157
210
 
158
- # Execute the action
159
211
  outcome, error, summary = await self._execute_action(action_dict, ev.description)
160
212
 
161
- if outcome:
162
- await asyncio.sleep(config.agent.after_sleep_action)
213
+ # TODO: Add sleep after action (should be in DroidAgent.handle_executor_result)
214
+ # Available via: self.agent_config.after_sleep_action
215
+ # await asyncio.sleep(self.agent_config.after_sleep_action)
163
216
 
164
217
  logger.info(f"{'✅' if outcome else '❌'} Execution complete: {summary}")
165
218
 
166
- return ExecutorResultEvent(
219
+ result_event = ExecutorInternalResultEvent(
167
220
  action=action_dict,
168
221
  outcome=outcome,
169
222
  error=error,
@@ -172,6 +225,11 @@ class ExecutorAgent(Workflow):
172
225
  action_json=ev.action_json
173
226
  )
174
227
 
228
+ # Write event to stream for web interface
229
+ ctx.write_event_to_stream(result_event)
230
+
231
+ return result_event
232
+
175
233
  async def _execute_action(self, action_dict: dict, description: str) -> tuple[bool, str, str]:
176
234
  """
177
235
  Execute a single action based on the action dictionary.
@@ -312,7 +370,7 @@ class ExecutorAgent(Workflow):
312
370
  async def finalize(
313
371
  self,
314
372
  ctx: Context,
315
- ev: ExecutorResultEvent
373
+ ev: ExecutorInternalResultEvent
316
374
  ) -> StopEvent:
317
375
  """Return executor results to parent workflow."""
318
376
  logger.debug("✅ Executor execution complete")
@@ -76,7 +76,7 @@ The atomic action functions are listed in the format of `action(arguments): desc
76
76
  \n
77
77
  ### Latest Action History ###
78
78
  {(("Recent actions you took previously and whether they were successful:\n" + "\n".join(
79
- (f"Action: {act} | Description: {summ} | Outcome: Successful" if outcome == "A"
79
+ (f"Action: {act} | Description: {summ} | Outcome: Successful" if outcome
80
80
  else f"Action: {act} | Description: {summ} | Outcome: Failed | Feedback: {err_des}")
81
81
  for act, summ, outcome, err_des in zip(
82
82
  state.action_history[-min(5, len(state.action_history)):],
@@ -126,7 +126,13 @@ def parse_executor_response(response: str) -> dict:
126
126
  Dictionary with 'thought', 'action', 'description' keys
127
127
  """
128
128
  thought = response.split("### Thought")[-1].split("### Action")[0].replace("\n", " ").replace(" ", " ").replace("###", "").strip()
129
- action = response.split("### Action")[-1].split("### Description")[0].replace("\n", " ").replace(" ", " ").replace("###", "").strip()
129
+ action_raw = response.split("### Action")[-1].split("### Description")[0].replace("\n", " ").replace(" ", " ").replace("###", "").strip()
130
+ start_idx = action_raw.find('{')
131
+ end_idx = action_raw.rfind('}')
132
+ if start_idx != -1 and end_idx != -1:
133
+ action = action_raw[start_idx:end_idx + 1]
134
+ else:
135
+ action = action_raw
130
136
  description = response.split("### Description")[-1].replace("\n", " ").replace(" ", " ").replace("###", "").strip()
131
137
 
132
138
  return {
@@ -2,17 +2,16 @@
2
2
  Manager Agent - Planning and reasoning workflow.
3
3
  """
4
4
 
5
- from droidrun.agent.manager.events import ManagerPlanEvent, ManagerThinkingEvent
5
+ from droidrun.agent.droid.events import ManagerInputEvent, ManagerPlanEvent
6
+ from droidrun.agent.manager.events import ManagerThinkingEvent, ManagerInternalPlanEvent
6
7
  from droidrun.agent.manager.manager_agent import ManagerAgent
7
- from droidrun.agent.manager.prompts import (
8
- build_manager_system_prompt,
9
- parse_manager_response,
10
- )
8
+ from droidrun.agent.manager.prompts import parse_manager_response
11
9
 
12
10
  __all__ = [
13
11
  "ManagerAgent",
14
- "ManagerThinkingEvent",
12
+ "ManagerInputEvent",
15
13
  "ManagerPlanEvent",
16
- "build_manager_system_prompt",
14
+ "ManagerThinkingEvent",
15
+ "ManagerInternalPlanEvent",
17
16
  "parse_manager_response",
18
17
  ]
@@ -1,5 +1,11 @@
1
1
  """
2
2
  Events for the ManagerAgent workflow.
3
+
4
+ These are INTERNAL events used within ManagerAgent for:
5
+ - Streaming to frontend/logging
6
+ - Carrying full debug metadata
7
+
8
+ For workflow coordination with DroidAgent, see droid/events.py
3
9
  """
4
10
 
5
11
  from llama_index.core.workflow.events import Event
@@ -10,11 +16,17 @@ class ManagerThinkingEvent(Event):
10
16
  pass
11
17
 
12
18
 
13
- class ManagerPlanEvent(Event):
14
- """Manager has created a plan"""
19
+ class ManagerInternalPlanEvent(Event):
20
+ """
21
+ Internal Manager planning event with full state and metadata.
22
+
23
+ This event is streamed to frontend/logging but NOT used for
24
+ workflow coordination between ManagerAgent and DroidAgent.
25
+
26
+ For workflow coordination, see ManagerPlanEvent in droid/events.py
27
+ """
15
28
  plan: str
16
29
  current_subgoal: str
17
- completed_plan: str
18
30
  thought: str
19
31
  manager_answer: str = ""
20
- memory_update: str = ""
32
+ memory_update: str = "" # Debugging metadata: LLM's memory additions
@@ -11,22 +11,26 @@ This agent is responsible for:
11
11
  from __future__ import annotations
12
12
 
13
13
  import logging
14
- from typing import TYPE_CHECKING, List
14
+ from typing import TYPE_CHECKING
15
15
 
16
16
  from llama_index.core.llms.llm import LLM
17
17
  from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step
18
18
 
19
- from droidrun.agent.manager.events import ManagerPlanEvent, ManagerThinkingEvent
20
- from droidrun.agent.manager.prompts import build_manager_system_prompt, parse_manager_response
19
+ from droidrun.agent.manager.events import ManagerInternalPlanEvent, ManagerThinkingEvent
20
+ from droidrun.agent.manager.prompts import parse_manager_response
21
21
  from droidrun.agent.utils import convert_messages_to_chatmessages
22
22
  from droidrun.agent.utils.chat_utils import remove_empty_messages
23
- from droidrun.agent.utils.device_state_formatter import get_device_state_exact_format
23
+ from droidrun.agent.utils.device_state_formatter import format_device_state
24
24
  from droidrun.agent.utils.inference import acall_with_retries
25
25
  from droidrun.agent.utils.tools import build_custom_tool_descriptions
26
+ from droidrun.config_manager.prompt_loader import PromptLoader
27
+ from droidrun.config_manager.app_card_loader import AppCardLoader
26
28
 
27
29
  if TYPE_CHECKING:
28
30
  from droidrun.agent.droid.events import DroidAgentState
29
31
  from droidrun.tools import Tools
32
+ from droidrun.config_manager.config_manager import AgentConfig
33
+
30
34
 
31
35
  logger = logging.getLogger("droidrun")
32
36
 
@@ -45,22 +49,21 @@ class ManagerAgent(Workflow):
45
49
  def __init__(
46
50
  self,
47
51
  llm: LLM,
48
- vision: bool,
49
- personas: List,
50
52
  tools_instance: "Tools",
51
53
  shared_state: "DroidAgentState",
54
+ agent_config: "AgentConfig",
52
55
  custom_tools: dict = None,
53
- debug: bool = False,
54
56
  **kwargs
55
57
  ):
56
58
  super().__init__(**kwargs)
57
59
  self.llm = llm
58
- self.vision = vision
59
- self.personas = personas
60
+ self.config = agent_config.manager
61
+ self.vision = self.config.vision
60
62
  self.tools_instance = tools_instance
61
63
  self.shared_state = shared_state
62
64
  self.custom_tools = custom_tools or {}
63
- self.debug = debug
65
+ self.agent_config = agent_config
66
+ self.app_card_loader = self.agent_config.app_cards
64
67
 
65
68
  logger.info("✅ ManagerAgent initialized successfully.")
66
69
 
@@ -70,23 +73,23 @@ class ManagerAgent(Workflow):
70
73
 
71
74
  def _build_system_prompt(
72
75
  self,
73
- has_text_to_modify: bool
76
+ has_text_to_modify: bool,
77
+ app_card: str = ""
74
78
  ) -> str:
75
79
  """
76
80
  Build system prompt with all context.
77
81
 
78
82
  Args:
79
83
  has_text_to_modify: Whether text manipulation mode is enabled
80
-
84
+ app_card: App card content
81
85
  Returns:
82
86
  Complete system prompt
83
87
  """
84
-
85
- # Get error history if error_flag_plan is set
86
- error_history = []
88
+ # Format error history
89
+ error_history_text = ""
87
90
  if self.shared_state.error_flag_plan:
88
91
  k = self.shared_state.err_to_manager_thresh
89
- error_history = [
92
+ errors = [
90
93
  {
91
94
  "action": act,
92
95
  "summary": summ,
@@ -98,19 +101,76 @@ class ManagerAgent(Workflow):
98
101
  self.shared_state.error_descriptions[-k:], strict=True
99
102
  )
100
103
  ]
101
-
102
- # Build custom tools descriptions
103
- custom_tools_descriptions = build_custom_tool_descriptions(self.custom_tools)
104
-
105
- return build_manager_system_prompt(
106
- instruction=self.shared_state.instruction,
107
- has_text_to_modify=has_text_to_modify,
108
- app_card="", # TODO: implement app card retrieval system
109
- device_date=self.tools_instance.get_date(),
110
- important_notes="", # TODO: expose important_notes in DroidAgentState if needed
111
- error_flag=self.shared_state.error_flag_plan,
112
- error_history=error_history,
113
- custom_tools_descriptions=custom_tools_descriptions
104
+ error_history_text = (
105
+ "<potentially_stuck>\n"
106
+ "You have encountered several failed attempts. Here are some logs:\n"
107
+ )
108
+ for error in errors:
109
+ error_history_text += (
110
+ f"- Attempt: Action: {error['action']} | "
111
+ f"Description: {error['summary']} | "
112
+ f"Outcome: Failed | "
113
+ f"Feedback: {error['error']}\n"
114
+ )
115
+ error_history_text += "</potentially_stuck>\n\n"
116
+
117
+ # Text manipulation section
118
+ text_manipulation_section = ""
119
+ if has_text_to_modify:
120
+ text_manipulation_section = """
121
+
122
+ <text_manipulation>
123
+ 1. Use **TEXT_TASK:** prefix in your plan when you need to modify text in the currently focused text input field
124
+ 2. TEXT_TASK is for editing, formatting, or transforming existing text content in text boxes using Python code
125
+ 3. Do not use TEXT_TASK for extracting text from messages, typing new text, or composing messages
126
+ 4. The focused text field contains editable text that you can modify
127
+ 5. Example plan item: 'TEXT_TASK: Add "Hello World" at the beginning of the text'
128
+ 6. Always use TEXT_TASK for modifying text, do not try to select the text to copy/cut/paste or adjust the text
129
+ </text_manipulation>"""
130
+
131
+ # Device date (include tags in variable value or empty string)
132
+ device_date = self.tools_instance.get_date()
133
+ device_date_text = ""
134
+ if device_date.strip():
135
+ device_date_text = f"<device_date>\n{device_date}\n</device_date>\n\n"
136
+
137
+ # App card (include tags in variable value or empty string)
138
+ app_card = app_card
139
+ app_card_text = ""
140
+ if app_card.strip():
141
+ app_card_text = "App card gives information on how to operate the app and perform actions.\n<app_card>\n" + app_card.strip() + "\n</app_card>\n\n"
142
+
143
+ # Important notes (include tags in variable value or empty string)
144
+ important_notes = "" # TODO: implement
145
+ important_notes_text = ""
146
+ if important_notes.strip():
147
+ important_notes_text = "<important_notes>\n" + important_notes + "\n</important_notes>\n\n"
148
+
149
+ # Custom tools
150
+ custom_tools_desc = build_custom_tool_descriptions(self.custom_tools)
151
+ custom_tools_text = ""
152
+ if custom_tools_desc.strip():
153
+ custom_tools_text = """
154
+
155
+ <custom_actions>
156
+ The executor has access to these additional custom actions beyond the standard actions (click, type, swipe, etc.):
157
+ """ + custom_tools_desc + """
158
+
159
+ You can reference these custom actions or tell the Executer agent to use them in your plan when they help achieve the user's goal.
160
+ </custom_actions>"""
161
+
162
+ # Load and format prompt
163
+ return PromptLoader.load_prompt(
164
+ self.agent_config.get_manager_system_prompt_path(),
165
+ {
166
+ "instruction": self.shared_state.instruction,
167
+ "device_date": device_date_text,
168
+ "app_card": app_card_text,
169
+ "important_notes": important_notes_text,
170
+ "error_history": error_history_text,
171
+ "text_manipulation_section": text_manipulation_section,
172
+ "custom_tools_descriptions": custom_tools_text
173
+ }
114
174
  )
115
175
 
116
176
  def _build_messages_with_context(
@@ -155,28 +215,28 @@ class ManagerAgent(Workflow):
155
215
  else:
156
216
  messages[last_user_idx]['content'].insert(0, {"text": f"<memory>\n{current_memory}\n</memory>\n"})
157
217
 
158
- # Add device state to last user message
159
- current_a11y = (self.shared_state.ui_elements_list_after or self.shared_state.device_state_text or "").strip()
160
- if current_a11y:
218
+ # Add CURRENT device state to last user message (use unified state)
219
+ current_state = self.shared_state.formatted_device_state.strip()
220
+ if current_state:
161
221
  if messages[last_user_idx]['content'] and 'text' in messages[last_user_idx]['content'][0]:
162
- messages[last_user_idx]['content'][0]['text'] += f"\n<device_state>\n{current_a11y}\n</device_state>\n"
222
+ messages[last_user_idx]['content'][0]['text'] += f"\n<device_state>\n{current_state}\n</device_state>\n"
163
223
  else:
164
- messages[last_user_idx]['content'].insert(0, {"text": f"<device_state>\n{current_a11y}\n</device_state>\n"})
224
+ messages[last_user_idx]['content'].insert(0, {"text": f"<device_state>\n{current_state}\n</device_state>\n"})
165
225
 
166
226
  # Add screenshot to last user message
167
227
  if screenshot and self.vision:
168
228
  messages[last_user_idx]['content'].append({"image": screenshot})
169
229
 
170
- # Add previous device state to SECOND-TO-LAST user message (if exists)
230
+ # Add PREVIOUS device state to SECOND-TO-LAST user message (if exists)
171
231
  if len(user_indices) >= 2:
172
232
  second_last_user_idx = user_indices[-2]
173
- prev_a11y = (self.shared_state.ui_elements_list_before or "").strip()
233
+ prev_state = self.shared_state.previous_formatted_device_state.strip()
174
234
 
175
- if prev_a11y:
235
+ if prev_state:
176
236
  if messages[second_last_user_idx]['content'] and 'text' in messages[second_last_user_idx]['content'][0]:
177
- messages[second_last_user_idx]['content'][0]['text'] += f"\n<device_state>\n{prev_a11y}\n</device_state>\n"
237
+ messages[second_last_user_idx]['content'][0]['text'] += f"\n<device_state>\n{prev_state}\n</device_state>\n"
178
238
  else:
179
- messages[second_last_user_idx]['content'].insert(0, {"text": f"<device_state>\n{prev_a11y}\n</device_state>\n"})
239
+ messages[second_last_user_idx]['content'].insert(0, {"text": f"<device_state>\n{prev_state}\n</device_state>\n"})
180
240
  messages = remove_empty_messages(messages)
181
241
  return messages
182
242
 
@@ -263,9 +323,23 @@ class ManagerAgent(Workflow):
263
323
  logger.info("💬 Preparing manager input...")
264
324
 
265
325
  # ====================================================================
266
- # Step 1: Get device state (UI elements accessibility tree)
326
+ # Step 1: Get and format device state using unified formatter
267
327
  # ====================================================================
268
- device_state_text, focused_text = get_device_state_exact_format(self.tools_instance.get_state())
328
+ raw_state = self.tools_instance.get_state()
329
+ formatted_text, focused_text, a11y_tree, phone_state = format_device_state(raw_state)
330
+
331
+ # Update shared state (previous ← current, current ← new)
332
+ self.shared_state.previous_formatted_device_state = self.shared_state.formatted_device_state
333
+ self.shared_state.formatted_device_state = formatted_text
334
+ self.shared_state.focused_text = focused_text
335
+ self.shared_state.a11y_tree = a11y_tree
336
+ self.shared_state.phone_state = phone_state
337
+
338
+ # Extract and store package/app name
339
+ self.shared_state.current_package_name = phone_state.get('packageName', 'Unknown')
340
+ self.shared_state.current_app_name = phone_state.get('currentApp', 'Unknown')
341
+
342
+ # App cards
269
343
 
270
344
  # ====================================================================
271
345
  # Step 2: Capture screenshot if vision enabled
@@ -278,6 +352,7 @@ class ManagerAgent(Workflow):
278
352
  success, screenshot = result
279
353
  if not success:
280
354
  screenshot = None
355
+
281
356
  else:
282
357
  screenshot = result
283
358
  logger.debug("📸 Screenshot captured for Manager")
@@ -288,29 +363,9 @@ class ManagerAgent(Workflow):
288
363
  # ====================================================================
289
364
  # Step 3: Detect text manipulation mode
290
365
  # ====================================================================
291
- focused_text = focused_text or ""
292
366
  focused_text_clean = focused_text.replace("'", "").strip()
293
-
294
- # Check if focused text differs from last typed text
295
- # last_typed_text = ""
296
- # if self.shared_state.action_history:
297
- # recent_actions = self.shared_state.action_history[-1:] if len(self.shared_state.action_history) >= 1 else []
298
- # for action in reversed(recent_actions):
299
- # if isinstance(action, dict) and action.get('action') == 'type':
300
- # last_typed_text = action.get('text', '')
301
- # break
302
-
303
367
  has_text_to_modify = (focused_text_clean != "")
304
368
 
305
- # ====================================================================
306
- # Step 4: Update state with device info
307
- # ====================================================================
308
- self.shared_state.device_state_text = device_state_text
309
- self.shared_state.focused_text = focused_text
310
- # Shift UI elements: before ← after, after ← current
311
- self.shared_state.ui_elements_list_before = self.shared_state.ui_elements_list_after
312
- self.shared_state.ui_elements_list_after = device_state_text
313
-
314
369
  # ====================================================================
315
370
  # Step 5: Build user message entry
316
371
  # ====================================================================
@@ -328,7 +383,7 @@ class ManagerAgent(Workflow):
328
383
  if self.shared_state.last_summary:
329
384
  parts.append(f"<last_action_description>\n{self.shared_state.last_summary}\n</last_action_description>\n")
330
385
 
331
-
386
+
332
387
  self.shared_state.message_history.append({
333
388
  "role": "user",
334
389
  "content": [{"text": "".join(parts)}]
@@ -346,7 +401,7 @@ class ManagerAgent(Workflow):
346
401
  self,
347
402
  ctx: Context,
348
403
  ev: ManagerThinkingEvent
349
- ) -> ManagerPlanEvent:
404
+ ) -> ManagerInternalPlanEvent:
350
405
  """
351
406
  Manager reasons and creates plan.
352
407
 
@@ -362,11 +417,15 @@ class ManagerAgent(Workflow):
362
417
 
363
418
  has_text_to_modify = self.shared_state.has_text_to_modify
364
419
  screenshot = self.shared_state.screenshot
420
+ if self.app_card_loader.enabled:
421
+ app_card = AppCardLoader.load_app_card(self.shared_state.current_package_name, self.app_card_loader.app_cards_dir)
422
+ else:
423
+ app_card = ""
365
424
 
366
425
  # ====================================================================
367
426
  # Step 1: Build system prompt
368
427
  # ====================================================================
369
- system_prompt = self._build_system_prompt(has_text_to_modify)
428
+ system_prompt = self._build_system_prompt(has_text_to_modify, app_card)
370
429
 
371
430
  # ====================================================================
372
431
  # Step 2: Build messages with context
@@ -423,7 +482,6 @@ class ManagerAgent(Workflow):
423
482
  # Update planning fields
424
483
  self.shared_state.plan = parsed["plan"]
425
484
  self.shared_state.current_subgoal = parsed["current_subgoal"]
426
- self.shared_state.completed_plan = parsed.get("completed_subgoal", "No completed subgoal.")
427
485
  self.shared_state.finish_thought = parsed["thought"]
428
486
  self.shared_state.manager_answer = parsed["answer"]
429
487
 
@@ -431,20 +489,24 @@ class ManagerAgent(Workflow):
431
489
  logger.debug(f" - Current subgoal: {parsed['current_subgoal']}")
432
490
  logger.debug(f" - Manager answer: {parsed['answer'][:50] if parsed['answer'] else 'None'}")
433
491
 
434
- return ManagerPlanEvent(
492
+ event = ManagerInternalPlanEvent(
435
493
  plan=parsed["plan"],
436
494
  current_subgoal=parsed["current_subgoal"],
437
- completed_plan=parsed.get("completed_subgoal", "No completed subgoal."),
438
495
  thought=parsed["thought"],
439
496
  manager_answer=parsed["answer"],
440
497
  memory_update=memory_update
441
498
  )
442
499
 
500
+ # Write event to stream for web interface
501
+ ctx.write_event_to_stream(event)
502
+
503
+ return event
504
+
443
505
  @step
444
506
  async def finalize(
445
507
  self,
446
508
  ctx: Context,
447
- ev: ManagerPlanEvent
509
+ ev: ManagerInternalPlanEvent
448
510
  ) -> StopEvent:
449
511
  """Return manager results to parent workflow."""
450
512
  logger.debug("✅ Manager planning complete")
@@ -452,7 +514,6 @@ class ManagerAgent(Workflow):
452
514
  return StopEvent(result={
453
515
  "plan": ev.plan,
454
516
  "current_subgoal": ev.current_subgoal,
455
- "completed_plan": ev.completed_plan,
456
517
  "thought": ev.thought,
457
518
  "manager_answer": ev.manager_answer,
458
519
  "memory_update": ev.memory_update