droidrun 0.3.10.dev3__py3-none-any.whl → 0.3.10.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- droidrun/agent/codeact/__init__.py +1 -4
- droidrun/agent/codeact/codeact_agent.py +95 -86
- droidrun/agent/codeact/events.py +1 -2
- droidrun/agent/context/__init__.py +5 -9
- droidrun/agent/context/episodic_memory.py +1 -3
- droidrun/agent/context/task_manager.py +8 -2
- droidrun/agent/droid/droid_agent.py +102 -141
- droidrun/agent/droid/events.py +45 -14
- droidrun/agent/executor/__init__.py +6 -4
- droidrun/agent/executor/events.py +29 -9
- droidrun/agent/executor/executor_agent.py +86 -28
- droidrun/agent/executor/prompts.py +8 -2
- droidrun/agent/manager/__init__.py +6 -7
- droidrun/agent/manager/events.py +16 -4
- droidrun/agent/manager/manager_agent.py +130 -69
- droidrun/agent/manager/prompts.py +1 -159
- droidrun/agent/utils/chat_utils.py +64 -2
- droidrun/agent/utils/device_state_formatter.py +54 -26
- droidrun/agent/utils/executer.py +66 -80
- droidrun/agent/utils/inference.py +11 -10
- droidrun/agent/utils/tools.py +58 -6
- droidrun/agent/utils/trajectory.py +18 -12
- droidrun/cli/logs.py +118 -56
- droidrun/cli/main.py +154 -136
- droidrun/config_manager/__init__.py +9 -7
- droidrun/config_manager/app_card_loader.py +148 -0
- droidrun/config_manager/config_manager.py +200 -102
- droidrun/config_manager/path_resolver.py +104 -0
- droidrun/config_manager/prompt_loader.py +75 -0
- droidrun/macro/__init__.py +1 -1
- droidrun/macro/cli.py +23 -18
- droidrun/telemetry/__init__.py +2 -2
- droidrun/telemetry/events.py +3 -3
- droidrun/telemetry/tracker.py +1 -1
- droidrun/tools/adb.py +1 -1
- droidrun/tools/ios.py +3 -2
- {droidrun-0.3.10.dev3.dist-info → droidrun-0.3.10.dev5.dist-info}/METADATA +10 -4
- droidrun-0.3.10.dev5.dist-info/RECORD +61 -0
- droidrun/agent/codeact/prompts.py +0 -26
- droidrun/agent/context/agent_persona.py +0 -16
- droidrun/agent/context/context_injection_manager.py +0 -66
- droidrun/agent/context/personas/__init__.py +0 -11
- droidrun/agent/context/personas/app_starter.py +0 -44
- droidrun/agent/context/personas/big_agent.py +0 -96
- droidrun/agent/context/personas/default.py +0 -95
- droidrun/agent/context/personas/ui_expert.py +0 -108
- droidrun/agent/planner/__init__.py +0 -13
- droidrun/agent/planner/events.py +0 -21
- droidrun/agent/planner/planner_agent.py +0 -311
- droidrun/agent/planner/prompts.py +0 -124
- droidrun-0.3.10.dev3.dist-info/RECORD +0 -70
- {droidrun-0.3.10.dev3.dist-info → droidrun-0.3.10.dev5.dist-info}/WHEEL +0 -0
- {droidrun-0.3.10.dev3.dist-info → droidrun-0.3.10.dev5.dist-info}/entry_points.txt +0 -0
- {droidrun-0.3.10.dev3.dist-info → droidrun-0.3.10.dev5.dist-info}/licenses/LICENSE +0 -0
@@ -9,6 +9,7 @@ This agent is responsible for:
|
|
9
9
|
|
10
10
|
from __future__ import annotations
|
11
11
|
|
12
|
+
import asyncio
|
12
13
|
import json
|
13
14
|
import logging
|
14
15
|
from typing import TYPE_CHECKING
|
@@ -17,12 +18,20 @@ from llama_index.core.llms import ChatMessage, ImageBlock, TextBlock
|
|
17
18
|
from llama_index.core.llms.llm import LLM
|
18
19
|
from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step
|
19
20
|
|
20
|
-
from droidrun.agent.executor.events import
|
21
|
-
from droidrun.agent.executor.prompts import
|
22
|
-
from droidrun.agent.utils.tools import click, long_press, open_app, swipe, system_button, type
|
21
|
+
from droidrun.agent.executor.events import ExecutorInternalActionEvent, ExecutorInternalResultEvent
|
22
|
+
from droidrun.agent.executor.prompts import parse_executor_response
|
23
23
|
from droidrun.agent.utils.inference import acall_with_retries
|
24
|
-
from droidrun.
|
25
|
-
|
24
|
+
from droidrun.agent.utils.tools import (
|
25
|
+
ATOMIC_ACTION_SIGNATURES,
|
26
|
+
click,
|
27
|
+
long_press,
|
28
|
+
open_app,
|
29
|
+
swipe,
|
30
|
+
system_button,
|
31
|
+
type,
|
32
|
+
)
|
33
|
+
from droidrun.config_manager.config_manager import AgentConfig
|
34
|
+
from droidrun.config_manager.prompt_loader import PromptLoader
|
26
35
|
|
27
36
|
if TYPE_CHECKING:
|
28
37
|
from droidrun.agent.droid.events import DroidAgentState
|
@@ -30,7 +39,7 @@ if TYPE_CHECKING:
|
|
30
39
|
logger = logging.getLogger("droidrun")
|
31
40
|
|
32
41
|
|
33
|
-
class ExecutorAgent(Workflow):
|
42
|
+
class ExecutorAgent(Workflow): # TODO: Fix a bug in bad prompt
|
34
43
|
"""
|
35
44
|
Action execution agent that performs specific actions.
|
36
45
|
|
@@ -45,22 +54,20 @@ class ExecutorAgent(Workflow):
|
|
45
54
|
def __init__(
|
46
55
|
self,
|
47
56
|
llm: LLM,
|
48
|
-
vision: bool,
|
49
57
|
tools_instance,
|
50
58
|
shared_state: "DroidAgentState",
|
51
|
-
|
59
|
+
agent_config: AgentConfig,
|
52
60
|
custom_tools: dict = None,
|
53
|
-
debug: bool = False,
|
54
61
|
**kwargs
|
55
62
|
):
|
56
63
|
super().__init__(**kwargs)
|
57
64
|
self.llm = llm
|
58
|
-
self.
|
65
|
+
self.agent_config = agent_config
|
66
|
+
self.config = agent_config.executor
|
67
|
+
self.vision = agent_config.executor.vision
|
59
68
|
self.tools_instance = tools_instance
|
60
69
|
self.shared_state = shared_state
|
61
|
-
self.persona = persona
|
62
70
|
self.custom_tools = custom_tools or {}
|
63
|
-
self.debug = debug
|
64
71
|
|
65
72
|
logger.info("✅ ExecutorAgent initialized successfully.")
|
66
73
|
|
@@ -70,7 +77,7 @@ class ExecutorAgent(Workflow):
|
|
70
77
|
self,
|
71
78
|
ctx: Context,
|
72
79
|
ev: StartEvent
|
73
|
-
) ->
|
80
|
+
) -> ExecutorInternalActionEvent:
|
74
81
|
"""
|
75
82
|
Executor decides which action to take.
|
76
83
|
|
@@ -83,12 +90,53 @@ class ExecutorAgent(Workflow):
|
|
83
90
|
subgoal = ev.get("subgoal", "")
|
84
91
|
logger.info(f"🧠 Executor thinking about action for: {subgoal}")
|
85
92
|
|
93
|
+
# Format app card (include tags in variable value or empty string)
|
86
94
|
app_card = "" # TODO: Implement app card retrieval
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
95
|
+
app_card_text = ""
|
96
|
+
if app_card.strip():
|
97
|
+
app_card_text = "App card gives information on how to operate the app and perform actions.\n### App Card ###\n" + app_card.strip() + "\n\n"
|
98
|
+
|
99
|
+
# Format device state (use unified state)
|
100
|
+
device_state_text = ""
|
101
|
+
if self.shared_state.formatted_device_state and self.shared_state.formatted_device_state.strip():
|
102
|
+
device_state_text = "### Device State ###\n" + self.shared_state.formatted_device_state.strip() + "\n\n"
|
103
|
+
|
104
|
+
# Format progress status
|
105
|
+
progress_status_text = self.shared_state.progress_status + "\n\n" if self.shared_state.progress_status else "No progress yet.\n\n"
|
106
|
+
|
107
|
+
# Format atomic actions
|
108
|
+
atomic_actions_text = chr(10).join(
|
109
|
+
f"- {action_name}({', '.join(action_info['arguments'])}): {action_info['description']}"
|
110
|
+
for action_name, action_info in ATOMIC_ACTION_SIGNATURES.items()
|
111
|
+
) + "\n"
|
112
|
+
|
113
|
+
# Format action history
|
114
|
+
if self.shared_state.action_history:
|
115
|
+
action_history_text = "Recent actions you took previously and whether they were successful:\n" + "\n".join(
|
116
|
+
(f"Action: {act} | Description: {summ} | Outcome: Successful" if outcome
|
117
|
+
else f"Action: {act} | Description: {summ} | Outcome: Failed | Feedback: {err_des}")
|
118
|
+
for act, summ, outcome, err_des in zip(
|
119
|
+
self.shared_state.action_history[-min(5, len(self.shared_state.action_history)):],
|
120
|
+
self.shared_state.summary_history[-min(5, len(self.shared_state.action_history)):],
|
121
|
+
self.shared_state.action_outcomes[-min(5, len(self.shared_state.action_history)):],
|
122
|
+
self.shared_state.error_descriptions[-min(5, len(self.shared_state.action_history)):], strict=True)
|
123
|
+
) + "\n\n"
|
124
|
+
else:
|
125
|
+
action_history_text = "No actions have been taken yet.\n\n"
|
126
|
+
|
127
|
+
# Load and format prompt
|
128
|
+
system_prompt = PromptLoader.load_prompt(
|
129
|
+
self.agent_config.get_executor_system_prompt_path(),
|
130
|
+
{
|
131
|
+
"instruction": self.shared_state.instruction,
|
132
|
+
"app_card": app_card_text,
|
133
|
+
"device_state_text": device_state_text,
|
134
|
+
"plan": self.shared_state.plan,
|
135
|
+
"subgoal": subgoal,
|
136
|
+
"progress_status": progress_status_text,
|
137
|
+
"atomic_actions": atomic_actions_text,
|
138
|
+
"action_history": action_history_text
|
139
|
+
}
|
92
140
|
)
|
93
141
|
|
94
142
|
blocks = [TextBlock(text=system_prompt)]
|
@@ -112,7 +160,7 @@ class ExecutorAgent(Workflow):
|
|
112
160
|
parsed = parse_executor_response(response_text)
|
113
161
|
except Exception as e:
|
114
162
|
logger.error(f"❌ Failed to parse executor response: {e}")
|
115
|
-
return
|
163
|
+
return ExecutorInternalActionEvent(
|
116
164
|
action_json=json.dumps({"action": "invalid"}),
|
117
165
|
thought=f"Failed to parse response: {str(e)}",
|
118
166
|
description="Invalid response format from LLM"
|
@@ -122,18 +170,23 @@ class ExecutorAgent(Workflow):
|
|
122
170
|
logger.info(f"🎯 Action: {parsed['action']}")
|
123
171
|
logger.debug(f" - Description: {parsed['description']}")
|
124
172
|
|
125
|
-
|
173
|
+
event = ExecutorInternalActionEvent(
|
126
174
|
action_json=parsed["action"],
|
127
175
|
thought=parsed["thought"],
|
128
176
|
description=parsed["description"]
|
129
177
|
)
|
130
178
|
|
179
|
+
# Write event to stream for web interface
|
180
|
+
ctx.write_event_to_stream(event)
|
181
|
+
|
182
|
+
return event
|
183
|
+
|
131
184
|
@step
|
132
185
|
async def execute(
|
133
186
|
self,
|
134
187
|
ctx: Context,
|
135
|
-
ev:
|
136
|
-
) ->
|
188
|
+
ev: ExecutorInternalActionEvent
|
189
|
+
) -> ExecutorInternalResultEvent:
|
137
190
|
"""
|
138
191
|
Execute the selected action using the tools instance.
|
139
192
|
|
@@ -146,7 +199,7 @@ class ExecutorAgent(Workflow):
|
|
146
199
|
action_dict = json.loads(ev.action_json)
|
147
200
|
except json.JSONDecodeError as e:
|
148
201
|
logger.error(f"❌ Failed to parse action JSON: {e}")
|
149
|
-
return
|
202
|
+
return ExecutorInternalResultEvent(
|
150
203
|
action={"action": "invalid"},
|
151
204
|
outcome=False,
|
152
205
|
error=f"Invalid action JSON: {str(e)}",
|
@@ -155,15 +208,15 @@ class ExecutorAgent(Workflow):
|
|
155
208
|
action_json=ev.action_json
|
156
209
|
)
|
157
210
|
|
158
|
-
# Execute the action
|
159
211
|
outcome, error, summary = await self._execute_action(action_dict, ev.description)
|
160
212
|
|
161
|
-
|
162
|
-
|
213
|
+
# TODO: Add sleep after action (should be in DroidAgent.handle_executor_result)
|
214
|
+
# Available via: self.agent_config.after_sleep_action
|
215
|
+
# await asyncio.sleep(self.agent_config.after_sleep_action)
|
163
216
|
|
164
217
|
logger.info(f"{'✅' if outcome else '❌'} Execution complete: {summary}")
|
165
218
|
|
166
|
-
|
219
|
+
result_event = ExecutorInternalResultEvent(
|
167
220
|
action=action_dict,
|
168
221
|
outcome=outcome,
|
169
222
|
error=error,
|
@@ -172,6 +225,11 @@ class ExecutorAgent(Workflow):
|
|
172
225
|
action_json=ev.action_json
|
173
226
|
)
|
174
227
|
|
228
|
+
# Write event to stream for web interface
|
229
|
+
ctx.write_event_to_stream(result_event)
|
230
|
+
|
231
|
+
return result_event
|
232
|
+
|
175
233
|
async def _execute_action(self, action_dict: dict, description: str) -> tuple[bool, str, str]:
|
176
234
|
"""
|
177
235
|
Execute a single action based on the action dictionary.
|
@@ -312,7 +370,7 @@ class ExecutorAgent(Workflow):
|
|
312
370
|
async def finalize(
|
313
371
|
self,
|
314
372
|
ctx: Context,
|
315
|
-
ev:
|
373
|
+
ev: ExecutorInternalResultEvent
|
316
374
|
) -> StopEvent:
|
317
375
|
"""Return executor results to parent workflow."""
|
318
376
|
logger.debug("✅ Executor execution complete")
|
@@ -76,7 +76,7 @@ The atomic action functions are listed in the format of `action(arguments): desc
|
|
76
76
|
\n
|
77
77
|
### Latest Action History ###
|
78
78
|
{(("Recent actions you took previously and whether they were successful:\n" + "\n".join(
|
79
|
-
(f"Action: {act} | Description: {summ} | Outcome: Successful" if outcome
|
79
|
+
(f"Action: {act} | Description: {summ} | Outcome: Successful" if outcome
|
80
80
|
else f"Action: {act} | Description: {summ} | Outcome: Failed | Feedback: {err_des}")
|
81
81
|
for act, summ, outcome, err_des in zip(
|
82
82
|
state.action_history[-min(5, len(state.action_history)):],
|
@@ -126,7 +126,13 @@ def parse_executor_response(response: str) -> dict:
|
|
126
126
|
Dictionary with 'thought', 'action', 'description' keys
|
127
127
|
"""
|
128
128
|
thought = response.split("### Thought")[-1].split("### Action")[0].replace("\n", " ").replace(" ", " ").replace("###", "").strip()
|
129
|
-
|
129
|
+
action_raw = response.split("### Action")[-1].split("### Description")[0].replace("\n", " ").replace(" ", " ").replace("###", "").strip()
|
130
|
+
start_idx = action_raw.find('{')
|
131
|
+
end_idx = action_raw.rfind('}')
|
132
|
+
if start_idx != -1 and end_idx != -1:
|
133
|
+
action = action_raw[start_idx:end_idx + 1]
|
134
|
+
else:
|
135
|
+
action = action_raw
|
130
136
|
description = response.split("### Description")[-1].replace("\n", " ").replace(" ", " ").replace("###", "").strip()
|
131
137
|
|
132
138
|
return {
|
@@ -2,17 +2,16 @@
|
|
2
2
|
Manager Agent - Planning and reasoning workflow.
|
3
3
|
"""
|
4
4
|
|
5
|
-
from droidrun.agent.
|
5
|
+
from droidrun.agent.droid.events import ManagerInputEvent, ManagerPlanEvent
|
6
|
+
from droidrun.agent.manager.events import ManagerThinkingEvent, ManagerInternalPlanEvent
|
6
7
|
from droidrun.agent.manager.manager_agent import ManagerAgent
|
7
|
-
from droidrun.agent.manager.prompts import
|
8
|
-
build_manager_system_prompt,
|
9
|
-
parse_manager_response,
|
10
|
-
)
|
8
|
+
from droidrun.agent.manager.prompts import parse_manager_response
|
11
9
|
|
12
10
|
__all__ = [
|
13
11
|
"ManagerAgent",
|
14
|
-
"
|
12
|
+
"ManagerInputEvent",
|
15
13
|
"ManagerPlanEvent",
|
16
|
-
"
|
14
|
+
"ManagerThinkingEvent",
|
15
|
+
"ManagerInternalPlanEvent",
|
17
16
|
"parse_manager_response",
|
18
17
|
]
|
droidrun/agent/manager/events.py
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
"""
|
2
2
|
Events for the ManagerAgent workflow.
|
3
|
+
|
4
|
+
These are INTERNAL events used within ManagerAgent for:
|
5
|
+
- Streaming to frontend/logging
|
6
|
+
- Carrying full debug metadata
|
7
|
+
|
8
|
+
For workflow coordination with DroidAgent, see droid/events.py
|
3
9
|
"""
|
4
10
|
|
5
11
|
from llama_index.core.workflow.events import Event
|
@@ -10,11 +16,17 @@ class ManagerThinkingEvent(Event):
|
|
10
16
|
pass
|
11
17
|
|
12
18
|
|
13
|
-
class
|
14
|
-
"""
|
19
|
+
class ManagerInternalPlanEvent(Event):
|
20
|
+
"""
|
21
|
+
Internal Manager planning event with full state and metadata.
|
22
|
+
|
23
|
+
This event is streamed to frontend/logging but NOT used for
|
24
|
+
workflow coordination between ManagerAgent and DroidAgent.
|
25
|
+
|
26
|
+
For workflow coordination, see ManagerPlanEvent in droid/events.py
|
27
|
+
"""
|
15
28
|
plan: str
|
16
29
|
current_subgoal: str
|
17
|
-
completed_plan: str
|
18
30
|
thought: str
|
19
31
|
manager_answer: str = ""
|
20
|
-
memory_update: str = ""
|
32
|
+
memory_update: str = "" # Debugging metadata: LLM's memory additions
|
@@ -11,22 +11,26 @@ This agent is responsible for:
|
|
11
11
|
from __future__ import annotations
|
12
12
|
|
13
13
|
import logging
|
14
|
-
from typing import TYPE_CHECKING
|
14
|
+
from typing import TYPE_CHECKING
|
15
15
|
|
16
16
|
from llama_index.core.llms.llm import LLM
|
17
17
|
from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step
|
18
18
|
|
19
|
-
from droidrun.agent.manager.events import
|
20
|
-
from droidrun.agent.manager.prompts import
|
19
|
+
from droidrun.agent.manager.events import ManagerInternalPlanEvent, ManagerThinkingEvent
|
20
|
+
from droidrun.agent.manager.prompts import parse_manager_response
|
21
21
|
from droidrun.agent.utils import convert_messages_to_chatmessages
|
22
22
|
from droidrun.agent.utils.chat_utils import remove_empty_messages
|
23
|
-
from droidrun.agent.utils.device_state_formatter import
|
23
|
+
from droidrun.agent.utils.device_state_formatter import format_device_state
|
24
24
|
from droidrun.agent.utils.inference import acall_with_retries
|
25
25
|
from droidrun.agent.utils.tools import build_custom_tool_descriptions
|
26
|
+
from droidrun.config_manager.prompt_loader import PromptLoader
|
27
|
+
from droidrun.config_manager.app_card_loader import AppCardLoader
|
26
28
|
|
27
29
|
if TYPE_CHECKING:
|
28
30
|
from droidrun.agent.droid.events import DroidAgentState
|
29
31
|
from droidrun.tools import Tools
|
32
|
+
from droidrun.config_manager.config_manager import AgentConfig
|
33
|
+
|
30
34
|
|
31
35
|
logger = logging.getLogger("droidrun")
|
32
36
|
|
@@ -45,22 +49,21 @@ class ManagerAgent(Workflow):
|
|
45
49
|
def __init__(
|
46
50
|
self,
|
47
51
|
llm: LLM,
|
48
|
-
vision: bool,
|
49
|
-
personas: List,
|
50
52
|
tools_instance: "Tools",
|
51
53
|
shared_state: "DroidAgentState",
|
54
|
+
agent_config: "AgentConfig",
|
52
55
|
custom_tools: dict = None,
|
53
|
-
debug: bool = False,
|
54
56
|
**kwargs
|
55
57
|
):
|
56
58
|
super().__init__(**kwargs)
|
57
59
|
self.llm = llm
|
58
|
-
self.
|
59
|
-
self.
|
60
|
+
self.config = agent_config.manager
|
61
|
+
self.vision = self.config.vision
|
60
62
|
self.tools_instance = tools_instance
|
61
63
|
self.shared_state = shared_state
|
62
64
|
self.custom_tools = custom_tools or {}
|
63
|
-
self.
|
65
|
+
self.agent_config = agent_config
|
66
|
+
self.app_card_loader = self.agent_config.app_cards
|
64
67
|
|
65
68
|
logger.info("✅ ManagerAgent initialized successfully.")
|
66
69
|
|
@@ -70,23 +73,23 @@ class ManagerAgent(Workflow):
|
|
70
73
|
|
71
74
|
def _build_system_prompt(
|
72
75
|
self,
|
73
|
-
has_text_to_modify: bool
|
76
|
+
has_text_to_modify: bool,
|
77
|
+
app_card: str = ""
|
74
78
|
) -> str:
|
75
79
|
"""
|
76
80
|
Build system prompt with all context.
|
77
81
|
|
78
82
|
Args:
|
79
83
|
has_text_to_modify: Whether text manipulation mode is enabled
|
80
|
-
|
84
|
+
app_card: App card content
|
81
85
|
Returns:
|
82
86
|
Complete system prompt
|
83
87
|
"""
|
84
|
-
|
85
|
-
|
86
|
-
error_history = []
|
88
|
+
# Format error history
|
89
|
+
error_history_text = ""
|
87
90
|
if self.shared_state.error_flag_plan:
|
88
91
|
k = self.shared_state.err_to_manager_thresh
|
89
|
-
|
92
|
+
errors = [
|
90
93
|
{
|
91
94
|
"action": act,
|
92
95
|
"summary": summ,
|
@@ -98,19 +101,76 @@ class ManagerAgent(Workflow):
|
|
98
101
|
self.shared_state.error_descriptions[-k:], strict=True
|
99
102
|
)
|
100
103
|
]
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
104
|
+
error_history_text = (
|
105
|
+
"<potentially_stuck>\n"
|
106
|
+
"You have encountered several failed attempts. Here are some logs:\n"
|
107
|
+
)
|
108
|
+
for error in errors:
|
109
|
+
error_history_text += (
|
110
|
+
f"- Attempt: Action: {error['action']} | "
|
111
|
+
f"Description: {error['summary']} | "
|
112
|
+
f"Outcome: Failed | "
|
113
|
+
f"Feedback: {error['error']}\n"
|
114
|
+
)
|
115
|
+
error_history_text += "</potentially_stuck>\n\n"
|
116
|
+
|
117
|
+
# Text manipulation section
|
118
|
+
text_manipulation_section = ""
|
119
|
+
if has_text_to_modify:
|
120
|
+
text_manipulation_section = """
|
121
|
+
|
122
|
+
<text_manipulation>
|
123
|
+
1. Use **TEXT_TASK:** prefix in your plan when you need to modify text in the currently focused text input field
|
124
|
+
2. TEXT_TASK is for editing, formatting, or transforming existing text content in text boxes using Python code
|
125
|
+
3. Do not use TEXT_TASK for extracting text from messages, typing new text, or composing messages
|
126
|
+
4. The focused text field contains editable text that you can modify
|
127
|
+
5. Example plan item: 'TEXT_TASK: Add "Hello World" at the beginning of the text'
|
128
|
+
6. Always use TEXT_TASK for modifying text, do not try to select the text to copy/cut/paste or adjust the text
|
129
|
+
</text_manipulation>"""
|
130
|
+
|
131
|
+
# Device date (include tags in variable value or empty string)
|
132
|
+
device_date = self.tools_instance.get_date()
|
133
|
+
device_date_text = ""
|
134
|
+
if device_date.strip():
|
135
|
+
device_date_text = f"<device_date>\n{device_date}\n</device_date>\n\n"
|
136
|
+
|
137
|
+
# App card (include tags in variable value or empty string)
|
138
|
+
app_card = app_card
|
139
|
+
app_card_text = ""
|
140
|
+
if app_card.strip():
|
141
|
+
app_card_text = "App card gives information on how to operate the app and perform actions.\n<app_card>\n" + app_card.strip() + "\n</app_card>\n\n"
|
142
|
+
|
143
|
+
# Important notes (include tags in variable value or empty string)
|
144
|
+
important_notes = "" # TODO: implement
|
145
|
+
important_notes_text = ""
|
146
|
+
if important_notes.strip():
|
147
|
+
important_notes_text = "<important_notes>\n" + important_notes + "\n</important_notes>\n\n"
|
148
|
+
|
149
|
+
# Custom tools
|
150
|
+
custom_tools_desc = build_custom_tool_descriptions(self.custom_tools)
|
151
|
+
custom_tools_text = ""
|
152
|
+
if custom_tools_desc.strip():
|
153
|
+
custom_tools_text = """
|
154
|
+
|
155
|
+
<custom_actions>
|
156
|
+
The executor has access to these additional custom actions beyond the standard actions (click, type, swipe, etc.):
|
157
|
+
""" + custom_tools_desc + """
|
158
|
+
|
159
|
+
You can reference these custom actions or tell the Executer agent to use them in your plan when they help achieve the user's goal.
|
160
|
+
</custom_actions>"""
|
161
|
+
|
162
|
+
# Load and format prompt
|
163
|
+
return PromptLoader.load_prompt(
|
164
|
+
self.agent_config.get_manager_system_prompt_path(),
|
165
|
+
{
|
166
|
+
"instruction": self.shared_state.instruction,
|
167
|
+
"device_date": device_date_text,
|
168
|
+
"app_card": app_card_text,
|
169
|
+
"important_notes": important_notes_text,
|
170
|
+
"error_history": error_history_text,
|
171
|
+
"text_manipulation_section": text_manipulation_section,
|
172
|
+
"custom_tools_descriptions": custom_tools_text
|
173
|
+
}
|
114
174
|
)
|
115
175
|
|
116
176
|
def _build_messages_with_context(
|
@@ -155,28 +215,28 @@ class ManagerAgent(Workflow):
|
|
155
215
|
else:
|
156
216
|
messages[last_user_idx]['content'].insert(0, {"text": f"<memory>\n{current_memory}\n</memory>\n"})
|
157
217
|
|
158
|
-
# Add device state to last user message
|
159
|
-
|
160
|
-
if
|
218
|
+
# Add CURRENT device state to last user message (use unified state)
|
219
|
+
current_state = self.shared_state.formatted_device_state.strip()
|
220
|
+
if current_state:
|
161
221
|
if messages[last_user_idx]['content'] and 'text' in messages[last_user_idx]['content'][0]:
|
162
|
-
messages[last_user_idx]['content'][0]['text'] += f"\n<device_state>\n{
|
222
|
+
messages[last_user_idx]['content'][0]['text'] += f"\n<device_state>\n{current_state}\n</device_state>\n"
|
163
223
|
else:
|
164
|
-
messages[last_user_idx]['content'].insert(0, {"text": f"<device_state>\n{
|
224
|
+
messages[last_user_idx]['content'].insert(0, {"text": f"<device_state>\n{current_state}\n</device_state>\n"})
|
165
225
|
|
166
226
|
# Add screenshot to last user message
|
167
227
|
if screenshot and self.vision:
|
168
228
|
messages[last_user_idx]['content'].append({"image": screenshot})
|
169
229
|
|
170
|
-
# Add
|
230
|
+
# Add PREVIOUS device state to SECOND-TO-LAST user message (if exists)
|
171
231
|
if len(user_indices) >= 2:
|
172
232
|
second_last_user_idx = user_indices[-2]
|
173
|
-
|
233
|
+
prev_state = self.shared_state.previous_formatted_device_state.strip()
|
174
234
|
|
175
|
-
if
|
235
|
+
if prev_state:
|
176
236
|
if messages[second_last_user_idx]['content'] and 'text' in messages[second_last_user_idx]['content'][0]:
|
177
|
-
messages[second_last_user_idx]['content'][0]['text'] += f"\n<device_state>\n{
|
237
|
+
messages[second_last_user_idx]['content'][0]['text'] += f"\n<device_state>\n{prev_state}\n</device_state>\n"
|
178
238
|
else:
|
179
|
-
messages[second_last_user_idx]['content'].insert(0, {"text": f"<device_state>\n{
|
239
|
+
messages[second_last_user_idx]['content'].insert(0, {"text": f"<device_state>\n{prev_state}\n</device_state>\n"})
|
180
240
|
messages = remove_empty_messages(messages)
|
181
241
|
return messages
|
182
242
|
|
@@ -263,9 +323,23 @@ class ManagerAgent(Workflow):
|
|
263
323
|
logger.info("💬 Preparing manager input...")
|
264
324
|
|
265
325
|
# ====================================================================
|
266
|
-
# Step 1: Get device state
|
326
|
+
# Step 1: Get and format device state using unified formatter
|
267
327
|
# ====================================================================
|
268
|
-
|
328
|
+
raw_state = self.tools_instance.get_state()
|
329
|
+
formatted_text, focused_text, a11y_tree, phone_state = format_device_state(raw_state)
|
330
|
+
|
331
|
+
# Update shared state (previous ← current, current ← new)
|
332
|
+
self.shared_state.previous_formatted_device_state = self.shared_state.formatted_device_state
|
333
|
+
self.shared_state.formatted_device_state = formatted_text
|
334
|
+
self.shared_state.focused_text = focused_text
|
335
|
+
self.shared_state.a11y_tree = a11y_tree
|
336
|
+
self.shared_state.phone_state = phone_state
|
337
|
+
|
338
|
+
# Extract and store package/app name
|
339
|
+
self.shared_state.current_package_name = phone_state.get('packageName', 'Unknown')
|
340
|
+
self.shared_state.current_app_name = phone_state.get('currentApp', 'Unknown')
|
341
|
+
|
342
|
+
# App cards
|
269
343
|
|
270
344
|
# ====================================================================
|
271
345
|
# Step 2: Capture screenshot if vision enabled
|
@@ -278,6 +352,7 @@ class ManagerAgent(Workflow):
|
|
278
352
|
success, screenshot = result
|
279
353
|
if not success:
|
280
354
|
screenshot = None
|
355
|
+
|
281
356
|
else:
|
282
357
|
screenshot = result
|
283
358
|
logger.debug("📸 Screenshot captured for Manager")
|
@@ -288,29 +363,9 @@ class ManagerAgent(Workflow):
|
|
288
363
|
# ====================================================================
|
289
364
|
# Step 3: Detect text manipulation mode
|
290
365
|
# ====================================================================
|
291
|
-
focused_text = focused_text or ""
|
292
366
|
focused_text_clean = focused_text.replace("'", "").strip()
|
293
|
-
|
294
|
-
# Check if focused text differs from last typed text
|
295
|
-
# last_typed_text = ""
|
296
|
-
# if self.shared_state.action_history:
|
297
|
-
# recent_actions = self.shared_state.action_history[-1:] if len(self.shared_state.action_history) >= 1 else []
|
298
|
-
# for action in reversed(recent_actions):
|
299
|
-
# if isinstance(action, dict) and action.get('action') == 'type':
|
300
|
-
# last_typed_text = action.get('text', '')
|
301
|
-
# break
|
302
|
-
|
303
367
|
has_text_to_modify = (focused_text_clean != "")
|
304
368
|
|
305
|
-
# ====================================================================
|
306
|
-
# Step 4: Update state with device info
|
307
|
-
# ====================================================================
|
308
|
-
self.shared_state.device_state_text = device_state_text
|
309
|
-
self.shared_state.focused_text = focused_text
|
310
|
-
# Shift UI elements: before ← after, after ← current
|
311
|
-
self.shared_state.ui_elements_list_before = self.shared_state.ui_elements_list_after
|
312
|
-
self.shared_state.ui_elements_list_after = device_state_text
|
313
|
-
|
314
369
|
# ====================================================================
|
315
370
|
# Step 5: Build user message entry
|
316
371
|
# ====================================================================
|
@@ -328,7 +383,7 @@ class ManagerAgent(Workflow):
|
|
328
383
|
if self.shared_state.last_summary:
|
329
384
|
parts.append(f"<last_action_description>\n{self.shared_state.last_summary}\n</last_action_description>\n")
|
330
385
|
|
331
|
-
|
386
|
+
|
332
387
|
self.shared_state.message_history.append({
|
333
388
|
"role": "user",
|
334
389
|
"content": [{"text": "".join(parts)}]
|
@@ -346,7 +401,7 @@ class ManagerAgent(Workflow):
|
|
346
401
|
self,
|
347
402
|
ctx: Context,
|
348
403
|
ev: ManagerThinkingEvent
|
349
|
-
) ->
|
404
|
+
) -> ManagerInternalPlanEvent:
|
350
405
|
"""
|
351
406
|
Manager reasons and creates plan.
|
352
407
|
|
@@ -362,11 +417,15 @@ class ManagerAgent(Workflow):
|
|
362
417
|
|
363
418
|
has_text_to_modify = self.shared_state.has_text_to_modify
|
364
419
|
screenshot = self.shared_state.screenshot
|
420
|
+
if self.app_card_loader.enabled:
|
421
|
+
app_card = AppCardLoader.load_app_card(self.shared_state.current_package_name, self.app_card_loader.app_cards_dir)
|
422
|
+
else:
|
423
|
+
app_card = ""
|
365
424
|
|
366
425
|
# ====================================================================
|
367
426
|
# Step 1: Build system prompt
|
368
427
|
# ====================================================================
|
369
|
-
system_prompt = self._build_system_prompt(has_text_to_modify)
|
428
|
+
system_prompt = self._build_system_prompt(has_text_to_modify, app_card)
|
370
429
|
|
371
430
|
# ====================================================================
|
372
431
|
# Step 2: Build messages with context
|
@@ -423,7 +482,6 @@ class ManagerAgent(Workflow):
|
|
423
482
|
# Update planning fields
|
424
483
|
self.shared_state.plan = parsed["plan"]
|
425
484
|
self.shared_state.current_subgoal = parsed["current_subgoal"]
|
426
|
-
self.shared_state.completed_plan = parsed.get("completed_subgoal", "No completed subgoal.")
|
427
485
|
self.shared_state.finish_thought = parsed["thought"]
|
428
486
|
self.shared_state.manager_answer = parsed["answer"]
|
429
487
|
|
@@ -431,20 +489,24 @@ class ManagerAgent(Workflow):
|
|
431
489
|
logger.debug(f" - Current subgoal: {parsed['current_subgoal']}")
|
432
490
|
logger.debug(f" - Manager answer: {parsed['answer'][:50] if parsed['answer'] else 'None'}")
|
433
491
|
|
434
|
-
|
492
|
+
event = ManagerInternalPlanEvent(
|
435
493
|
plan=parsed["plan"],
|
436
494
|
current_subgoal=parsed["current_subgoal"],
|
437
|
-
completed_plan=parsed.get("completed_subgoal", "No completed subgoal."),
|
438
495
|
thought=parsed["thought"],
|
439
496
|
manager_answer=parsed["answer"],
|
440
497
|
memory_update=memory_update
|
441
498
|
)
|
442
499
|
|
500
|
+
# Write event to stream for web interface
|
501
|
+
ctx.write_event_to_stream(event)
|
502
|
+
|
503
|
+
return event
|
504
|
+
|
443
505
|
@step
|
444
506
|
async def finalize(
|
445
507
|
self,
|
446
508
|
ctx: Context,
|
447
|
-
ev:
|
509
|
+
ev: ManagerInternalPlanEvent
|
448
510
|
) -> StopEvent:
|
449
511
|
"""Return manager results to parent workflow."""
|
450
512
|
logger.debug("✅ Manager planning complete")
|
@@ -452,7 +514,6 @@ class ManagerAgent(Workflow):
|
|
452
514
|
return StopEvent(result={
|
453
515
|
"plan": ev.plan,
|
454
516
|
"current_subgoal": ev.current_subgoal,
|
455
|
-
"completed_plan": ev.completed_plan,
|
456
517
|
"thought": ev.thought,
|
457
518
|
"manager_answer": ev.manager_answer,
|
458
519
|
"memory_update": ev.memory_update
|