droidrun 0.3.10.dev5__py3-none-any.whl → 0.3.10.dev7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- droidrun/agent/codeact/codeact_agent.py +21 -29
- droidrun/agent/context/task_manager.py +0 -1
- droidrun/agent/droid/droid_agent.py +1 -3
- droidrun/agent/droid/events.py +6 -3
- droidrun/agent/executor/executor_agent.py +24 -38
- droidrun/agent/executor/prompts.py +0 -108
- droidrun/agent/manager/__init__.py +1 -1
- droidrun/agent/manager/manager_agent.py +104 -87
- droidrun/agent/utils/executer.py +11 -10
- droidrun/agent/utils/llm_picker.py +63 -1
- droidrun/agent/utils/tools.py +30 -1
- droidrun/app_cards/app_card_provider.py +26 -0
- droidrun/app_cards/providers/__init__.py +7 -0
- droidrun/app_cards/providers/composite_provider.py +97 -0
- droidrun/app_cards/providers/local_provider.py +115 -0
- droidrun/app_cards/providers/server_provider.py +126 -0
- droidrun/cli/logs.py +4 -4
- droidrun/cli/main.py +244 -34
- droidrun/config_manager/__init__.py +0 -2
- droidrun/config_manager/config_manager.py +45 -102
- droidrun/config_manager/path_resolver.py +1 -1
- droidrun/config_manager/prompt_loader.py +48 -51
- droidrun/macro/cli.py +0 -1
- droidrun/portal.py +17 -0
- droidrun/tools/adb.py +13 -34
- {droidrun-0.3.10.dev5.dist-info → droidrun-0.3.10.dev7.dist-info}/METADATA +2 -9
- {droidrun-0.3.10.dev5.dist-info → droidrun-0.3.10.dev7.dist-info}/RECORD +30 -26
- droidrun/config_manager/app_card_loader.py +0 -148
- {droidrun-0.3.10.dev5.dist-info → droidrun-0.3.10.dev7.dist-info}/WHEEL +0 -0
- {droidrun-0.3.10.dev5.dist-info → droidrun-0.3.10.dev7.dist-info}/entry_points.txt +0 -0
- {droidrun-0.3.10.dev5.dist-info → droidrun-0.3.10.dev7.dist-info}/licenses/LICENSE +0 -0
@@ -3,7 +3,7 @@ import json
|
|
3
3
|
import logging
|
4
4
|
import re
|
5
5
|
import time
|
6
|
-
from typing import
|
6
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
7
7
|
|
8
8
|
from llama_index.core.base.llms.types import ChatMessage, ChatResponse
|
9
9
|
from llama_index.core.llms.llm import LLM
|
@@ -23,9 +23,8 @@ from droidrun.agent.common.events import RecordUIStateEvent, ScreenshotEvent
|
|
23
23
|
from droidrun.agent.context.episodic_memory import EpisodicMemory, EpisodicMemoryStep
|
24
24
|
from droidrun.agent.usage import get_usage_from_response
|
25
25
|
from droidrun.agent.utils import chat_utils
|
26
|
-
from droidrun.agent.utils.executer import SimpleCodeExecutor, ExecuterState
|
27
26
|
from droidrun.agent.utils.device_state_formatter import format_device_state
|
28
|
-
|
27
|
+
from droidrun.agent.utils.executer import ExecuterState, SimpleCodeExecutor
|
29
28
|
from droidrun.agent.utils.tools import (
|
30
29
|
ATOMIC_ACTION_SIGNATURES,
|
31
30
|
build_custom_tool_descriptions,
|
@@ -85,15 +84,8 @@ class CodeActAgent(Workflow):
|
|
85
84
|
self.tool_list = {}
|
86
85
|
for action_name, signature in merged_signatures.items():
|
87
86
|
func = signature["function"]
|
88
|
-
|
89
|
-
|
90
|
-
def make_bound(f, ti):
|
91
|
-
async def bound_func(*args, **kwargs):
|
92
|
-
return await f(ti, *args, **kwargs)
|
93
|
-
return bound_func
|
94
|
-
self.tool_list[action_name] = make_bound(func, tools_instance)
|
95
|
-
else:
|
96
|
-
self.tool_list[action_name] = lambda *args, f=func, ti=tools_instance, **kwargs: f(ti, *args, **kwargs)
|
87
|
+
|
88
|
+
self.tool_list[action_name] = lambda *args, f=func, ti=tools_instance, **kwargs: f(ti, *args, **kwargs)
|
97
89
|
|
98
90
|
self.tool_list["remember"] = tools_instance.remember
|
99
91
|
self.tool_list["complete"] = tools_instance.complete
|
@@ -113,13 +105,10 @@ class CodeActAgent(Workflow):
|
|
113
105
|
)
|
114
106
|
self.system_prompt = ChatMessage(role="system", content=system_prompt_text)
|
115
107
|
|
116
|
-
self.user_prompt_template = PromptLoader.load_prompt(agent_config.get_codeact_user_prompt_path())
|
117
|
-
|
118
108
|
self.executor = SimpleCodeExecutor(
|
119
109
|
loop=asyncio.get_event_loop(),
|
120
110
|
locals={},
|
121
111
|
tools=self.tool_list,
|
122
|
-
tools_instance=tools_instance,
|
123
112
|
globals={"__builtins__": __builtins__},
|
124
113
|
)
|
125
114
|
|
@@ -293,27 +282,30 @@ Now, describe the next step you will take to address the original goal: {goal}""
|
|
293
282
|
try:
|
294
283
|
self.code_exec_counter += 1
|
295
284
|
result = await self.executor.execute(ExecuterState(ui_state=ctx.store.get("ui_state", None)), code)
|
296
|
-
logger.info(f"💡 Code execution successful. Result: {result
|
285
|
+
logger.info(f"💡 Code execution successful. Result: {result}")
|
297
286
|
await asyncio.sleep(self.agent_config.after_sleep_action)
|
298
|
-
screenshots = result['screenshots']
|
299
|
-
for screenshot in screenshots[:-1]: # the last screenshot will be captured by next step
|
300
|
-
ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
|
301
|
-
|
302
|
-
ui_states = result['ui_states']
|
303
|
-
for ui_state in ui_states[:-1]:
|
304
|
-
ctx.write_event_to_stream(RecordUIStateEvent(ui_state=ui_state['a11y_tree']))
|
305
287
|
|
288
|
+
# Check if complete() was called
|
306
289
|
if self.tools.finished:
|
307
|
-
logger.
|
308
|
-
|
309
|
-
|
310
|
-
|
290
|
+
logger.info("✅ Task marked as complete via complete() function")
|
291
|
+
|
292
|
+
# Validate completion state
|
293
|
+
success = self.tools.success if self.tools.success is not None else False
|
294
|
+
reason = self.tools.reason if self.tools.reason else "Task completed without reason"
|
295
|
+
|
296
|
+
# Reset finished flag for next execution
|
297
|
+
self.tools.finished = False
|
298
|
+
|
299
|
+
logger.info(f" - Success: {success}")
|
300
|
+
logger.info(f" - Reason: {reason}")
|
301
|
+
|
302
|
+
event = TaskEndEvent(success=success, reason=reason)
|
311
303
|
ctx.write_event_to_stream(event)
|
312
304
|
return event
|
313
305
|
|
314
306
|
self.remembered_info = self.tools.memory
|
315
307
|
|
316
|
-
event = TaskExecutionResultEvent(output=str(result
|
308
|
+
event = TaskExecutionResultEvent(output=str(result))
|
317
309
|
ctx.write_event_to_stream(event)
|
318
310
|
return event
|
319
311
|
|
@@ -496,7 +488,7 @@ Now, describe the next step you will take to address the original goal: {goal}""
|
|
496
488
|
try:
|
497
489
|
state = self.tools.get_state()
|
498
490
|
a11y_tree = state.get("a11y_tree", "")
|
499
|
-
phone_state = state.get("phone_state", "")
|
491
|
+
phone_state = state.get("phone_state", "") # noqa: F841
|
500
492
|
except Exception as e:
|
501
493
|
raise Exception(f"Failed to capture final UI state: {e}") from e
|
502
494
|
|
@@ -32,7 +32,7 @@ from droidrun.agent.droid.events import (
|
|
32
32
|
)
|
33
33
|
from droidrun.agent.executor import ExecutorAgent
|
34
34
|
from droidrun.agent.manager import ManagerAgent
|
35
|
-
from droidrun.agent.utils.tools import ATOMIC_ACTION_SIGNATURES
|
35
|
+
from droidrun.agent.utils.tools import ATOMIC_ACTION_SIGNATURES, open_app
|
36
36
|
from droidrun.agent.utils.trajectory import Trajectory
|
37
37
|
from droidrun.config_manager.config_manager import (
|
38
38
|
AgentConfig,
|
@@ -43,8 +43,6 @@ from droidrun.config_manager.config_manager import (
|
|
43
43
|
ToolsConfig,
|
44
44
|
TracingConfig,
|
45
45
|
)
|
46
|
-
|
47
|
-
from droidrun.agent.utils.tools import open_app
|
48
46
|
from droidrun.telemetry import (
|
49
47
|
DroidAgentFinalizeEvent,
|
50
48
|
DroidAgentInitEvent,
|
droidrun/agent/droid/events.py
CHANGED
@@ -10,10 +10,11 @@ For internal events with full debugging metadata, see:
|
|
10
10
|
- codeact/events.py (Task*, EpisodicMemoryEvent)
|
11
11
|
"""
|
12
12
|
|
13
|
+
import asyncio
|
13
14
|
from typing import Dict, List
|
14
15
|
|
15
16
|
from llama_index.core.workflow import Event
|
16
|
-
from pydantic import BaseModel, Field
|
17
|
+
from pydantic import BaseModel, ConfigDict, Field
|
17
18
|
|
18
19
|
from droidrun.agent.context import Task
|
19
20
|
|
@@ -46,10 +47,12 @@ class DroidAgentState(BaseModel):
|
|
46
47
|
"""
|
47
48
|
State model for DroidAgent workflow - shared across parent and child workflows.
|
48
49
|
"""
|
49
|
-
|
50
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
50
51
|
# Task context
|
51
52
|
instruction: str = ""
|
52
|
-
|
53
|
+
# App Cards
|
54
|
+
app_card: str = ""
|
55
|
+
app_card_loading_task: asyncio.Task[str] | None = None
|
53
56
|
# Formatted device state for prompts (complete text)
|
54
57
|
formatted_device_state: str = ""
|
55
58
|
|
@@ -90,52 +90,38 @@ class ExecutorAgent(Workflow): # TODO: Fix a bug in bad prompt
|
|
90
90
|
subgoal = ev.get("subgoal", "")
|
91
91
|
logger.info(f"🧠 Executor thinking about action for: {subgoal}")
|
92
92
|
|
93
|
-
#
|
94
|
-
|
95
|
-
app_card_text = ""
|
96
|
-
if app_card.strip():
|
97
|
-
app_card_text = "App card gives information on how to operate the app and perform actions.\n### App Card ###\n" + app_card.strip() + "\n\n"
|
98
|
-
|
99
|
-
# Format device state (use unified state)
|
100
|
-
device_state_text = ""
|
101
|
-
if self.shared_state.formatted_device_state and self.shared_state.formatted_device_state.strip():
|
102
|
-
device_state_text = "### Device State ###\n" + self.shared_state.formatted_device_state.strip() + "\n\n"
|
103
|
-
|
104
|
-
# Format progress status
|
105
|
-
progress_status_text = self.shared_state.progress_status + "\n\n" if self.shared_state.progress_status else "No progress yet.\n\n"
|
106
|
-
|
107
|
-
# Format atomic actions
|
108
|
-
atomic_actions_text = chr(10).join(
|
109
|
-
f"- {action_name}({', '.join(action_info['arguments'])}): {action_info['description']}"
|
110
|
-
for action_name, action_info in ATOMIC_ACTION_SIGNATURES.items()
|
111
|
-
) + "\n"
|
112
|
-
|
113
|
-
# Format action history
|
93
|
+
# Prepare action history as structured data (last 5 actions)
|
94
|
+
action_history = []
|
114
95
|
if self.shared_state.action_history:
|
115
|
-
|
116
|
-
|
117
|
-
|
96
|
+
n = min(5, len(self.shared_state.action_history))
|
97
|
+
action_history = [
|
98
|
+
{
|
99
|
+
"action": act,
|
100
|
+
"summary": summ,
|
101
|
+
"outcome": outcome,
|
102
|
+
"error": err_des
|
103
|
+
}
|
118
104
|
for act, summ, outcome, err_des in zip(
|
119
|
-
self.shared_state.action_history[-
|
120
|
-
self.shared_state.summary_history[-
|
121
|
-
self.shared_state.action_outcomes[-
|
122
|
-
self.shared_state.error_descriptions[-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
#
|
105
|
+
self.shared_state.action_history[-n:],
|
106
|
+
self.shared_state.summary_history[-n:],
|
107
|
+
self.shared_state.action_outcomes[-n:],
|
108
|
+
self.shared_state.error_descriptions[-n:],
|
109
|
+
strict=True
|
110
|
+
)
|
111
|
+
]
|
112
|
+
|
113
|
+
# Let Jinja2 handle all formatting
|
128
114
|
system_prompt = PromptLoader.load_prompt(
|
129
115
|
self.agent_config.get_executor_system_prompt_path(),
|
130
116
|
{
|
131
117
|
"instruction": self.shared_state.instruction,
|
132
|
-
"app_card":
|
133
|
-
"
|
118
|
+
"app_card": "", # TODO: Implement app card loader
|
119
|
+
"device_state": self.shared_state.formatted_device_state,
|
134
120
|
"plan": self.shared_state.plan,
|
135
121
|
"subgoal": subgoal,
|
136
|
-
"progress_status":
|
137
|
-
"atomic_actions":
|
138
|
-
"action_history":
|
122
|
+
"progress_status": self.shared_state.progress_status,
|
123
|
+
"atomic_actions": ATOMIC_ACTION_SIGNATURES,
|
124
|
+
"action_history": action_history
|
139
125
|
}
|
140
126
|
)
|
141
127
|
|
@@ -2,114 +2,6 @@
|
|
2
2
|
Prompts for the ExecutorAgent.
|
3
3
|
"""
|
4
4
|
|
5
|
-
|
6
|
-
from droidrun.agent.droid.events import DroidAgentState
|
7
|
-
from droidrun.agent.utils.tools import ATOMIC_ACTION_SIGNATURES
|
8
|
-
|
9
|
-
|
10
|
-
def build_executor_system_prompt(
|
11
|
-
state: "DroidAgentState",
|
12
|
-
subgoal: str,
|
13
|
-
app_card: str = "",
|
14
|
-
) -> str:
|
15
|
-
"""
|
16
|
-
Build the complete Executor system prompt with all context.
|
17
|
-
|
18
|
-
Args:
|
19
|
-
state: Current DroidAgentState with all context
|
20
|
-
subgoal: Current subgoal to execute
|
21
|
-
app_card: Optional app-specific instructions
|
22
|
-
|
23
|
-
Returns:
|
24
|
-
Complete system prompt for the Executor
|
25
|
-
"""
|
26
|
-
prompt = f"""You are a LOW-LEVEL ACTION EXECUTOR for an Android phone. You do NOT answer questions or provide results. You ONLY perform individual atomic actions as specified in the current subgoal. You are part of a larger system - your job is to execute actions, not to think about or answer the user's original question.
|
27
|
-
|
28
|
-
### User Request ###
|
29
|
-
{state.instruction}
|
30
|
-
|
31
|
-
{("App card gives information on how to operate the app and perform actions.\n" + "### App Card ###\n" + app_card.strip() + "\n\n") if app_card.strip() else ""}{(("### Device State ###\n" + state.device_state_text.strip() + "\n\n") if state.device_state_text.strip() else "")}### Overall Plan ###
|
32
|
-
{state.plan}
|
33
|
-
|
34
|
-
### Current Subgoal ###
|
35
|
-
EXECUTE THIS SUBGOAL: {subgoal}
|
36
|
-
|
37
|
-
EXECUTION MODE: You are a dumb robot. Find the exact text/element mentioned in the subgoal above and perform the specified action on it. Do not read anything below this line until after you execute the subgoal.
|
38
|
-
|
39
|
-
### SUBGOAL PARSING MODE ###
|
40
|
-
Read the current subgoal exactly as written. Look for:
|
41
|
-
- Action words: "tap", "click", "swipe", "type", "press", "open" etc.
|
42
|
-
- Target elements: specific text, buttons, fields, coordinates mentioned
|
43
|
-
- Locations: "header", "bottom", "left", "right", specific coordinates
|
44
|
-
Convert directly to atomic action:
|
45
|
-
- "tap/click" → click action
|
46
|
-
- "swipe" → swipe action
|
47
|
-
- "type" → type action
|
48
|
-
- "press [system button]" → system_button action
|
49
|
-
- "open [app]" → open_app action
|
50
|
-
Execute the atomic action for the exact target mentioned. Ignore everything else.
|
51
|
-
|
52
|
-
### Progress Status ###
|
53
|
-
{(state.progress_status + "\n\n") if state.progress_status != "" else "No progress yet.\n\n"}
|
54
|
-
|
55
|
-
### Guidelines ###
|
56
|
-
General:
|
57
|
-
- For any pop-up window, such as a permission request, you need to close it (e.g., by clicking `Don't Allow` or `Accept & continue`) before proceeding. Never choose to add any account or log in.
|
58
|
-
Action Related:
|
59
|
-
- Use the `open_app` action whenever you want to open an app (nothing will happen if the app is not installed), do not use the app drawer to open an app.
|
60
|
-
- Consider exploring the screen by using the `swipe` action with different directions to reveal additional content. Or use search to quickly find a specific entry, if applicable.
|
61
|
-
- If you cannot change the page content by swiping in the same direction continuously, the page may have been swiped to the bottom. Please try another operation to display more content.
|
62
|
-
- For some horizontally distributed tags, you can swipe horizontally to view more.
|
63
|
-
Text Related Operations:
|
64
|
-
- Activated input box: If an input box is activated, it may have a cursor inside it and the keyboard is visible. If there is no cursor on the screen but the keyboard is visible, it may be because the cursor is blinking. The color of the activated input box will be highlighted. If you are not sure whether the input box is activated, click it before typing.
|
65
|
-
- To input some text: first click the input box that you want to input, make sure the correct input box is activated and the keyboard is visible, then use `type` action to enter the specified text.
|
66
|
-
- To clear the text: long press the backspace button in the keyboard.
|
67
|
-
- To copy some text: first long press the text you want to copy, then click the `copy` button in bar.
|
68
|
-
- To paste text into a text box: first long press the text box, then click the `paste` button in bar.
|
69
|
-
|
70
|
-
---
|
71
|
-
Execute the current subgoal mechanically. Do NOT examine the screen content or make decisions about what you see. Parse the current subgoal text to identify the required action and execute it exactly as written. You must choose your action from one of the atomic actions.
|
72
|
-
|
73
|
-
#### Atomic Actions ####
|
74
|
-
The atomic action functions are listed in the format of `action(arguments): description` as follows:
|
75
|
-
{chr(10).join(f"- {action_name}({', '.join(action_info['arguments'])}): {action_info['description']}" for action_name, action_info in ATOMIC_ACTION_SIGNATURES.items())}
|
76
|
-
\n
|
77
|
-
### Latest Action History ###
|
78
|
-
{(("Recent actions you took previously and whether they were successful:\n" + "\n".join(
|
79
|
-
(f"Action: {act} | Description: {summ} | Outcome: Successful" if outcome
|
80
|
-
else f"Action: {act} | Description: {summ} | Outcome: Failed | Feedback: {err_des}")
|
81
|
-
for act, summ, outcome, err_des in zip(
|
82
|
-
state.action_history[-min(5, len(state.action_history)):],
|
83
|
-
state.summary_history[-min(5, len(state.action_history)):],
|
84
|
-
state.action_outcomes[-min(5, len(state.action_history)):],
|
85
|
-
state.error_descriptions[-min(5, len(state.action_history)):], strict=True)
|
86
|
-
) + "\n\n")) if state.action_history else "No actions have been taken yet.\n\n"}
|
87
|
-
|
88
|
-
---
|
89
|
-
### LITERAL EXECUTION RULE ###
|
90
|
-
Whatever the current subgoal says to do, do that EXACTLY. Do not substitute with what you think is better. Do not optimize. Do not consider screen state. Parse the subgoal text literally and execute the matching atomic action.
|
91
|
-
|
92
|
-
IMPORTANT:
|
93
|
-
1. Do NOT repeat previously failed actions multiple times. Try changing to another action.
|
94
|
-
2. Must do the current subgoal.
|
95
|
-
|
96
|
-
Provide your output in the following format, which contains three parts:
|
97
|
-
|
98
|
-
### Thought ###
|
99
|
-
Break down the current subgoal into: (1) What atomic action is required? (2) What target/location is specified? (3) What parameters do I need? Do NOT reason about whether this makes sense - just mechanically convert the subgoal text into the appropriate action format.
|
100
|
-
|
101
|
-
### Action ###
|
102
|
-
Choose only one action or shortcut from the options provided.
|
103
|
-
You must provide your decision using a valid JSON format specifying the `action` and the arguments of the action. For example, if you want to open an App, you should write {{ "action":"open_app", "text": "app name" }}.
|
104
|
-
|
105
|
-
### Description ###
|
106
|
-
A brief description of the chosen action. Do not describe expected outcome.
|
107
|
-
"""
|
108
|
-
|
109
|
-
|
110
|
-
return prompt
|
111
|
-
|
112
|
-
|
113
5
|
def parse_executor_response(response: str) -> dict:
|
114
6
|
"""
|
115
7
|
Parse the Executor LLM response.
|
@@ -3,7 +3,7 @@ Manager Agent - Planning and reasoning workflow.
|
|
3
3
|
"""
|
4
4
|
|
5
5
|
from droidrun.agent.droid.events import ManagerInputEvent, ManagerPlanEvent
|
6
|
-
from droidrun.agent.manager.events import
|
6
|
+
from droidrun.agent.manager.events import ManagerInternalPlanEvent, ManagerThinkingEvent
|
7
7
|
from droidrun.agent.manager.manager_agent import ManagerAgent
|
8
8
|
from droidrun.agent.manager.prompts import parse_manager_response
|
9
9
|
|
@@ -10,6 +10,7 @@ This agent is responsible for:
|
|
10
10
|
|
11
11
|
from __future__ import annotations
|
12
12
|
|
13
|
+
import asyncio
|
13
14
|
import logging
|
14
15
|
from typing import TYPE_CHECKING
|
15
16
|
|
@@ -23,13 +24,18 @@ from droidrun.agent.utils.chat_utils import remove_empty_messages
|
|
23
24
|
from droidrun.agent.utils.device_state_formatter import format_device_state
|
24
25
|
from droidrun.agent.utils.inference import acall_with_retries
|
25
26
|
from droidrun.agent.utils.tools import build_custom_tool_descriptions
|
27
|
+
from droidrun.app_cards.app_card_provider import AppCardProvider
|
28
|
+
from droidrun.app_cards.providers import (
|
29
|
+
CompositeAppCardProvider,
|
30
|
+
LocalAppCardProvider,
|
31
|
+
ServerAppCardProvider,
|
32
|
+
)
|
26
33
|
from droidrun.config_manager.prompt_loader import PromptLoader
|
27
|
-
from droidrun.config_manager.app_card_loader import AppCardLoader
|
28
34
|
|
29
35
|
if TYPE_CHECKING:
|
30
36
|
from droidrun.agent.droid.events import DroidAgentState
|
31
|
-
from droidrun.tools import Tools
|
32
37
|
from droidrun.config_manager.config_manager import AgentConfig
|
38
|
+
from droidrun.tools import Tools
|
33
39
|
|
34
40
|
|
35
41
|
logger = logging.getLogger("droidrun")
|
@@ -63,33 +69,75 @@ class ManagerAgent(Workflow):
|
|
63
69
|
self.shared_state = shared_state
|
64
70
|
self.custom_tools = custom_tools or {}
|
65
71
|
self.agent_config = agent_config
|
66
|
-
self.
|
72
|
+
self.app_card_config = self.agent_config.app_cards
|
73
|
+
|
74
|
+
# Initialize app card provider based on mode
|
75
|
+
self.app_card_provider: AppCardProvider = self._initialize_app_card_provider()
|
67
76
|
|
68
77
|
logger.info("✅ ManagerAgent initialized successfully.")
|
69
78
|
|
79
|
+
def _initialize_app_card_provider(self) -> AppCardProvider:
|
80
|
+
"""Initialize app card provider based on configuration mode."""
|
81
|
+
if not self.app_card_config.enabled:
|
82
|
+
# Return a dummy provider that always returns empty string
|
83
|
+
class DisabledProvider(AppCardProvider):
|
84
|
+
async def load_app_card(self, package_name: str, instruction: str = "") -> str:
|
85
|
+
return ""
|
86
|
+
return DisabledProvider()
|
87
|
+
|
88
|
+
mode = self.app_card_config.mode.lower()
|
89
|
+
|
90
|
+
if mode == "local":
|
91
|
+
logger.info(f"Initializing local app card provider (dir: {self.app_card_config.app_cards_dir})")
|
92
|
+
return LocalAppCardProvider(app_cards_dir=self.app_card_config.app_cards_dir)
|
93
|
+
|
94
|
+
elif mode == "server":
|
95
|
+
if not self.app_card_config.server_url:
|
96
|
+
logger.warning("Server mode enabled but no server_url configured, falling back to local")
|
97
|
+
return LocalAppCardProvider(app_cards_dir=self.app_card_config.app_cards_dir)
|
98
|
+
|
99
|
+
logger.info(f"Initializing server app card provider (url: {self.app_card_config.server_url})")
|
100
|
+
return ServerAppCardProvider(
|
101
|
+
server_url=self.app_card_config.server_url,
|
102
|
+
timeout=self.app_card_config.server_timeout,
|
103
|
+
max_retries=self.app_card_config.server_max_retries,
|
104
|
+
)
|
105
|
+
|
106
|
+
elif mode == "composite":
|
107
|
+
if not self.app_card_config.server_url:
|
108
|
+
logger.warning("Composite mode enabled but no server_url configured, falling back to local")
|
109
|
+
return LocalAppCardProvider(app_cards_dir=self.app_card_config.app_cards_dir)
|
110
|
+
|
111
|
+
logger.info(
|
112
|
+
f"Initializing composite app card provider "
|
113
|
+
f"(server: {self.app_card_config.server_url}, local: {self.app_card_config.app_cards_dir})"
|
114
|
+
)
|
115
|
+
return CompositeAppCardProvider(
|
116
|
+
server_url=self.app_card_config.server_url,
|
117
|
+
app_cards_dir=self.app_card_config.app_cards_dir,
|
118
|
+
server_timeout=self.app_card_config.server_timeout,
|
119
|
+
server_max_retries=self.app_card_config.server_max_retries,
|
120
|
+
)
|
121
|
+
|
122
|
+
else:
|
123
|
+
logger.warning(f"Unknown app_card mode '{mode}', falling back to local")
|
124
|
+
return LocalAppCardProvider(app_cards_dir=self.app_card_config.app_cards_dir)
|
125
|
+
|
70
126
|
# ========================================================================
|
71
127
|
# Helper Methods
|
72
128
|
# ========================================================================
|
73
129
|
|
74
130
|
def _build_system_prompt(
|
75
131
|
self,
|
76
|
-
has_text_to_modify: bool
|
77
|
-
app_card: str = ""
|
132
|
+
has_text_to_modify: bool
|
78
133
|
) -> str:
|
79
|
-
"""
|
80
|
-
Build system prompt with all context.
|
134
|
+
"""Build system prompt with all context."""
|
81
135
|
|
82
|
-
|
83
|
-
|
84
|
-
app_card: App card content
|
85
|
-
Returns:
|
86
|
-
Complete system prompt
|
87
|
-
"""
|
88
|
-
# Format error history
|
89
|
-
error_history_text = ""
|
136
|
+
# Prepare error history as structured data (if needed)
|
137
|
+
error_history = None
|
90
138
|
if self.shared_state.error_flag_plan:
|
91
139
|
k = self.shared_state.err_to_manager_thresh
|
92
|
-
|
140
|
+
error_history = [
|
93
141
|
{
|
94
142
|
"action": act,
|
95
143
|
"summary": summ,
|
@@ -98,78 +146,22 @@ class ManagerAgent(Workflow):
|
|
98
146
|
for act, summ, err_des in zip(
|
99
147
|
self.shared_state.action_history[-k:],
|
100
148
|
self.shared_state.summary_history[-k:],
|
101
|
-
self.shared_state.error_descriptions[-k:],
|
149
|
+
self.shared_state.error_descriptions[-k:],
|
150
|
+
strict=True
|
102
151
|
)
|
103
152
|
]
|
104
|
-
|
105
|
-
|
106
|
-
"You have encountered several failed attempts. Here are some logs:\n"
|
107
|
-
)
|
108
|
-
for error in errors:
|
109
|
-
error_history_text += (
|
110
|
-
f"- Attempt: Action: {error['action']} | "
|
111
|
-
f"Description: {error['summary']} | "
|
112
|
-
f"Outcome: Failed | "
|
113
|
-
f"Feedback: {error['error']}\n"
|
114
|
-
)
|
115
|
-
error_history_text += "</potentially_stuck>\n\n"
|
116
|
-
|
117
|
-
# Text manipulation section
|
118
|
-
text_manipulation_section = ""
|
119
|
-
if has_text_to_modify:
|
120
|
-
text_manipulation_section = """
|
121
|
-
|
122
|
-
<text_manipulation>
|
123
|
-
1. Use **TEXT_TASK:** prefix in your plan when you need to modify text in the currently focused text input field
|
124
|
-
2. TEXT_TASK is for editing, formatting, or transforming existing text content in text boxes using Python code
|
125
|
-
3. Do not use TEXT_TASK for extracting text from messages, typing new text, or composing messages
|
126
|
-
4. The focused text field contains editable text that you can modify
|
127
|
-
5. Example plan item: 'TEXT_TASK: Add "Hello World" at the beginning of the text'
|
128
|
-
6. Always use TEXT_TASK for modifying text, do not try to select the text to copy/cut/paste or adjust the text
|
129
|
-
</text_manipulation>"""
|
130
|
-
|
131
|
-
# Device date (include tags in variable value or empty string)
|
132
|
-
device_date = self.tools_instance.get_date()
|
133
|
-
device_date_text = ""
|
134
|
-
if device_date.strip():
|
135
|
-
device_date_text = f"<device_date>\n{device_date}\n</device_date>\n\n"
|
136
|
-
|
137
|
-
# App card (include tags in variable value or empty string)
|
138
|
-
app_card = app_card
|
139
|
-
app_card_text = ""
|
140
|
-
if app_card.strip():
|
141
|
-
app_card_text = "App card gives information on how to operate the app and perform actions.\n<app_card>\n" + app_card.strip() + "\n</app_card>\n\n"
|
142
|
-
|
143
|
-
# Important notes (include tags in variable value or empty string)
|
144
|
-
important_notes = "" # TODO: implement
|
145
|
-
important_notes_text = ""
|
146
|
-
if important_notes.strip():
|
147
|
-
important_notes_text = "<important_notes>\n" + important_notes + "\n</important_notes>\n\n"
|
148
|
-
|
149
|
-
# Custom tools
|
150
|
-
custom_tools_desc = build_custom_tool_descriptions(self.custom_tools)
|
151
|
-
custom_tools_text = ""
|
152
|
-
if custom_tools_desc.strip():
|
153
|
-
custom_tools_text = """
|
154
|
-
|
155
|
-
<custom_actions>
|
156
|
-
The executor has access to these additional custom actions beyond the standard actions (click, type, swipe, etc.):
|
157
|
-
""" + custom_tools_desc + """
|
158
|
-
|
159
|
-
You can reference these custom actions or tell the Executer agent to use them in your plan when they help achieve the user's goal.
|
160
|
-
</custom_actions>"""
|
161
|
-
|
162
|
-
# Load and format prompt
|
153
|
+
|
154
|
+
# Let Jinja2 handle all formatting and conditionals
|
163
155
|
return PromptLoader.load_prompt(
|
164
156
|
self.agent_config.get_manager_system_prompt_path(),
|
165
157
|
{
|
166
158
|
"instruction": self.shared_state.instruction,
|
167
|
-
"device_date":
|
168
|
-
"app_card":
|
169
|
-
"important_notes":
|
170
|
-
"error_history":
|
171
|
-
"
|
172
|
-
"custom_tools_descriptions":
|
159
|
+
"device_date": self.tools_instance.get_date(),
|
160
|
+
"app_card": self.shared_state.app_card,
|
161
|
+
"important_notes": "", # TODO: implement
|
162
|
+
"error_history": error_history,
|
163
|
+
"text_manipulation_enabled": has_text_to_modify,
|
164
|
+
"custom_tools_descriptions": build_custom_tool_descriptions(self.custom_tools)
|
173
165
|
}
|
174
166
|
)
|
175
167
|
|
@@ -339,7 +331,17 @@ You can reference these custom actions or tell the Executer agent to use them in
|
|
339
331
|
self.shared_state.current_package_name = phone_state.get('packageName', 'Unknown')
|
340
332
|
self.shared_state.current_app_name = phone_state.get('currentApp', 'Unknown')
|
341
333
|
|
342
|
-
#
|
334
|
+
# ====================================================================
|
335
|
+
# Step 1.5: Start loading app card in background
|
336
|
+
# ====================================================================
|
337
|
+
if self.app_card_config.enabled:
|
338
|
+
loading_task = asyncio.create_task(
|
339
|
+
self.app_card_provider.load_app_card(
|
340
|
+
package_name=self.shared_state.current_package_name,
|
341
|
+
instruction=self.shared_state.instruction
|
342
|
+
)
|
343
|
+
)
|
344
|
+
self.shared_state.app_card_loading_task = loading_task
|
343
345
|
|
344
346
|
# ====================================================================
|
345
347
|
# Step 2: Capture screenshot if vision enabled
|
@@ -417,15 +419,30 @@ You can reference these custom actions or tell the Executer agent to use them in
|
|
417
419
|
|
418
420
|
has_text_to_modify = self.shared_state.has_text_to_modify
|
419
421
|
screenshot = self.shared_state.screenshot
|
420
|
-
|
421
|
-
|
422
|
+
|
423
|
+
# ====================================================================
|
424
|
+
# Try to get app card from previous iteration's loading task
|
425
|
+
# ====================================================================
|
426
|
+
if self.app_card_config.enabled and self.shared_state.app_card_loading_task:
|
427
|
+
try:
|
428
|
+
# Wait briefly for the background task to complete (0.1s timeout)
|
429
|
+
self.shared_state.app_card = await asyncio.wait_for(
|
430
|
+
self.shared_state.app_card_loading_task,
|
431
|
+
timeout=0.1
|
432
|
+
)
|
433
|
+
except asyncio.TimeoutError:
|
434
|
+
# Task not ready yet, use empty string
|
435
|
+
self.shared_state.app_card = ""
|
436
|
+
except Exception as e:
|
437
|
+
logger.warning(f"Error getting app card: {e}")
|
438
|
+
self.shared_state.app_card = ""
|
422
439
|
else:
|
423
|
-
app_card = ""
|
440
|
+
self.shared_state.app_card = ""
|
424
441
|
|
425
442
|
# ====================================================================
|
426
443
|
# Step 1: Build system prompt
|
427
444
|
# ====================================================================
|
428
|
-
system_prompt = self._build_system_prompt(has_text_to_modify
|
445
|
+
system_prompt = self._build_system_prompt(has_text_to_modify)
|
429
446
|
|
430
447
|
# ====================================================================
|
431
448
|
# Step 2: Build messages with context
|