droidrun-0.3.10.dev5-py3-none-any.whl → droidrun-0.3.10.dev7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. droidrun/agent/codeact/codeact_agent.py +21 -29
  2. droidrun/agent/context/task_manager.py +0 -1
  3. droidrun/agent/droid/droid_agent.py +1 -3
  4. droidrun/agent/droid/events.py +6 -3
  5. droidrun/agent/executor/executor_agent.py +24 -38
  6. droidrun/agent/executor/prompts.py +0 -108
  7. droidrun/agent/manager/__init__.py +1 -1
  8. droidrun/agent/manager/manager_agent.py +104 -87
  9. droidrun/agent/utils/executer.py +11 -10
  10. droidrun/agent/utils/llm_picker.py +63 -1
  11. droidrun/agent/utils/tools.py +30 -1
  12. droidrun/app_cards/app_card_provider.py +26 -0
  13. droidrun/app_cards/providers/__init__.py +7 -0
  14. droidrun/app_cards/providers/composite_provider.py +97 -0
  15. droidrun/app_cards/providers/local_provider.py +115 -0
  16. droidrun/app_cards/providers/server_provider.py +126 -0
  17. droidrun/cli/logs.py +4 -4
  18. droidrun/cli/main.py +244 -34
  19. droidrun/config_manager/__init__.py +0 -2
  20. droidrun/config_manager/config_manager.py +45 -102
  21. droidrun/config_manager/path_resolver.py +1 -1
  22. droidrun/config_manager/prompt_loader.py +48 -51
  23. droidrun/macro/cli.py +0 -1
  24. droidrun/portal.py +17 -0
  25. droidrun/tools/adb.py +13 -34
  26. {droidrun-0.3.10.dev5.dist-info → droidrun-0.3.10.dev7.dist-info}/METADATA +2 -9
  27. {droidrun-0.3.10.dev5.dist-info → droidrun-0.3.10.dev7.dist-info}/RECORD +30 -26
  28. droidrun/config_manager/app_card_loader.py +0 -148
  29. {droidrun-0.3.10.dev5.dist-info → droidrun-0.3.10.dev7.dist-info}/WHEEL +0 -0
  30. {droidrun-0.3.10.dev5.dist-info → droidrun-0.3.10.dev7.dist-info}/entry_points.txt +0 -0
  31. {droidrun-0.3.10.dev5.dist-info → droidrun-0.3.10.dev7.dist-info}/licenses/LICENSE +0 -0
@@ -3,7 +3,7 @@ import json
3
3
  import logging
4
4
  import re
5
5
  import time
6
- from typing import List, Union, Optional, TYPE_CHECKING
6
+ from typing import TYPE_CHECKING, List, Optional, Union
7
7
 
8
8
  from llama_index.core.base.llms.types import ChatMessage, ChatResponse
9
9
  from llama_index.core.llms.llm import LLM
@@ -23,9 +23,8 @@ from droidrun.agent.common.events import RecordUIStateEvent, ScreenshotEvent
23
23
  from droidrun.agent.context.episodic_memory import EpisodicMemory, EpisodicMemoryStep
24
24
  from droidrun.agent.usage import get_usage_from_response
25
25
  from droidrun.agent.utils import chat_utils
26
- from droidrun.agent.utils.executer import SimpleCodeExecutor, ExecuterState
27
26
  from droidrun.agent.utils.device_state_formatter import format_device_state
28
-
27
+ from droidrun.agent.utils.executer import ExecuterState, SimpleCodeExecutor
29
28
  from droidrun.agent.utils.tools import (
30
29
  ATOMIC_ACTION_SIGNATURES,
31
30
  build_custom_tool_descriptions,
@@ -85,15 +84,8 @@ class CodeActAgent(Workflow):
85
84
  self.tool_list = {}
86
85
  for action_name, signature in merged_signatures.items():
87
86
  func = signature["function"]
88
- if asyncio.iscoroutinefunction(func):
89
- # Create async bound function with proper closure
90
- def make_bound(f, ti):
91
- async def bound_func(*args, **kwargs):
92
- return await f(ti, *args, **kwargs)
93
- return bound_func
94
- self.tool_list[action_name] = make_bound(func, tools_instance)
95
- else:
96
- self.tool_list[action_name] = lambda *args, f=func, ti=tools_instance, **kwargs: f(ti, *args, **kwargs)
87
+
88
+ self.tool_list[action_name] = lambda *args, f=func, ti=tools_instance, **kwargs: f(ti, *args, **kwargs)
97
89
 
98
90
  self.tool_list["remember"] = tools_instance.remember
99
91
  self.tool_list["complete"] = tools_instance.complete
@@ -113,13 +105,10 @@ class CodeActAgent(Workflow):
113
105
  )
114
106
  self.system_prompt = ChatMessage(role="system", content=system_prompt_text)
115
107
 
116
- self.user_prompt_template = PromptLoader.load_prompt(agent_config.get_codeact_user_prompt_path())
117
-
118
108
  self.executor = SimpleCodeExecutor(
119
109
  loop=asyncio.get_event_loop(),
120
110
  locals={},
121
111
  tools=self.tool_list,
122
- tools_instance=tools_instance,
123
112
  globals={"__builtins__": __builtins__},
124
113
  )
125
114
 
@@ -293,27 +282,30 @@ Now, describe the next step you will take to address the original goal: {goal}""
293
282
  try:
294
283
  self.code_exec_counter += 1
295
284
  result = await self.executor.execute(ExecuterState(ui_state=ctx.store.get("ui_state", None)), code)
296
- logger.info(f"💡 Code execution successful. Result: {result['output']}")
285
+ logger.info(f"💡 Code execution successful. Result: {result}")
297
286
  await asyncio.sleep(self.agent_config.after_sleep_action)
298
- screenshots = result['screenshots']
299
- for screenshot in screenshots[:-1]: # the last screenshot will be captured by next step
300
- ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
301
-
302
- ui_states = result['ui_states']
303
- for ui_state in ui_states[:-1]:
304
- ctx.write_event_to_stream(RecordUIStateEvent(ui_state=ui_state['a11y_tree']))
305
287
 
288
+ # Check if complete() was called
306
289
  if self.tools.finished:
307
- logger.debug(" - Task completed.")
308
- event = TaskEndEvent(
309
- success=self.tools.success, reason=self.tools.reason
310
- )
290
+ logger.info(" Task marked as complete via complete() function")
291
+
292
+ # Validate completion state
293
+ success = self.tools.success if self.tools.success is not None else False
294
+ reason = self.tools.reason if self.tools.reason else "Task completed without reason"
295
+
296
+ # Reset finished flag for next execution
297
+ self.tools.finished = False
298
+
299
+ logger.info(f" - Success: {success}")
300
+ logger.info(f" - Reason: {reason}")
301
+
302
+ event = TaskEndEvent(success=success, reason=reason)
311
303
  ctx.write_event_to_stream(event)
312
304
  return event
313
305
 
314
306
  self.remembered_info = self.tools.memory
315
307
 
316
- event = TaskExecutionResultEvent(output=str(result['output']))
308
+ event = TaskExecutionResultEvent(output=str(result))
317
309
  ctx.write_event_to_stream(event)
318
310
  return event
319
311
 
@@ -496,7 +488,7 @@ Now, describe the next step you will take to address the original goal: {goal}""
496
488
  try:
497
489
  state = self.tools.get_state()
498
490
  a11y_tree = state.get("a11y_tree", "")
499
- phone_state = state.get("phone_state", "")
491
+ phone_state = state.get("phone_state", "") # noqa: F841
500
492
  except Exception as e:
501
493
  raise Exception(f"Failed to capture final UI state: {e}") from e
502
494
 
@@ -1,5 +1,4 @@
1
1
  import copy
2
- import os
3
2
  from dataclasses import dataclass
4
3
  from typing import Dict, List, Optional
5
4
 
@@ -32,7 +32,7 @@ from droidrun.agent.droid.events import (
32
32
  )
33
33
  from droidrun.agent.executor import ExecutorAgent
34
34
  from droidrun.agent.manager import ManagerAgent
35
- from droidrun.agent.utils.tools import ATOMIC_ACTION_SIGNATURES
35
+ from droidrun.agent.utils.tools import ATOMIC_ACTION_SIGNATURES, open_app
36
36
  from droidrun.agent.utils.trajectory import Trajectory
37
37
  from droidrun.config_manager.config_manager import (
38
38
  AgentConfig,
@@ -43,8 +43,6 @@ from droidrun.config_manager.config_manager import (
43
43
  ToolsConfig,
44
44
  TracingConfig,
45
45
  )
46
-
47
- from droidrun.agent.utils.tools import open_app
48
46
  from droidrun.telemetry import (
49
47
  DroidAgentFinalizeEvent,
50
48
  DroidAgentInitEvent,
@@ -10,10 +10,11 @@ For internal events with full debugging metadata, see:
10
10
  - codeact/events.py (Task*, EpisodicMemoryEvent)
11
11
  """
12
12
 
13
+ import asyncio
13
14
  from typing import Dict, List
14
15
 
15
16
  from llama_index.core.workflow import Event
16
- from pydantic import BaseModel, Field
17
+ from pydantic import BaseModel, ConfigDict, Field
17
18
 
18
19
  from droidrun.agent.context import Task
19
20
 
@@ -46,10 +47,12 @@ class DroidAgentState(BaseModel):
46
47
  """
47
48
  State model for DroidAgent workflow - shared across parent and child workflows.
48
49
  """
49
-
50
+ model_config = ConfigDict(arbitrary_types_allowed=True)
50
51
  # Task context
51
52
  instruction: str = ""
52
-
53
+ # App Cards
54
+ app_card: str = ""
55
+ app_card_loading_task: asyncio.Task[str] | None = None
53
56
  # Formatted device state for prompts (complete text)
54
57
  formatted_device_state: str = ""
55
58
 
@@ -90,52 +90,38 @@ class ExecutorAgent(Workflow): # TODO: Fix a bug in bad prompt
90
90
  subgoal = ev.get("subgoal", "")
91
91
  logger.info(f"🧠 Executor thinking about action for: {subgoal}")
92
92
 
93
- # Format app card (include tags in variable value or empty string)
94
- app_card = "" # TODO: Implement app card retrieval
95
- app_card_text = ""
96
- if app_card.strip():
97
- app_card_text = "App card gives information on how to operate the app and perform actions.\n### App Card ###\n" + app_card.strip() + "\n\n"
98
-
99
- # Format device state (use unified state)
100
- device_state_text = ""
101
- if self.shared_state.formatted_device_state and self.shared_state.formatted_device_state.strip():
102
- device_state_text = "### Device State ###\n" + self.shared_state.formatted_device_state.strip() + "\n\n"
103
-
104
- # Format progress status
105
- progress_status_text = self.shared_state.progress_status + "\n\n" if self.shared_state.progress_status else "No progress yet.\n\n"
106
-
107
- # Format atomic actions
108
- atomic_actions_text = chr(10).join(
109
- f"- {action_name}({', '.join(action_info['arguments'])}): {action_info['description']}"
110
- for action_name, action_info in ATOMIC_ACTION_SIGNATURES.items()
111
- ) + "\n"
112
-
113
- # Format action history
93
+ # Prepare action history as structured data (last 5 actions)
94
+ action_history = []
114
95
  if self.shared_state.action_history:
115
- action_history_text = "Recent actions you took previously and whether they were successful:\n" + "\n".join(
116
- (f"Action: {act} | Description: {summ} | Outcome: Successful" if outcome
117
- else f"Action: {act} | Description: {summ} | Outcome: Failed | Feedback: {err_des}")
96
+ n = min(5, len(self.shared_state.action_history))
97
+ action_history = [
98
+ {
99
+ "action": act,
100
+ "summary": summ,
101
+ "outcome": outcome,
102
+ "error": err_des
103
+ }
118
104
  for act, summ, outcome, err_des in zip(
119
- self.shared_state.action_history[-min(5, len(self.shared_state.action_history)):],
120
- self.shared_state.summary_history[-min(5, len(self.shared_state.action_history)):],
121
- self.shared_state.action_outcomes[-min(5, len(self.shared_state.action_history)):],
122
- self.shared_state.error_descriptions[-min(5, len(self.shared_state.action_history)):], strict=True)
123
- ) + "\n\n"
124
- else:
125
- action_history_text = "No actions have been taken yet.\n\n"
126
-
127
- # Load and format prompt
105
+ self.shared_state.action_history[-n:],
106
+ self.shared_state.summary_history[-n:],
107
+ self.shared_state.action_outcomes[-n:],
108
+ self.shared_state.error_descriptions[-n:],
109
+ strict=True
110
+ )
111
+ ]
112
+
113
+ # Let Jinja2 handle all formatting
128
114
  system_prompt = PromptLoader.load_prompt(
129
115
  self.agent_config.get_executor_system_prompt_path(),
130
116
  {
131
117
  "instruction": self.shared_state.instruction,
132
- "app_card": app_card_text,
133
- "device_state_text": device_state_text,
118
+ "app_card": "", # TODO: Implement app card loader
119
+ "device_state": self.shared_state.formatted_device_state,
134
120
  "plan": self.shared_state.plan,
135
121
  "subgoal": subgoal,
136
- "progress_status": progress_status_text,
137
- "atomic_actions": atomic_actions_text,
138
- "action_history": action_history_text
122
+ "progress_status": self.shared_state.progress_status,
123
+ "atomic_actions": ATOMIC_ACTION_SIGNATURES,
124
+ "action_history": action_history
139
125
  }
140
126
  )
141
127
 
@@ -2,114 +2,6 @@
2
2
  Prompts for the ExecutorAgent.
3
3
  """
4
4
 
5
-
6
- from droidrun.agent.droid.events import DroidAgentState
7
- from droidrun.agent.utils.tools import ATOMIC_ACTION_SIGNATURES
8
-
9
-
10
- def build_executor_system_prompt(
11
- state: "DroidAgentState",
12
- subgoal: str,
13
- app_card: str = "",
14
- ) -> str:
15
- """
16
- Build the complete Executor system prompt with all context.
17
-
18
- Args:
19
- state: Current DroidAgentState with all context
20
- subgoal: Current subgoal to execute
21
- app_card: Optional app-specific instructions
22
-
23
- Returns:
24
- Complete system prompt for the Executor
25
- """
26
- prompt = f"""You are a LOW-LEVEL ACTION EXECUTOR for an Android phone. You do NOT answer questions or provide results. You ONLY perform individual atomic actions as specified in the current subgoal. You are part of a larger system - your job is to execute actions, not to think about or answer the user's original question.
27
-
28
- ### User Request ###
29
- {state.instruction}
30
-
31
- {("App card gives information on how to operate the app and perform actions.\n" + "### App Card ###\n" + app_card.strip() + "\n\n") if app_card.strip() else ""}{(("### Device State ###\n" + state.device_state_text.strip() + "\n\n") if state.device_state_text.strip() else "")}### Overall Plan ###
32
- {state.plan}
33
-
34
- ### Current Subgoal ###
35
- EXECUTE THIS SUBGOAL: {subgoal}
36
-
37
- EXECUTION MODE: You are a dumb robot. Find the exact text/element mentioned in the subgoal above and perform the specified action on it. Do not read anything below this line until after you execute the subgoal.
38
-
39
- ### SUBGOAL PARSING MODE ###
40
- Read the current subgoal exactly as written. Look for:
41
- - Action words: "tap", "click", "swipe", "type", "press", "open" etc.
42
- - Target elements: specific text, buttons, fields, coordinates mentioned
43
- - Locations: "header", "bottom", "left", "right", specific coordinates
44
- Convert directly to atomic action:
45
- - "tap/click" → click action
46
- - "swipe" → swipe action
47
- - "type" → type action
48
- - "press [system button]" → system_button action
49
- - "open [app]" → open_app action
50
- Execute the atomic action for the exact target mentioned. Ignore everything else.
51
-
52
- ### Progress Status ###
53
- {(state.progress_status + "\n\n") if state.progress_status != "" else "No progress yet.\n\n"}
54
-
55
- ### Guidelines ###
56
- General:
57
- - For any pop-up window, such as a permission request, you need to close it (e.g., by clicking `Don't Allow` or `Accept & continue`) before proceeding. Never choose to add any account or log in.
58
- Action Related:
59
- - Use the `open_app` action whenever you want to open an app (nothing will happen if the app is not installed), do not use the app drawer to open an app.
60
- - Consider exploring the screen by using the `swipe` action with different directions to reveal additional content. Or use search to quickly find a specific entry, if applicable.
61
- - If you cannot change the page content by swiping in the same direction continuously, the page may have been swiped to the bottom. Please try another operation to display more content.
62
- - For some horizontally distributed tags, you can swipe horizontally to view more.
63
- Text Related Operations:
64
- - Activated input box: If an input box is activated, it may have a cursor inside it and the keyboard is visible. If there is no cursor on the screen but the keyboard is visible, it may be because the cursor is blinking. The color of the activated input box will be highlighted. If you are not sure whether the input box is activated, click it before typing.
65
- - To input some text: first click the input box that you want to input, make sure the correct input box is activated and the keyboard is visible, then use `type` action to enter the specified text.
66
- - To clear the text: long press the backspace button in the keyboard.
67
- - To copy some text: first long press the text you want to copy, then click the `copy` button in bar.
68
- - To paste text into a text box: first long press the text box, then click the `paste` button in bar.
69
-
70
- ---
71
- Execute the current subgoal mechanically. Do NOT examine the screen content or make decisions about what you see. Parse the current subgoal text to identify the required action and execute it exactly as written. You must choose your action from one of the atomic actions.
72
-
73
- #### Atomic Actions ####
74
- The atomic action functions are listed in the format of `action(arguments): description` as follows:
75
- {chr(10).join(f"- {action_name}({', '.join(action_info['arguments'])}): {action_info['description']}" for action_name, action_info in ATOMIC_ACTION_SIGNATURES.items())}
76
- \n
77
- ### Latest Action History ###
78
- {(("Recent actions you took previously and whether they were successful:\n" + "\n".join(
79
- (f"Action: {act} | Description: {summ} | Outcome: Successful" if outcome
80
- else f"Action: {act} | Description: {summ} | Outcome: Failed | Feedback: {err_des}")
81
- for act, summ, outcome, err_des in zip(
82
- state.action_history[-min(5, len(state.action_history)):],
83
- state.summary_history[-min(5, len(state.action_history)):],
84
- state.action_outcomes[-min(5, len(state.action_history)):],
85
- state.error_descriptions[-min(5, len(state.action_history)):], strict=True)
86
- ) + "\n\n")) if state.action_history else "No actions have been taken yet.\n\n"}
87
-
88
- ---
89
- ### LITERAL EXECUTION RULE ###
90
- Whatever the current subgoal says to do, do that EXACTLY. Do not substitute with what you think is better. Do not optimize. Do not consider screen state. Parse the subgoal text literally and execute the matching atomic action.
91
-
92
- IMPORTANT:
93
- 1. Do NOT repeat previously failed actions multiple times. Try changing to another action.
94
- 2. Must do the current subgoal.
95
-
96
- Provide your output in the following format, which contains three parts:
97
-
98
- ### Thought ###
99
- Break down the current subgoal into: (1) What atomic action is required? (2) What target/location is specified? (3) What parameters do I need? Do NOT reason about whether this makes sense - just mechanically convert the subgoal text into the appropriate action format.
100
-
101
- ### Action ###
102
- Choose only one action or shortcut from the options provided.
103
- You must provide your decision using a valid JSON format specifying the `action` and the arguments of the action. For example, if you want to open an App, you should write {{ "action":"open_app", "text": "app name" }}.
104
-
105
- ### Description ###
106
- A brief description of the chosen action. Do not describe expected outcome.
107
- """
108
-
109
-
110
- return prompt
111
-
112
-
113
5
  def parse_executor_response(response: str) -> dict:
114
6
  """
115
7
  Parse the Executor LLM response.
@@ -3,7 +3,7 @@ Manager Agent - Planning and reasoning workflow.
3
3
  """
4
4
 
5
5
  from droidrun.agent.droid.events import ManagerInputEvent, ManagerPlanEvent
6
- from droidrun.agent.manager.events import ManagerThinkingEvent, ManagerInternalPlanEvent
6
+ from droidrun.agent.manager.events import ManagerInternalPlanEvent, ManagerThinkingEvent
7
7
  from droidrun.agent.manager.manager_agent import ManagerAgent
8
8
  from droidrun.agent.manager.prompts import parse_manager_response
9
9
 
@@ -10,6 +10,7 @@ This agent is responsible for:
10
10
 
11
11
  from __future__ import annotations
12
12
 
13
+ import asyncio
13
14
  import logging
14
15
  from typing import TYPE_CHECKING
15
16
 
@@ -23,13 +24,18 @@ from droidrun.agent.utils.chat_utils import remove_empty_messages
23
24
  from droidrun.agent.utils.device_state_formatter import format_device_state
24
25
  from droidrun.agent.utils.inference import acall_with_retries
25
26
  from droidrun.agent.utils.tools import build_custom_tool_descriptions
27
+ from droidrun.app_cards.app_card_provider import AppCardProvider
28
+ from droidrun.app_cards.providers import (
29
+ CompositeAppCardProvider,
30
+ LocalAppCardProvider,
31
+ ServerAppCardProvider,
32
+ )
26
33
  from droidrun.config_manager.prompt_loader import PromptLoader
27
- from droidrun.config_manager.app_card_loader import AppCardLoader
28
34
 
29
35
  if TYPE_CHECKING:
30
36
  from droidrun.agent.droid.events import DroidAgentState
31
- from droidrun.tools import Tools
32
37
  from droidrun.config_manager.config_manager import AgentConfig
38
+ from droidrun.tools import Tools
33
39
 
34
40
 
35
41
  logger = logging.getLogger("droidrun")
@@ -63,33 +69,75 @@ class ManagerAgent(Workflow):
63
69
  self.shared_state = shared_state
64
70
  self.custom_tools = custom_tools or {}
65
71
  self.agent_config = agent_config
66
- self.app_card_loader = self.agent_config.app_cards
72
+ self.app_card_config = self.agent_config.app_cards
73
+
74
+ # Initialize app card provider based on mode
75
+ self.app_card_provider: AppCardProvider = self._initialize_app_card_provider()
67
76
 
68
77
  logger.info("✅ ManagerAgent initialized successfully.")
69
78
 
79
+ def _initialize_app_card_provider(self) -> AppCardProvider:
80
+ """Initialize app card provider based on configuration mode."""
81
+ if not self.app_card_config.enabled:
82
+ # Return a dummy provider that always returns empty string
83
+ class DisabledProvider(AppCardProvider):
84
+ async def load_app_card(self, package_name: str, instruction: str = "") -> str:
85
+ return ""
86
+ return DisabledProvider()
87
+
88
+ mode = self.app_card_config.mode.lower()
89
+
90
+ if mode == "local":
91
+ logger.info(f"Initializing local app card provider (dir: {self.app_card_config.app_cards_dir})")
92
+ return LocalAppCardProvider(app_cards_dir=self.app_card_config.app_cards_dir)
93
+
94
+ elif mode == "server":
95
+ if not self.app_card_config.server_url:
96
+ logger.warning("Server mode enabled but no server_url configured, falling back to local")
97
+ return LocalAppCardProvider(app_cards_dir=self.app_card_config.app_cards_dir)
98
+
99
+ logger.info(f"Initializing server app card provider (url: {self.app_card_config.server_url})")
100
+ return ServerAppCardProvider(
101
+ server_url=self.app_card_config.server_url,
102
+ timeout=self.app_card_config.server_timeout,
103
+ max_retries=self.app_card_config.server_max_retries,
104
+ )
105
+
106
+ elif mode == "composite":
107
+ if not self.app_card_config.server_url:
108
+ logger.warning("Composite mode enabled but no server_url configured, falling back to local")
109
+ return LocalAppCardProvider(app_cards_dir=self.app_card_config.app_cards_dir)
110
+
111
+ logger.info(
112
+ f"Initializing composite app card provider "
113
+ f"(server: {self.app_card_config.server_url}, local: {self.app_card_config.app_cards_dir})"
114
+ )
115
+ return CompositeAppCardProvider(
116
+ server_url=self.app_card_config.server_url,
117
+ app_cards_dir=self.app_card_config.app_cards_dir,
118
+ server_timeout=self.app_card_config.server_timeout,
119
+ server_max_retries=self.app_card_config.server_max_retries,
120
+ )
121
+
122
+ else:
123
+ logger.warning(f"Unknown app_card mode '{mode}', falling back to local")
124
+ return LocalAppCardProvider(app_cards_dir=self.app_card_config.app_cards_dir)
125
+
70
126
  # ========================================================================
71
127
  # Helper Methods
72
128
  # ========================================================================
73
129
 
74
130
  def _build_system_prompt(
75
131
  self,
76
- has_text_to_modify: bool,
77
- app_card: str = ""
132
+ has_text_to_modify: bool
78
133
  ) -> str:
79
- """
80
- Build system prompt with all context.
134
+ """Build system prompt with all context."""
81
135
 
82
- Args:
83
- has_text_to_modify: Whether text manipulation mode is enabled
84
- app_card: App card content
85
- Returns:
86
- Complete system prompt
87
- """
88
- # Format error history
89
- error_history_text = ""
136
+ # Prepare error history as structured data (if needed)
137
+ error_history = None
90
138
  if self.shared_state.error_flag_plan:
91
139
  k = self.shared_state.err_to_manager_thresh
92
- errors = [
140
+ error_history = [
93
141
  {
94
142
  "action": act,
95
143
  "summary": summ,
@@ -98,78 +146,22 @@ class ManagerAgent(Workflow):
98
146
  for act, summ, err_des in zip(
99
147
  self.shared_state.action_history[-k:],
100
148
  self.shared_state.summary_history[-k:],
101
- self.shared_state.error_descriptions[-k:], strict=True
149
+ self.shared_state.error_descriptions[-k:],
150
+ strict=True
102
151
  )
103
152
  ]
104
- error_history_text = (
105
- "<potentially_stuck>\n"
106
- "You have encountered several failed attempts. Here are some logs:\n"
107
- )
108
- for error in errors:
109
- error_history_text += (
110
- f"- Attempt: Action: {error['action']} | "
111
- f"Description: {error['summary']} | "
112
- f"Outcome: Failed | "
113
- f"Feedback: {error['error']}\n"
114
- )
115
- error_history_text += "</potentially_stuck>\n\n"
116
-
117
- # Text manipulation section
118
- text_manipulation_section = ""
119
- if has_text_to_modify:
120
- text_manipulation_section = """
121
-
122
- <text_manipulation>
123
- 1. Use **TEXT_TASK:** prefix in your plan when you need to modify text in the currently focused text input field
124
- 2. TEXT_TASK is for editing, formatting, or transforming existing text content in text boxes using Python code
125
- 3. Do not use TEXT_TASK for extracting text from messages, typing new text, or composing messages
126
- 4. The focused text field contains editable text that you can modify
127
- 5. Example plan item: 'TEXT_TASK: Add "Hello World" at the beginning of the text'
128
- 6. Always use TEXT_TASK for modifying text, do not try to select the text to copy/cut/paste or adjust the text
129
- </text_manipulation>"""
130
-
131
- # Device date (include tags in variable value or empty string)
132
- device_date = self.tools_instance.get_date()
133
- device_date_text = ""
134
- if device_date.strip():
135
- device_date_text = f"<device_date>\n{device_date}\n</device_date>\n\n"
136
-
137
- # App card (include tags in variable value or empty string)
138
- app_card = app_card
139
- app_card_text = ""
140
- if app_card.strip():
141
- app_card_text = "App card gives information on how to operate the app and perform actions.\n<app_card>\n" + app_card.strip() + "\n</app_card>\n\n"
142
-
143
- # Important notes (include tags in variable value or empty string)
144
- important_notes = "" # TODO: implement
145
- important_notes_text = ""
146
- if important_notes.strip():
147
- important_notes_text = "<important_notes>\n" + important_notes + "\n</important_notes>\n\n"
148
-
149
- # Custom tools
150
- custom_tools_desc = build_custom_tool_descriptions(self.custom_tools)
151
- custom_tools_text = ""
152
- if custom_tools_desc.strip():
153
- custom_tools_text = """
154
-
155
- <custom_actions>
156
- The executor has access to these additional custom actions beyond the standard actions (click, type, swipe, etc.):
157
- """ + custom_tools_desc + """
158
-
159
- You can reference these custom actions or tell the Executer agent to use them in your plan when they help achieve the user's goal.
160
- </custom_actions>"""
161
-
162
- # Load and format prompt
153
+
154
+ # Let Jinja2 handle all formatting and conditionals
163
155
  return PromptLoader.load_prompt(
164
156
  self.agent_config.get_manager_system_prompt_path(),
165
157
  {
166
158
  "instruction": self.shared_state.instruction,
167
- "device_date": device_date_text,
168
- "app_card": app_card_text,
169
- "important_notes": important_notes_text,
170
- "error_history": error_history_text,
171
- "text_manipulation_section": text_manipulation_section,
172
- "custom_tools_descriptions": custom_tools_text
159
+ "device_date": self.tools_instance.get_date(),
160
+ "app_card": self.shared_state.app_card,
161
+ "important_notes": "", # TODO: implement
162
+ "error_history": error_history,
163
+ "text_manipulation_enabled": has_text_to_modify,
164
+ "custom_tools_descriptions": build_custom_tool_descriptions(self.custom_tools)
173
165
  }
174
166
  )
175
167
 
@@ -339,7 +331,17 @@ You can reference these custom actions or tell the Executer agent to use them in
339
331
  self.shared_state.current_package_name = phone_state.get('packageName', 'Unknown')
340
332
  self.shared_state.current_app_name = phone_state.get('currentApp', 'Unknown')
341
333
 
342
- # App cards
334
+ # ====================================================================
335
+ # Step 1.5: Start loading app card in background
336
+ # ====================================================================
337
+ if self.app_card_config.enabled:
338
+ loading_task = asyncio.create_task(
339
+ self.app_card_provider.load_app_card(
340
+ package_name=self.shared_state.current_package_name,
341
+ instruction=self.shared_state.instruction
342
+ )
343
+ )
344
+ self.shared_state.app_card_loading_task = loading_task
343
345
 
344
346
  # ====================================================================
345
347
  # Step 2: Capture screenshot if vision enabled
@@ -417,15 +419,30 @@ You can reference these custom actions or tell the Executer agent to use them in
417
419
 
418
420
  has_text_to_modify = self.shared_state.has_text_to_modify
419
421
  screenshot = self.shared_state.screenshot
420
- if self.app_card_loader.enabled:
421
- app_card = AppCardLoader.load_app_card(self.shared_state.current_package_name, self.app_card_loader.app_cards_dir)
422
+
423
+ # ====================================================================
424
+ # Try to get app card from previous iteration's loading task
425
+ # ====================================================================
426
+ if self.app_card_config.enabled and self.shared_state.app_card_loading_task:
427
+ try:
428
+ # Wait briefly for the background task to complete (0.1s timeout)
429
+ self.shared_state.app_card = await asyncio.wait_for(
430
+ self.shared_state.app_card_loading_task,
431
+ timeout=0.1
432
+ )
433
+ except asyncio.TimeoutError:
434
+ # Task not ready yet, use empty string
435
+ self.shared_state.app_card = ""
436
+ except Exception as e:
437
+ logger.warning(f"Error getting app card: {e}")
438
+ self.shared_state.app_card = ""
422
439
  else:
423
- app_card = ""
440
+ self.shared_state.app_card = ""
424
441
 
425
442
  # ====================================================================
426
443
  # Step 1: Build system prompt
427
444
  # ====================================================================
428
- system_prompt = self._build_system_prompt(has_text_to_modify, app_card)
445
+ system_prompt = self._build_system_prompt(has_text_to_modify)
429
446
 
430
447
  # ====================================================================
431
448
  # Step 2: Build messages with context