droidrun 0.3.8__py3-none-any.whl → 0.3.10.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- droidrun/__init__.py +2 -3
- droidrun/__main__.py +1 -1
- droidrun/agent/__init__.py +1 -1
- droidrun/agent/codeact/__init__.py +1 -4
- droidrun/agent/codeact/codeact_agent.py +112 -48
- droidrun/agent/codeact/events.py +6 -3
- droidrun/agent/codeact/prompts.py +2 -2
- droidrun/agent/common/constants.py +2 -0
- droidrun/agent/common/events.py +5 -3
- droidrun/agent/context/__init__.py +1 -3
- droidrun/agent/context/agent_persona.py +2 -1
- droidrun/agent/context/context_injection_manager.py +6 -6
- droidrun/agent/context/episodic_memory.py +5 -3
- droidrun/agent/context/personas/__init__.py +3 -3
- droidrun/agent/context/personas/app_starter.py +3 -3
- droidrun/agent/context/personas/big_agent.py +3 -3
- droidrun/agent/context/personas/default.py +3 -3
- droidrun/agent/context/personas/ui_expert.py +5 -5
- droidrun/agent/context/task_manager.py +15 -17
- droidrun/agent/droid/__init__.py +1 -1
- droidrun/agent/droid/droid_agent.py +327 -182
- droidrun/agent/droid/events.py +91 -9
- droidrun/agent/executor/__init__.py +13 -0
- droidrun/agent/executor/events.py +24 -0
- droidrun/agent/executor/executor_agent.py +327 -0
- droidrun/agent/executor/prompts.py +136 -0
- droidrun/agent/manager/__init__.py +18 -0
- droidrun/agent/manager/events.py +20 -0
- droidrun/agent/manager/manager_agent.py +459 -0
- droidrun/agent/manager/prompts.py +223 -0
- droidrun/agent/oneflows/app_starter_workflow.py +118 -0
- droidrun/agent/oneflows/text_manipulator.py +204 -0
- droidrun/agent/planner/__init__.py +3 -3
- droidrun/agent/planner/events.py +6 -3
- droidrun/agent/planner/planner_agent.py +60 -53
- droidrun/agent/planner/prompts.py +2 -2
- droidrun/agent/usage.py +15 -13
- droidrun/agent/utils/__init__.py +11 -1
- droidrun/agent/utils/async_utils.py +2 -1
- droidrun/agent/utils/chat_utils.py +48 -60
- droidrun/agent/utils/device_state_formatter.py +177 -0
- droidrun/agent/utils/executer.py +13 -12
- droidrun/agent/utils/inference.py +114 -0
- droidrun/agent/utils/llm_picker.py +2 -0
- droidrun/agent/utils/message_utils.py +85 -0
- droidrun/agent/utils/tools.py +220 -0
- droidrun/agent/utils/trajectory.py +8 -7
- droidrun/cli/__init__.py +1 -1
- droidrun/cli/logs.py +29 -28
- droidrun/cli/main.py +279 -143
- droidrun/config_manager/__init__.py +25 -0
- droidrun/config_manager/config_manager.py +583 -0
- droidrun/macro/__init__.py +2 -2
- droidrun/macro/__main__.py +1 -1
- droidrun/macro/cli.py +36 -34
- droidrun/macro/replay.py +7 -9
- droidrun/portal.py +1 -1
- droidrun/telemetry/__init__.py +2 -2
- droidrun/telemetry/events.py +3 -4
- droidrun/telemetry/phoenix.py +173 -0
- droidrun/telemetry/tracker.py +7 -5
- droidrun/tools/__init__.py +1 -1
- droidrun/tools/adb.py +210 -82
- droidrun/tools/ios.py +7 -5
- droidrun/tools/tools.py +25 -8
- {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/METADATA +13 -7
- droidrun-0.3.10.dev2.dist-info/RECORD +70 -0
- droidrun/agent/common/default.py +0 -5
- droidrun/agent/context/reflection.py +0 -20
- droidrun/agent/oneflows/reflector.py +0 -265
- droidrun-0.3.8.dist-info/RECORD +0 -55
- {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/WHEEL +0 -0
- {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/entry_points.txt +0 -0
- {droidrun-0.3.8.dist-info → droidrun-0.3.10.dev2.dist-info}/licenses/LICENSE +0 -0
droidrun/agent/oneflows/app_starter_workflow.py
ADDED
@@ -0,0 +1,118 @@
+"""
+Simple workflow to open an app based on a description.
+"""
+
+import json
+
+from workflows import Context, Workflow, step
+from workflows.events import StartEvent, StopEvent
+
+from droidrun.tools.tools import Tools
+
+
+class AppStarter(Workflow):
+    """
+    A simple workflow that opens an app based on a description.
+
+    The workflow uses an LLM to intelligently match the app description
+    to an installed app's package name, then opens it.
+    """
+
+    def __init__(self, tools: Tools, llm, timeout: int = 60, **kwargs):
+        """
+        Initialize the OpenAppWorkflow.
+
+        Args:
+            tools: An instance of Tools (e.g., AdbTools) to interact with the device
+            llm: An LLM instance (e.g., OpenAI) to determine which app to open
+            timeout: Workflow timeout in seconds (default: 60)
+            **kwargs: Additional arguments passed to Workflow
+        """
+        super().__init__(timeout=timeout, **kwargs)
+        self.tools = tools
+        self.llm = llm
+
+    @step
+    async def open_app_step(self, ev: StartEvent, ctx: Context) -> StopEvent:
+        """
+        Opens an app based on the provided description.
+
+        Expected StartEvent attributes:
+        - app_description (str): The name or description of the app to open
+
+        Returns:
+            StopEvent with the result of the open_app operation
+        """
+        app_description = ev.app_description
+
+        # Get list of installed apps
+        apps = self.tools.get_apps(include_system=True)
+
+        # Format apps list for LLM
+        apps_list = "\n".join([
+            f"- {app['label']} (package: {app['package']})"
+            for app in apps
+        ])
+
+        # Construct prompt for LLM
+        prompt = f"""Given the following list of installed apps and a user's description, determine which app package name to open.
+
+Installed Apps:
+{apps_list}
+
+User's Request: "{app_description}"
+
+Return ONLY a JSON object with the following structure:
+{{
+    "package": "com.example.package"
+}}
+
+Choose the most appropriate app based on the description. Return the package name of the best match."""
+
+        # Get LLM response
+        response = await self.llm.acomplete(prompt)
+        response_text = str(response).strip()
+
+        # Parse JSON response - extract content between { and }
+        try:
+            start = response_text.find("{")
+            end = response_text.rfind("}") + 1
+            json_str = response_text[start:end]
+            result_json = json.loads(json_str)
+            package_name = result_json["package"]
+        except (json.JSONDecodeError, KeyError, ValueError) as e:
+            return StopEvent(result=f"Error parsing LLM response: {e}. Response: {response_text}")
+
+        # Open the selected app using the package name
+        result = self.tools.start_app(package_name)
+
+        return StopEvent(result=result)
+
+
+# Example usage
+async def main():
+    """
+    Example of how to use the OpenAppWorkflow.
+    """
+    from llama_index.llms.openai import OpenAI
+
+    from droidrun.tools.adb import AdbTools
+
+    # Initialize tools with device serial (None for default device)
+    tools = AdbTools(serial=None)
+
+    # Initialize LLM
+    llm = OpenAI(model="gpt-4o-mini")
+
+    # Create workflow instance
+    workflow = AppStarter(tools=tools, llm=llm, timeout=60, verbose=True)
+
+    # Run workflow to open an app
+    result = await workflow.run(app_description="Settings")
+
+    print(f"Result: {result}")
+
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
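
Note: open_app_step above does not require the model to return pure JSON; it slices from the first "{" to the last "}" before parsing. A minimal sketch of that tolerance (the sample reply is made up for illustration):

import json

reply = 'Sure! Here you go: {"package": "com.android.settings"} Hope that helps.'
start = reply.find("{")
end = reply.rfind("}") + 1  # rfind tolerates trailing prose after the JSON
result = json.loads(reply[start:end])
assert result["package"] == "com.android.settings"
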
droidrun/agent/oneflows/text_manipulator.py
ADDED
@@ -0,0 +1,204 @@
+'''CodeAct-style agent for text manipulation via constrained Python execution.
+
+This agent receives two inputs:
+- current_text: the current content of the focused text box
+- task_instruction: a natural language instruction describing how to modify the text
+
+It asks an LLM to produce Python code that:
+- Uses ONLY a single provided function: input_text(text: str)
+- Constructs the final text to type as a triple-quoted big string, assigned
+  to a variable of the model's choice (e.g., new_text = """...""")
+- May reference the predefined variable ORIGINAL which contains the current text
+  from the text box
+- Calls input_text(new_text) exactly once to clear the field and input the new text
+
+The produced code is executed in a restricted sandbox exposing ONLY:
+- ORIGINAL: str (the original text content)
+- input_text: function (captures the final text; semantically clears and types)
+
+If the generated code produces execution errors, the agent automatically sends the
+stack trace back to the LLM for correction, with up to 3 retry attempts by default.
+This enables iterative refinement of the generated code.
+
+The agent returns the final text that should be entered into the text box and the
+raw code produced by the model (potentially after corrections).
+'''
+
+
+import traceback
+
+from llama_index.core.llms import ChatMessage
+from llama_index.core.llms.llm import LLM
+
+from droidrun.agent.utils.inference import call_with_retries
+from droidrun.telemetry.phoenix import clean_span
+
+
+@clean_span("text_manipulator")
+def run_text_manipulation_agent(instruction: str, current_subgoal: str, current_text: str, overall_plan, hitorical_plan, llm: LLM, max_retries: int = 4) -> tuple[str, str]:
+    """Convenience function to run CodeAct text manipulation with error correction.
+
+    Args:
+        instruction: User's overall instruction
+        current_subgoal: Current subgoal to accomplish
+        current_text: The current content of the focused text field
+        overall_plan: Overall plan context
+        hitorical_plan: Historical progress
+        llm: LLM instance to use for text manipulation
+        max_retries: Maximum number of retry attempts if code execution fails
+
+    Returns:
+        Tuple of (final_text, raw_code) - the final text to input and the generated code
+    """
+    system_prompt = (
+        "You are CODEACT_TEXT_AGENT, a constrained Python code generator for editing text in an Android text box.\n"
+        "You will be given: (1) the current text in the focused text box as ORIGINAL, and (2) a TASK that describes how to modify it.\n\n"
+        "Your job is to output ONLY a single Python code block in ```python format that:\n"
+        "- Defines NO new functions, classes, or imports.\n"
+        "- Uses ONLY the provided function input_text(text: str).\n"
+        "- Builds the final content in a triple-quoted big string assigned to a variable of your choice, e.g.:\n"
+        "  new_text = \"\"\"...\"\"\"\n"
+        "- Includes ORIGINAL in the new_text if needed to fulfill the TASK.\n"
+        "- Calls input_text(new_text) exactly once to clear the field and input the new content.\n\n"
+        "STRICT FORMAT RULES:\n"
+        "- Respond with ONLY a fenced Python code block: ```python\n<code>\n```\n"
+        "- Do NOT print anything. Do NOT use input().\n"
+        "- Do NOT import any modules. Do NOT define additional functions or classes.\n"
+        "- Do NOT access files, network, or system.\n"
+        "If you are unsure about the ORIGINAL, use it by referencing ORIGINAL variable so you dont make mistake with white space or new line characters\n"
+        "below is ORIGINAL use it by referencing ORIGINAL variable or directly typing it out:\n<ORIGINAL>\n{current_text}\n</ORIGINAL>\n"
+        f"""
+<user_request>
+{instruction}
+</user_request>
+<overall_plan>
+{overall_plan}
+</overall_plan>
+<progress_status>
+{hitorical_plan}
+</progress_status>
+<current_subgoal>
+{current_subgoal}
+</current_subgoal>
+"""
+    )
+
+    error_correction_prompt = (
+        "You are CODEACT_TEXT_AGENT, correcting your previous code that had execution errors.\n\n"
+        "The code you generated previously failed with this error:\n{error_message}\n\n"
+        "Please fix the code and output ONLY a new Python code block in ```python format.\n"
+        "Follow the same rules as before:\n"
+        "- Use ONLY the provided function input_text(text: str)\n"
+        "- Build the final content in a triple-quoted big string\n"
+        "- Include ORIGINAL in the new_text if needed\n"
+        "- Call input_text(new_text) exactly once\n"
+        "- Respond with ONLY a fenced Python code block\n"
+        "If you are unsure about the ORIGINAL, use it by referencing ORIGINAL variable so you dont make mistake with white space or new line characters"
+        "below is ORIGINAL use it by referencing ORIGINAL variable or directly typing it out:\n<ORIGINAL>{current_text}</ORIGINAL>\n"
+    )
+
+    user_prompt = (
+        "TASK:\n{task_instruction}\n\n"
+        "CURRENT TEXT (ORIGINAL):\n{current_text}\n\n"
+        "Write the Python code now."
+    ).format(
+        task_instruction=current_subgoal.strip(),
+        current_text=current_text,
+    )
+
+    messages = [ChatMessage(role="system", content=system_prompt.format(overall_plan=overall_plan, hitorical_plan=hitorical_plan, current_subgoal=current_subgoal, instruction=instruction, current_text=current_text)), ChatMessage(role="user", content=user_prompt)]
+
+    for attempt in range(max_retries + 1):  # +1 for initial attempt
+        # Call the LLM with current messages
+        response_message = call_with_retries(llm, messages).message
+        content = response_message.content
+        messages.append(response_message)
+
+        # Extract code from ```python blocks
+        code = _extract_python_code(content)
+        if not code:
+            # Fallback: if no code block found, use entire response as code
+            code = content.strip()
+
+        # Execute the code in a sandbox
+        final_text, error_message = _execute_sandbox(code, current_text)
+
+        # If successful (no error), return the result
+        if not error_message:
+            return final_text, code
+
+        # If this was the last attempt, return what we have
+        if attempt == max_retries:
+            return final_text, code
+
+        # Add error correction message to conversation
+        correction_message = error_correction_prompt.format(error_message=error_message)
+        messages.append(ChatMessage(role="user", content=correction_message))
+
+    # This should never be reached, but just in case
+    return current_text, ""
+
+
+def _extract_python_code(text: str) -> str:
+    """Extract Python code from ```python fenced blocks using simple string operations."""
+    if not text:
+        return ""
+
+    # Try different variations of code block markers
+    patterns = [
+        # ```python with newlines
+        ("```python\n", "\n```"),
+        # ```python without newlines
+        ("```python", "```"),
+        # Generic ``` with newlines
+        ("```\n", "\n```"),
+        # Generic ``` without newlines
+        ("```", "```"),
+    ]
+
+    for start_marker, end_marker in patterns:
+        if start_marker in text and end_marker in text:
+            # Find the start position after the marker
+            start_idx = text.find(start_marker) + len(start_marker)
+            # Find the end position before the marker
+            end_idx = text.find(end_marker, start_idx)
+            if end_idx != -1:
+                code = text[start_idx:end_idx].strip()
+                # Only return if we actually extracted some code
+                if code:
+                    return code
+
+    return ""
+
+
+def _execute_sandbox(code: str, original_text: str) -> tuple[str, str]:
+    """Execute model code in a locked-down environment with exec().
+
+    Returns:
+        Tuple of (result_text, error_message) - result_text is the final text if successful,
+        or original_text if failed. error_message is the stack trace if execution failed,
+        or empty string if successful.
+    """
+    if not code:
+        return original_text, ""
+
+    captured = {"value": None}
+
+    def input_text(text: str) -> None:
+        """Capture the final text to be input."""
+        captured["value"] = text
+
+    # Create restricted environment
+    sandbox_globals = {
+        "__builtins__": {},  # Empty builtins for security
+        "input_text": input_text,
+        "ORIGINAL": original_text
+    }
+    sandbox_locals = {}
+
+    try:
+        exec(code, sandbox_globals, sandbox_locals)
+        return captured["value"] if captured["value"] is not None else original_text, ""
+    except Exception:
+        error_message = traceback.format_exc()
+        return original_text, error_message
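
Note: a self-contained sketch of the sandbox contract that _execute_sandbox above enforces: model code runs under exec() with empty __builtins__ and only ORIGINAL and input_text in scope. The model_code string here is hypothetical, not real model output:

model_code = 'new_text = ORIGINAL + "\\n-- edited"\ninput_text(new_text)'

captured = {"value": None}

def input_text(text):
    captured["value"] = text  # captures the text instead of typing on a device

exec(model_code, {"__builtins__": {}, "input_text": input_text, "ORIGINAL": "Hello"}, {})
assert captured["value"] == "Hello\n-- edited"
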
droidrun/agent/planner/__init__.py
CHANGED
@@ -1,13 +1,13 @@
 from droidrun.agent.planner.planner_agent import PlannerAgent
 from droidrun.agent.planner.prompts import (
     DEFAULT_PLANNER_SYSTEM_PROMPT,
+    DEFAULT_PLANNER_TASK_FAILED_PROMPT,
     DEFAULT_PLANNER_USER_PROMPT,
-    DEFAULT_PLANNER_TASK_FAILED_PROMPT
 )
 
 __all__ = [
-    "PlannerAgent",
+    "PlannerAgent",
     "DEFAULT_PLANNER_SYSTEM_PROMPT",
     "DEFAULT_PLANNER_USER_PROMPT",
     "DEFAULT_PLANNER_TASK_FAILED_PROMPT"
-]
+]
droidrun/agent/planner/events.py
CHANGED
@@ -1,16 +1,19 @@
-from
+from typing import Optional
+
 from llama_index.core.base.llms.types import ChatMessage
-from
+from llama_index.core.workflow import Event
+
 from droidrun.agent.context import Task
 from droidrun.agent.usage import UsageResult
 
+
 class PlanInputEvent(Event):
     input: list[ChatMessage]
 
 
 class PlanThinkingEvent(Event):
     thoughts: Optional[str] = None
-    code: Optional[str] = None
+    code: Optional[str] = None
     usage: Optional[UsageResult] = None
 
 
droidrun/agent/planner/planner_agent.py
CHANGED
@@ -1,33 +1,31 @@
-from droidrun.agent.planner.events import *
-from droidrun.agent.planner.prompts import (
-    DEFAULT_PLANNER_SYSTEM_PROMPT,
-    DEFAULT_PLANNER_USER_PROMPT,
-)
-import logging
 import asyncio
-
-import
+import logging
+from typing import TYPE_CHECKING, List, Union
+
+from dotenv import load_dotenv
 from llama_index.core.base.llms.types import ChatMessage, ChatResponse
-from llama_index.core.prompts import PromptTemplate
 from llama_index.core.llms.llm import LLM
-from llama_index.core.workflow import Workflow, StartEvent, StopEvent, Context, step
 from llama_index.core.memory import Memory
-from llama_index.core.
-from
-
-from droidrun.agent.
+from llama_index.core.prompts import PromptTemplate
+from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step
+
+from droidrun.agent.common.constants import LLM_HISTORY_LIMIT
+from droidrun.agent.common.events import RecordUIStateEvent, ScreenshotEvent
+from droidrun.agent.context.agent_persona import AgentPersona
 from droidrun.agent.context.task_manager import TaskManager
-from droidrun.tools import Tools
-from droidrun.agent.common.events import ScreenshotEvent, RecordUIStateEvent
 from droidrun.agent.planner.events import (
-    PlanInputEvent,
     PlanCreatedEvent,
+    PlanInputEvent,
     PlanThinkingEvent,
 )
-from droidrun.agent.
-
-
-
+from droidrun.agent.planner.prompts import (
+    DEFAULT_PLANNER_SYSTEM_PROMPT,
+    DEFAULT_PLANNER_USER_PROMPT,
+)
+from droidrun.agent.usage import get_usage_from_response
+from droidrun.agent.utils import chat_utils
+from droidrun.agent.utils.executer import SimpleCodeExecutor
+from droidrun.tools import Tools
 
 load_dotenv()
 
@@ -63,7 +61,6 @@ class PlannerAgent(Workflow):
 
         self.chat_memory = None
         self.remembered_info = None
-        self.reflection: Reflection = None
 
         self.current_retry = 0
         self.steps_counter = 0
@@ -97,7 +94,7 @@ class PlannerAgent(Workflow):
     async def prepare_chat(self, ctx: Context, ev: StartEvent) -> PlanInputEvent:
        logger.info("💬 Preparing planning session...")
 
-        self.chat_memory: Memory = await ctx.get(
+        self.chat_memory: Memory = await ctx.store.get(
             "chat_memory", default=Memory.from_defaults()
         )
         await self.chat_memory.aput(self.user_message)
@@ -105,15 +102,10 @@ class PlannerAgent(Workflow):
         if ev.remembered_info:
             self.remembered_info = ev.remembered_info
 
-        if ev.reflection:
-            self.reflection = ev.reflection
-        else:
-            self.reflection = None
-
         assert len(self.chat_memory.get_all()) > 0 or self.user_prompt, "Memory input, user prompt or user input cannot be empty."
-
+
         await self.chat_memory.aput(ChatMessage(role="user", content=PromptTemplate(self.user_prompt or DEFAULT_PLANNER_USER_PROMPT.format(goal=self.goal))))
-
+
         input_messages = self.chat_memory.get_all()
         logger.debug(f" - Memory contains {len(input_messages)} messages")
         return PlanInputEvent(input=input_messages)
@@ -129,24 +121,23 @@ class PlannerAgent(Workflow):
         ctx.write_event_to_stream(ev)
 
         self.steps_counter += 1
-        logger.info(
+        logger.info("🧠 Thinking about how to plan the goal...")
 
         if self.vision:
             screenshot = (self.tools_instance.take_screenshot())[1]
             ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
-            await ctx.set("screenshot", screenshot)
+            await ctx.store.set("screenshot", screenshot)
 
         try:
             state = self.tools_instance.get_state()
-            await ctx.set("ui_state", state["a11y_tree"])
-            await ctx.set("phone_state", state["phone_state"])
+            await ctx.store.set("ui_state", state["a11y_tree"])
+            await ctx.store.set("phone_state", state["phone_state"])
             ctx.write_event_to_stream(RecordUIStateEvent(ui_state=state["a11y_tree"]))
-        except Exception
-            logger.warning(
+        except Exception:
+            logger.warning("⚠️ Error retrieving state from the connected device. Is the Accessibility Service enabled?")
 
 
-        await ctx.set("remembered_info", self.remembered_info)
-        await ctx.set("reflection", self.reflection)
+        await ctx.store.set("remembered_info", self.remembered_info)
 
         response = await self._get_llm_response(ctx, chat_history)
         try:
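
Note: the hunk above is part of a release-wide move from the deprecated Context.get/Context.set calls to the Context.store API of llama_index workflows. A minimal sketch of the new pattern, using a hypothetical one-step workflow:

from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step

class StateDemo(Workflow):
    @step
    async def remember(self, ctx: Context, ev: StartEvent) -> StopEvent:
        await ctx.store.set("ui_state", "<a11y tree>")          # was: await ctx.set(...)
        state = await ctx.store.get("ui_state", default=None)   # was: await ctx.get(...)
        return StopEvent(result=state)
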
@@ -169,18 +160,17 @@ class PlannerAgent(Workflow):
         """Handle LLM output."""
         logger.debug("🤖 Processing planning output...")
         code = ev.code
-        thoughts = ev.thoughts
 
         if code:
             try:
                 result = await self.executer.execute(ctx, code)
-                logger.info(
+                logger.info("📝 Planning complete")
                 logger.debug(f" - Planning code executed. Result: {result['output']}")
 
                 screenshots = result['screenshots']
                 for screenshot in screenshots[:-1]: # the last screenshot will be captured by next step
                     ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
-
+
                 ui_states = result['ui_states']
                 for ui_state in ui_states[:-1]:
                     ctx.write_event_to_stream(RecordUIStateEvent(ui_state=ui_state['a11y_tree']))
@@ -237,7 +227,7 @@ wrap your code inside this:
     @step
     async def finalize(self, ev: PlanCreatedEvent, ctx: Context) -> StopEvent:
         """Finalize the workflow."""
-        await ctx.set("chat_memory", self.chat_memory)
+        await ctx.store.set("chat_memory", self.chat_memory)
 
         result = {}
         result.update(
@@ -256,15 +246,15 @@ wrap your code inside this:
         logger.debug(f" - Sending {len(chat_history)} messages to LLM.")
 
         model = self.llm.class_name()
-        if self.vision
+        if self.vision:
             if model == "DeepSeek":
                 logger.warning(
                     "[yellow]DeepSeek doesnt support images. Disabling screenshots[/]"
                 )
             else:
                 chat_history = await chat_utils.add_screenshot_image_block(
-                    await ctx.get("screenshot"), chat_history
-                )
+                    await ctx.store.get("screenshot"), chat_history
+                )
 
 
 
@@ -275,18 +265,15 @@ wrap your code inside this:
             chat_history,
         )
 
-        remembered_info = await ctx.get("remembered_info", default=None)
+        remembered_info = await ctx.store.get("remembered_info", default=None)
         if remembered_info:
             chat_history = await chat_utils.add_memory_block(remembered_info, chat_history)
 
-
-
-        chat_history = await chat_utils.add_reflection_summary(reflection, chat_history)
-
-        chat_history = await chat_utils.add_phone_state_block(await ctx.get("phone_state"), chat_history)
-        chat_history = await chat_utils.add_ui_text_block(await ctx.get("ui_state"), chat_history)
+        chat_history = await chat_utils.add_phone_state_block(await ctx.store.get("phone_state"), chat_history)
+        chat_history = await chat_utils.add_ui_text_block(await ctx.store.get("ui_state"), chat_history)
 
-
+        limited_history = self._limit_history(chat_history)
+        messages_to_send = [self.system_message] + limited_history
         messages_to_send = [
             chat_utils.message_copy(msg) for msg in messages_to_send
         ]
@@ -302,3 +289,23 @@ wrap your code inside this:
         except Exception as e:
             logger.error(f"Could not get an answer from LLM: {repr(e)}")
             raise e
+
+    def _limit_history(
+        self, chat_history: List[ChatMessage]
+    ) -> List[ChatMessage]:
+        if LLM_HISTORY_LIMIT <= 0:
+            return chat_history
+
+        max_messages = LLM_HISTORY_LIMIT * 2
+        if len(chat_history) <= max_messages:
+            return chat_history
+
+        preserved_head: List[ChatMessage] = []
+        if chat_history and chat_history[0].role == "user":
+            preserved_head = [chat_history[0]]
+
+        tail = chat_history[-max_messages:]
+        if preserved_head and preserved_head[0] in tail:
+            preserved_head = []
+
+        return preserved_head + tail
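
Note: a rough illustration of the windowing rule in the new _limit_history method, assuming LLM_HISTORY_LIMIT = 2 for the sake of the example (the real value lives in droidrun.agent.common.constants):

from llama_index.core.base.llms.types import ChatMessage

history = [ChatMessage(role="user", content=f"msg {i}") for i in range(10)]
max_messages = 2 * 2                     # LLM_HISTORY_LIMIT * 2
tail = history[-max_messages:]           # keep only the most recent window
head = [history[0]] if history[0] not in tail else []  # first user message survives
limited = head + tail
assert len(limited) == 5                 # first user message + last four messages
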
droidrun/agent/planner/prompts.py
CHANGED
@@ -119,6 +119,6 @@ Instruction: Based **only** on the provided screenshot showing the current state
 # Export all prompts
 __all__ = [
     "DEFAULT_PLANNER_SYSTEM_PROMPT",
-    "DEFAULT_PLANNER_USER_PROMPT",
+    "DEFAULT_PLANNER_USER_PROMPT",
     "DEFAULT_PLANNER_TASK_FAILED_PROMPT"
-]
+]
droidrun/agent/usage.py
CHANGED
@@ -1,18 +1,20 @@
 import contextlib
-
+import logging
+from typing import Any, Dict, List, Optional
+from uuid import uuid4
+
 from llama_index.core.callbacks.base_handler import BaseCallbackHandler
 from llama_index.core.callbacks.schema import CBEventType, EventPayload
 from llama_index.core.llms import LLM, ChatResponse
 from pydantic import BaseModel
-from typing import Any, Dict, List, Optional
-from uuid import uuid4
-import logging
 
 logger = logging.getLogger("droidrun")
 SUPPORTED_PROVIDERS = [
     "Gemini",
     "GoogleGenAI",
+    "GenAI",
     "OpenAI",
+    "openai_llm",
     "Anthropic",
     "Ollama",
     "DeepSeek",
@@ -32,14 +34,14 @@ def get_usage_from_response(provider: str, chat_rsp: ChatResponse) -> UsageResul
 
     print(f"rsp: {rsp.__class__.__name__}")
 
-    if provider == "Gemini" or provider == "GoogleGenAI":
+    if provider == "Gemini" or provider == "GoogleGenAI" or provider == "GenAI":
         return UsageResult(
             request_tokens=rsp["usage_metadata"]["prompt_token_count"],
             response_tokens=rsp["usage_metadata"]["candidates_token_count"],
             total_tokens=rsp["usage_metadata"]["total_token_count"],
             requests=1,
         )
-    elif provider == "OpenAI":
+    elif provider == "OpenAI" or provider == "openai_llm":
         from openai.types import CompletionUsage as OpenAIUsage
 
         usage: OpenAIUsage = rsp.usage
@@ -109,7 +111,7 @@ class TokenCountingHandler(BaseCallbackHandler):
         )
 
     def _get_event_usage(self, payload: Dict[str, Any]) -> UsageResult:
-        if
+        if EventPayload.RESPONSE not in payload:
             raise ValueError("No response in payload")
 
         chat_rsp: ChatResponse = payload.get(EventPayload.RESPONSE)
@@ -178,26 +180,26 @@ def create_tracker(llm: LLM) -> TokenCountingHandler:
 
 def track_usage(llm: LLM) -> TokenCountingHandler:
     """Track token usage for an LLM instance across all requests.
-
+
     This function:
     - Creates a new TokenCountingHandler for the LLM provider
     - Registers that handler as an LLM callback to monitor all requests
     - Returns the handler for accessing cumulative usage statistics
-
+
     The handler counts tokens for total LLM usage across all requests. For fine-grained
     per-request counting, use either:
     - `create_tracker()` with `llm_callback()` context manager for temporary tracking
     - `get_usage_from_response()` to extract usage from individual responses
-
+
     Args:
         llm: The LLamaIndex LLM instance to track usage for
-
+
     Returns:
         TokenCountingHandler: The registered handler that accumulates usage statistics
-
+
     Raises:
         ValueError: If the LLM provider is not supported for tracking
-
+
     Example:
         >>> llm = OpenAI()
        >>> tracker = track_usage(llm)