droidrun 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- droidrun/__init__.py +16 -11
- droidrun/__main__.py +1 -1
- droidrun/adb/__init__.py +3 -3
- droidrun/adb/device.py +1 -1
- droidrun/adb/manager.py +2 -2
- droidrun/agent/__init__.py +6 -0
- droidrun/agent/codeact/__init__.py +2 -4
- droidrun/agent/codeact/codeact_agent.py +321 -235
- droidrun/agent/codeact/events.py +12 -20
- droidrun/agent/codeact/prompts.py +0 -52
- droidrun/agent/common/default.py +5 -0
- droidrun/agent/common/events.py +4 -0
- droidrun/agent/context/__init__.py +23 -0
- droidrun/agent/context/agent_persona.py +15 -0
- droidrun/agent/context/context_injection_manager.py +66 -0
- droidrun/agent/context/episodic_memory.py +15 -0
- droidrun/agent/context/personas/__init__.py +11 -0
- droidrun/agent/context/personas/app_starter.py +44 -0
- droidrun/agent/context/personas/default.py +95 -0
- droidrun/agent/context/personas/extractor.py +52 -0
- droidrun/agent/context/personas/ui_expert.py +107 -0
- droidrun/agent/context/reflection.py +20 -0
- droidrun/agent/context/task_manager.py +124 -0
- droidrun/agent/context/todo.txt +4 -0
- droidrun/agent/droid/__init__.py +2 -2
- droidrun/agent/droid/droid_agent.py +264 -325
- droidrun/agent/droid/events.py +28 -0
- droidrun/agent/oneflows/reflector.py +265 -0
- droidrun/agent/planner/__init__.py +2 -4
- droidrun/agent/planner/events.py +9 -13
- droidrun/agent/planner/planner_agent.py +268 -0
- droidrun/agent/planner/prompts.py +33 -53
- droidrun/agent/utils/__init__.py +3 -0
- droidrun/agent/utils/async_utils.py +1 -40
- droidrun/agent/utils/chat_utils.py +268 -48
- droidrun/agent/utils/executer.py +49 -14
- droidrun/agent/utils/llm_picker.py +14 -10
- droidrun/agent/utils/trajectory.py +184 -0
- droidrun/cli/__init__.py +1 -1
- droidrun/cli/logs.py +283 -0
- droidrun/cli/main.py +333 -439
- droidrun/run.py +105 -0
- droidrun/tools/__init__.py +5 -10
- droidrun/tools/{actions.py → adb.py} +279 -238
- droidrun/tools/ios.py +594 -0
- droidrun/tools/tools.py +99 -0
- droidrun-0.3.0.dist-info/METADATA +149 -0
- droidrun-0.3.0.dist-info/RECORD +52 -0
- droidrun/agent/planner/task_manager.py +0 -355
- droidrun/agent/planner/workflow.py +0 -371
- droidrun/tools/device.py +0 -29
- droidrun/tools/loader.py +0 -60
- droidrun-0.2.0.dist-info/METADATA +0 -373
- droidrun-0.2.0.dist-info/RECORD +0 -32
- {droidrun-0.2.0.dist-info → droidrun-0.3.0.dist-info}/WHEEL +0 -0
- {droidrun-0.2.0.dist-info → droidrun-0.3.0.dist-info}/entry_points.txt +0 -0
- {droidrun-0.2.0.dist-info → droidrun-0.3.0.dist-info}/licenses/LICENSE +0 -0
droidrun/agent/codeact/codeact_agent.py
@@ -1,23 +1,35 @@
 import logging
 import re
-import inspect
 import time
-
-
+import asyncio
+import json
+import os
+from typing import List, Optional, Tuple, Union
+from llama_index.core.base.llms.types import ChatMessage, ChatResponse
 from llama_index.core.prompts import PromptTemplate
 from llama_index.core.llms.llm import LLM
 from llama_index.core.workflow import Workflow, StartEvent, StopEvent, Context, step
-from llama_index.core.memory import
-from .events import
-
-
-
-
-
+from llama_index.core.memory import Memory
+from droidrun.agent.codeact.events import (
+    TaskInputEvent,
+    TaskEndEvent,
+    TaskExecutionEvent,
+    TaskExecutionResultEvent,
+    TaskThinkingEvent,
+    EpisodicMemoryEvent,
+)
+from droidrun.agent.common.events import ScreenshotEvent
+from droidrun.agent.utils import chat_utils
+from droidrun.agent.utils.executer import SimpleCodeExecutor
+from droidrun.agent.codeact.prompts import (
+    DEFAULT_CODE_ACT_USER_PROMPT,
+    DEFAULT_NO_THOUGHTS_PROMPT,
 )
 
-
-
+from droidrun.agent.context.episodic_memory import EpisodicMemory, EpisodicMemoryStep
+from droidrun.tools import Tools
+from typing import Optional, Dict, Tuple, List, Any, Callable
+from droidrun.agent.context.agent_persona import AgentPersona
 
 logger = logging.getLogger("droidrun")
 
@@ -28,307 +40,381 @@ class CodeActAgent(Workflow):
     to solve problems requiring code execution. It extracts code from
     Markdown blocks and uses specific step types for tracking.
     """
+
     def __init__(
         self,
         llm: LLM,
-
-
-
-        max_steps: int =
-        system_prompt: Optional[str] = None,
-        user_prompt: Optional[str] = None,
-        vision: bool = False,
+        persona: AgentPersona,
+        tools_instance: "Tools",
+        all_tools_list: Dict[str, Callable[..., Any]],
+        max_steps: int = 5,
         debug: bool = False,
         *args,
-        **kwargs
+        **kwargs,
     ):
         # assert instead of if
         assert llm, "llm must be provided."
-        assert code_execute_fn, "code_execute_fn must be provided"
         super().__init__(*args, **kwargs)
 
         self.llm = llm
-        self.
-
-        self.
-        self.max_steps = max_steps # Kept for backwards compatibility but not enforced
-        self.tool_descriptions = self.parse_tool_descriptions() # Parse tool descriptions once at initialization
-        self.system_prompt_content = (system_prompt or DEFAULT_CODE_ACT_SYSTEM_PROMPT).format(tool_descriptions=self.tool_descriptions)
-        self.system_prompt = ChatMessage(role="system", content=self.system_prompt_content)
-        self.user_prompt = user_prompt
+        self.max_steps = max_steps
+
+        self.user_prompt = persona.user_prompt
         self.no_thoughts_prompt = None
-
+
+        self.chat_memory = None
+        self.episodic_memory = EpisodicMemory(persona=persona)
+        self.remembered_info = None
+
         self.goal = None
-        self.steps_counter = 0
-        self.code_exec_counter = 0
-        self.vision = vision
+        self.steps_counter = 0
+        self.code_exec_counter = 0
         self.debug = debug
-        logger.info("✅ CodeActAgent initialized successfully.")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def _extract_code_and_thought(self, response_text: str) -> Tuple[Optional[str], str]:
-        """
-        Extracts code from Markdown blocks (```python ... ```) and the surrounding text (thought),
-        handling indented code blocks.
-
-        Returns:
-            Tuple[Optional[code_string], thought_string]
-        """
-        if self.debug:
-            logger.debug("✂️ Extracting code and thought from response...")
-        code_pattern = r"^\s*```python\s*\n(.*?)\n^\s*```\s*?$" # Added ^\s*, re.MULTILINE, and made closing fence match more robust
-        # Use re.DOTALL to make '.' match newlines and re.MULTILINE to make '^' match start of lines
-        code_matches = list(re.finditer(code_pattern, response_text, re.DOTALL | re.MULTILINE))
-
-        if not code_matches:
-            # No code found, the entire response is thought
-            if self.debug:
-                logger.debug(" - No code block found. Entire response is thought.")
-            return None, response_text.strip()
-
-        extracted_code_parts = []
-        for match in code_matches:
-            # group(1) is the (.*?) part - the actual code content
-            code_content = match.group(1)
-            extracted_code_parts.append(code_content) # Keep original indentation for now
-
-        extracted_code = "\n\n".join(extracted_code_parts)
-        if self.debug:
-            logger.debug(f" - Combined extracted code:\n```python\n{extracted_code}\n```")
-
-
-        # Extract thought text (text before the first code block, between blocks, and after the last)
-        thought_parts = []
-        last_end = 0
-        for match in code_matches:
-            # Use span(0) to get the start/end of the *entire* match (including fences and indentation)
-            start, end = match.span(0)
-            thought_parts.append(response_text[last_end:start])
-            last_end = end
-        thought_parts.append(response_text[last_end:]) # Text after the last block
-
-        thought_text = "".join(thought_parts).strip()
-        # Avoid overly long debug messages for thought
-        if self.debug:
-            thought_preview = (thought_text[:100] + '...') if len(thought_text) > 100 else thought_text
-            logger.debug(f" - Extracted thought: {thought_preview}")
-
-        return extracted_code, thought_text
+        self.tools = tools_instance
+
+        self.tool_list = {}
+
+        for tool_name in persona.allowed_tools:
+            if tool_name in all_tools_list:
+                self.tool_list[tool_name] = all_tools_list[tool_name]
+
+        self.tool_descriptions = chat_utils.parse_tool_descriptions(self.tool_list)
+
+        self.system_prompt_content = persona.system_prompt.format(
+            tool_descriptions=self.tool_descriptions
+        )
+        self.system_prompt = ChatMessage(
+            role="system", content=self.system_prompt_content
+        )
+
+        self.required_context = persona.required_context
+
+        self.executor = SimpleCodeExecutor(
+            loop=asyncio.get_event_loop(),
+            locals={},
+            tools=self.tool_list,
+            globals={"__builtins__": __builtins__},
+        )
+
+        logger.info("✅ CodeActAgent initialized successfully.")
 
     @step
-    async def prepare_chat(self,
+    async def prepare_chat(self, ctx: Context, ev: StartEvent) -> TaskInputEvent:
         """Prepare chat history from user input."""
         logger.info("💬 Preparing chat for task execution...")
-
-        self.
-            "
+
+        self.chat_memory: Memory = await ctx.get(
+            "chat_memory", default=Memory.from_defaults()
         )
+
         user_input = ev.get("input", default=None)
         assert user_input, "User input cannot be empty."
-
-        if
-
+
+        if ev.remembered_info:
+            self.remembered_info = ev.remembered_info
+
+        logger.debug(" - Adding goal to memory.")
         goal = user_input
-        self.user_message = ChatMessage(
-
-
-
-
-
-
+        self.user_message = ChatMessage(
+            role="user",
+            content=PromptTemplate(
+                self.user_prompt or DEFAULT_CODE_ACT_USER_PROMPT
+            ).format(goal=goal),
+        )
+        self.no_thoughts_prompt = ChatMessage(
+            role="user",
+            content=PromptTemplate(DEFAULT_NO_THOUGHTS_PROMPT).format(goal=goal),
+        )
+
+
+        await self.chat_memory.aput(self.user_message)
+
+        await ctx.set("chat_memory", self.chat_memory)
+        input_messages = self.chat_memory.get_all()
+        return TaskInputEvent(input=input_messages)
+
     @step
-    async def handle_llm_input(
+    async def handle_llm_input(
+        self, ctx: Context, ev: TaskInputEvent
+    ) -> TaskThinkingEvent | TaskEndEvent:
         """Handle LLM input."""
-        # Get chat history from event
         chat_history = ev.input
         assert len(chat_history) > 0, "Chat history cannot be empty."
+        ctx.write_event_to_stream(ev)
+
+        if self.steps_counter >= self.max_steps:
+            ev = TaskEndEvent(
+                success=False,
+                reason=f"Reached max step count of {self.max_steps} steps",
+            )
+            ctx.write_event_to_stream(ev)
+            return ev
 
         self.steps_counter += 1
         logger.info(f"🧠 Step {self.steps_counter}: Thinking...")
+
+        model = self.llm.class_name()
 
-
-
-
-
-
-
-
-
-
-
+        if "remember" in self.tool_list and self.remembered_info:
+            await ctx.set("remembered_info", self.remembered_info)
+            chat_history = await chat_utils.add_memory_block(self.remembered_info, chat_history)
+
+        for context in self.required_context:
+            if context == "screenshot" and model != "DeepSeek":
+                screenshot = (await self.tools.take_screenshot())[1]
+                ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
+
+                await ctx.set("screenshot", screenshot)
+                chat_history = await chat_utils.add_screenshot_image_block(screenshot, chat_history)
+
+            if context == "phone_state":
+                chat_history = await chat_utils.add_phone_state_block(await self.tools.get_phone_state(), chat_history)
+
+            if context == "ui_state":
+                ui_state = await self.tools.get_clickables()
+                await ctx.set("ui_state", ui_state)
+                chat_history = await chat_utils.add_ui_text_block(
+                    ui_state, chat_history
+                )
+
+            if context == "packages":
+                chat_history = await chat_utils.add_packages_block(
+                    await self.tools.list_packages(include_system_apps=True),
+                    chat_history,
+                )
+
+        response = await self._get_llm_response(ctx, chat_history)
+        if response is None:
+            return TaskEndEvent(
+                success=False, reason="LLM response is None. This is a critical error."
+            )
+
+        await self.chat_memory.aput(response.message)
+
+        code, thoughts = chat_utils.extract_code_and_thought(response.message.content)
+
+        event = TaskThinkingEvent(thoughts=thoughts, code=code)
+        ctx.write_event_to_stream(event)
+        return event
 
     @step
-    async def handle_llm_output(
+    async def handle_llm_output(
+        self, ctx: Context, ev: TaskThinkingEvent
+    ) -> Union[TaskExecutionEvent, TaskInputEvent]:
         """Handle LLM output."""
-
-        logger.debug("⚙️ Handling LLM output...")
-        # Get code and thoughts from event
+        logger.debug("⚙️ Handling LLM output...")
         code = ev.code
         thoughts = ev.thoughts
 
-        # Warning if no thoughts are provided
         if not thoughts:
-            logger.warning(
-
+            logger.warning(
+                "🤔 LLM provided code without thoughts. Adding reminder prompt."
+            )
+            await self.chat_memory.aput(self.no_thoughts_prompt)
         else:
-            # print thought but start with emoji at the start of the log
            logger.info(f"🤔 Reasoning: {thoughts}")
 
-        # If code is present, execute it
         if code:
-            return
+            return TaskExecutionEvent(code=code)
         else:
-            message = ChatMessage(
-
-
+            message = ChatMessage(
+                role="user",
+                content="No code was provided. If you want to mark task as complete (whether it failed or succeeded), use complete(success:bool, reason:str) function within a code block ```pythn\n```.",
+            )
+            await self.chat_memory.aput(message)
+            return TaskInputEvent(input=self.chat_memory.get_all())
 
     @step
-    async def execute_code(
+    async def execute_code(
+        self, ctx: Context, ev: TaskExecutionEvent
+    ) -> Union[TaskExecutionResultEvent, TaskEndEvent]:
         """Execute the code and return the result."""
         code = ev.code
         assert code, "Code cannot be empty."
         logger.info(f"⚡ Executing action...")
-
-
-        # Execute the code using the provided function
+        logger.debug(f"Code to execute:\n```python\n{code}\n```")
+
         try:
             self.code_exec_counter += 1
-            result = await self.
+            result = await self.executor.execute(ctx, code)
             logger.info(f"💡 Code execution successful. Result: {result}")
+
             if self.tools.finished == True:
                 logger.debug(" - Task completed.")
-
-
+                event = TaskEndEvent(
+                    success=self.tools.success, reason=self.tools.reason
+                )
+                ctx.write_event_to_stream(event)
+                return event
+
+            self.remembered_info = self.tools.memory
+
+            event = TaskExecutionResultEvent(output=str(result))
+            ctx.write_event_to_stream(event)
+            return event
+
         except Exception as e:
             logger.error(f"💥 Action failed: {e}")
             if self.debug:
                 logger.error("Exception details:", exc_info=True)
             error_message = f"Error during execution: {e}"
-
+
+            event = TaskExecutionResultEvent(output=error_message)
+            ctx.write_event_to_stream(event)
+            return event
 
     @step
-    async def handle_execution_result(
+    async def handle_execution_result(
+        self, ctx: Context, ev: TaskExecutionResultEvent
+    ) -> TaskInputEvent:
         """Handle the execution result. Currently it just returns InputEvent."""
-
-        logger.debug("📊 Handling execution result...")
+        logger.debug("📊 Handling execution result...")
         # Get the output from the event
         output = ev.output
         if output is None:
             output = "Code executed, but produced no output."
             logger.warning(" - Execution produced no output.")
         else:
-
-
+            logger.debug(
+                f" - Execution output: {output[:100]}..."
+                if len(output) > 100
+                else f" - Execution output: {output}"
+            )
         # Add the output to memory as an user message (observation)
-        observation_message = ChatMessage(
-
-
-
-
-
+        observation_message = ChatMessage(
+            role="user", content=f"Execution Result:\n```\n{output}\n```"
+        )
+        await self.chat_memory.aput(observation_message)
+
+        return TaskInputEvent(input=self.chat_memory.get_all())
 
     @step
-    async def finalize(self, ev:
+    async def finalize(self, ev: TaskEndEvent, ctx: Context) -> StopEvent:
         """Finalize the workflow."""
-        self.tools.finished = False
-        await ctx.set("
+        self.tools.finished = False
+        await ctx.set("chat_memory", self.chat_memory)
 
-        #
-
-        result.update({
-            "codeact_steps": self.steps_counter,
-            "code_executions": self.code_exec_counter
-        })
+        # Add final state observation to episodic memory
+        await self._add_final_state_observation(ctx)
 
-
+        result = {}
+        result.update(
+            {
+                "success": ev.success,
+                "reason": ev.reason,
+                "codeact_steps": self.steps_counter,
+                "code_executions": self.code_exec_counter,
+            }
+        )
 
-
-
-
-
-
-        if self.vision:
-            chat_history = await add_screenshot_image_block(self.tools, chat_history)
-        elif self.tools.last_screenshot:
-            chat_history = await add_screenshot(chat_history, self.tools.last_screenshot)
-            self.tools.last_screenshot = None # Reset last screenshot after sending it
-
-        # always add ui
-        chat_history = await add_ui_text_block(self.tools, chat_history)
-
-        # Add remembered information if available
-        if hasattr(self.tools, 'memory') and self.tools.memory:
-            memory_block = "\n### Remembered Information:\n"
-            for idx, item in enumerate(self.tools.memory, 1):
-                memory_block += f"{idx}. {item}\n"
-
-            # Find the first user message and inject memory before it
-            for i, msg in enumerate(chat_history):
-                if msg.role == "user":
-                    if isinstance(msg.content, str):
-                        # For text-only messages
-                        updated_content = f"{memory_block}\n\n{msg.content}"
-                        chat_history[i] = ChatMessage(role="user", content=updated_content)
-                    elif isinstance(msg.content, list):
-                        # For multimodal content
-                        memory_text_block = TextBlock(text=memory_block)
-                        # Insert memory text block at beginning
-                        content_blocks = [memory_text_block] + msg.content
-                        chat_history[i] = ChatMessage(role="user", content=content_blocks)
-                    break
-
-        messages_to_send = [self.system_prompt] + chat_history
+        ctx.write_event_to_stream(
+            EpisodicMemoryEvent(episodic_memory=self.episodic_memory)
+        )
+
+        return StopEvent(result=result)
 
-
+    async def _get_llm_response(
+        self, ctx: Context, chat_history: List[ChatMessage]
+    ) -> ChatResponse | None:
+        logger.debug("🔍 Getting LLM response...")
+        messages_to_send = [self.system_prompt] + chat_history
+        messages_to_send = [chat_utils.message_copy(msg) for msg in messages_to_send]
         try:
-            response = await self.llm.achat(
-
+            response = await self.llm.achat(messages=messages_to_send)
+            logger.debug("🔍 Received LLM response.")
+
+            filtered_chat_history = []
+            for msg in chat_history:
+                filtered_msg = chat_utils.message_copy(msg)
+                if hasattr(filtered_msg, "blocks") and filtered_msg.blocks:
+                    filtered_msg.blocks = [
+                        block
+                        for block in filtered_msg.blocks
+                        if not isinstance(block, chat_utils.ImageBlock)
+                    ]
+                filtered_chat_history.append(filtered_msg)
+
+            # Convert chat history and response to JSON strings
+            chat_history_str = json.dumps(
+                [
+                    {"role": msg.role, "content": msg.content}
+                    for msg in filtered_chat_history
+                ]
             )
-
+            response_str = json.dumps(
+                {"role": response.message.role, "content": response.message.content}
+            )
+
+            step = EpisodicMemoryStep(
+                chat_history=chat_history_str,
+                response=response_str,
+                timestamp=time.time(),
+                screenshot=(await ctx.get("screenshot", None))
+            )
+
+            self.episodic_memory.steps.append(step)
+
+            assert hasattr(
+                response, "message"
+            ), f"LLM response does not have a message attribute.\nResponse: {response}"
         except Exception as e:
-            if
-
-
-
-
-
-
-
-
-
-
-
-            )
+            if (
+                self.llm.class_name() == "Gemini_LLM"
+                and "You exceeded your current quota" in str(e)
+            ):
+                s = str(e._details[2])
+                match = re.search(r"seconds:\s*(\d+)", s)
+                if match:
+                    seconds = int(match.group(1)) + 1
+                    logger.error(f"Rate limit error. Retrying in {seconds} seconds...")
+                    time.sleep(seconds)
+                else:
+                    logger.error(f"Rate limit error. Retrying in 5 seconds...")
+                    time.sleep(40)
+                logger.debug("🔍 Retrying call to LLM...")
+                response = await self.llm.achat(messages=messages_to_send)
             else:
-                logger.error(f"
-
-
-        logger.debug(" - Received response from LLM.")
+                logger.error(f"Could not get an answer from LLM: {repr(e)}")
+                raise e
+        logger.debug(" - Received response from LLM.")
         return response
-
+
+    async def _add_final_state_observation(self, ctx: Context) -> None:
+        """Add the current UI state and screenshot as the final observation step."""
+        try:
+            # Get current screenshot and UI state
+            screenshot = None
+            ui_state = None
+
+            try:
+                _, screenshot_bytes = await self.tools.take_screenshot()
+                screenshot = screenshot_bytes
+            except Exception as e:
+                logger.warning(f"Failed to capture final screenshot: {e}")
+
+            try:
+                ui_state = await self.tools.get_clickables()
+            except Exception as e:
+                logger.warning(f"Failed to capture final UI state: {e}")
+
+            # Create final observation chat history and response
+            final_chat_history = [{"role": "system", "content": "Final state observation after task completion"}]
+            final_response = {
+                "role": "user",
+                "content": f"Final State Observation:\nUI State: {ui_state}\nScreenshot: {'Available' if screenshot else 'Not available'}"
+            }
+
+            # Create final episodic memory step
+            final_step = EpisodicMemoryStep(
+                chat_history=json.dumps(final_chat_history),
+                response=json.dumps(final_response),
+                timestamp=time.time(),
+                screenshot=screenshot
+            )
+
+            self.episodic_memory.steps.append(final_step)
+            logger.info("Added final state observation to episodic memory")
+
+        except Exception as e:
+            logger.error(f"Failed to add final state observation: {e}")