droidrun 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (55)
  1. droidrun/__init__.py +16 -11
  2. droidrun/__main__.py +1 -1
  3. droidrun/adb/__init__.py +3 -3
  4. droidrun/adb/device.py +1 -1
  5. droidrun/adb/manager.py +2 -2
  6. droidrun/agent/__init__.py +6 -0
  7. droidrun/agent/codeact/__init__.py +2 -4
  8. droidrun/agent/codeact/codeact_agent.py +330 -235
  9. droidrun/agent/codeact/events.py +12 -20
  10. droidrun/agent/codeact/prompts.py +0 -52
  11. droidrun/agent/common/default.py +5 -0
  12. droidrun/agent/common/events.py +4 -0
  13. droidrun/agent/context/__init__.py +23 -0
  14. droidrun/agent/context/agent_persona.py +15 -0
  15. droidrun/agent/context/context_injection_manager.py +66 -0
  16. droidrun/agent/context/episodic_memory.py +15 -0
  17. droidrun/agent/context/personas/__init__.py +11 -0
  18. droidrun/agent/context/personas/app_starter.py +44 -0
  19. droidrun/agent/context/personas/default.py +95 -0
  20. droidrun/agent/context/personas/extractor.py +52 -0
  21. droidrun/agent/context/personas/ui_expert.py +107 -0
  22. droidrun/agent/context/reflection.py +20 -0
  23. droidrun/agent/context/task_manager.py +124 -0
  24. droidrun/agent/droid/__init__.py +2 -2
  25. droidrun/agent/droid/droid_agent.py +269 -325
  26. droidrun/agent/droid/events.py +28 -0
  27. droidrun/agent/oneflows/reflector.py +265 -0
  28. droidrun/agent/planner/__init__.py +2 -4
  29. droidrun/agent/planner/events.py +9 -13
  30. droidrun/agent/planner/planner_agent.py +288 -0
  31. droidrun/agent/planner/prompts.py +33 -53
  32. droidrun/agent/utils/__init__.py +3 -0
  33. droidrun/agent/utils/async_utils.py +1 -40
  34. droidrun/agent/utils/chat_utils.py +265 -48
  35. droidrun/agent/utils/executer.py +49 -14
  36. droidrun/agent/utils/llm_picker.py +14 -10
  37. droidrun/agent/utils/trajectory.py +184 -0
  38. droidrun/cli/__init__.py +1 -1
  39. droidrun/cli/logs.py +283 -0
  40. droidrun/cli/main.py +364 -441
  41. droidrun/tools/__init__.py +5 -10
  42. droidrun/tools/{actions.py → adb.py} +381 -412
  43. droidrun/tools/ios.py +596 -0
  44. droidrun/tools/tools.py +95 -0
  45. droidrun-0.3.1.dist-info/METADATA +150 -0
  46. droidrun-0.3.1.dist-info/RECORD +50 -0
  47. droidrun/agent/planner/task_manager.py +0 -355
  48. droidrun/agent/planner/workflow.py +0 -371
  49. droidrun/tools/device.py +0 -29
  50. droidrun/tools/loader.py +0 -60
  51. droidrun-0.2.0.dist-info/METADATA +0 -373
  52. droidrun-0.2.0.dist-info/RECORD +0 -32
  53. {droidrun-0.2.0.dist-info → droidrun-0.3.1.dist-info}/WHEEL +0 -0
  54. {droidrun-0.2.0.dist-info → droidrun-0.3.1.dist-info}/entry_points.txt +0 -0
  55. {droidrun-0.2.0.dist-info → droidrun-0.3.1.dist-info}/licenses/LICENSE +0 -0
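
Migration note: in 0.3.x the CodeActAgent constructor drops code_execute_fn/available_tools in favor of an AgentPersona plus an explicit tool registry, and code execution moves into a built-in SimpleCodeExecutor (see the codeact_agent.py diff below). A minimal sketch of the new call shape, assuming an already-loaded LLM, a Tools implementation, and a persona; the llm, persona, tools, and tool_map variables are hypothetical placeholders, not part of the diff:

    from droidrun.agent.codeact.codeact_agent import CodeActAgent

    # Hypothetical wiring for illustration; parameter names follow the diff below.
    agent = CodeActAgent(
        llm=llm,                  # any llama_index LLM instance
        persona=persona,          # AgentPersona: prompts, allowed_tools, required_context
        vision=True,              # attach screenshots when the persona requires them
        tools_instance=tools,     # Tools implementation (e.g. the adb-backed one)
        all_tools_list=tool_map,  # Dict[str, Callable]; filtered by persona.allowed_tools
        max_steps=5,              # now enforced per task (was 10 and unenforced in 0.2.0)
    )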
@@ -1,23 +1,35 @@
  import logging
  import re
- import inspect
  import time
- from typing import Awaitable, Callable, List, Optional, Dict, Any, Tuple, TYPE_CHECKING, Union
- from llama_index.core.base.llms.types import ChatMessage, ChatResponse, TextBlock
+ import asyncio
+ import json
+ import os
+ from typing import List, Optional, Tuple, Union
+ from llama_index.core.base.llms.types import ChatMessage, ChatResponse
  from llama_index.core.prompts import PromptTemplate
  from llama_index.core.llms.llm import LLM
  from llama_index.core.workflow import Workflow, StartEvent, StopEvent, Context, step
- from llama_index.core.memory import ChatMemoryBuffer
- from .events import FinalizeEvent, InputEvent, ModelOutputEvent, ExecutionEvent, ExecutionResultEvent
- from ..utils.chat_utils import add_screenshot, add_screenshot_image_block, add_ui_text_block, message_copy
- from .prompts import (
-     DEFAULT_CODE_ACT_SYSTEM_PROMPT,
-     DEFAULT_CODE_ACT_USER_PROMPT,
-     DEFAULT_NO_THOUGHTS_PROMPT
+ from llama_index.core.memory import Memory
+ from droidrun.agent.codeact.events import (
+     TaskInputEvent,
+     TaskEndEvent,
+     TaskExecutionEvent,
+     TaskExecutionResultEvent,
+     TaskThinkingEvent,
+     EpisodicMemoryEvent,
+ )
+ from droidrun.agent.common.events import ScreenshotEvent
+ from droidrun.agent.utils import chat_utils
+ from droidrun.agent.utils.executer import SimpleCodeExecutor
+ from droidrun.agent.codeact.prompts import (
+     DEFAULT_CODE_ACT_USER_PROMPT,
+     DEFAULT_NO_THOUGHTS_PROMPT,
  )

- if TYPE_CHECKING:
-     from ...tools import Tools
+ from droidrun.agent.context.episodic_memory import EpisodicMemory, EpisodicMemoryStep
+ from droidrun.tools import Tools
+ from typing import Optional, Dict, Tuple, List, Any, Callable
+ from droidrun.agent.context.agent_persona import AgentPersona

  logger = logging.getLogger("droidrun")

@@ -28,307 +40,390 @@ class CodeActAgent(Workflow):
      to solve problems requiring code execution. It extracts code from
      Markdown blocks and uses specific step types for tracking.
      """
+
      def __init__(
          self,
          llm: LLM,
-         code_execute_fn: Callable[[str], Awaitable[Dict[str, Any]]],
-         tools: 'Tools',
-         available_tools: List = [],
-         max_steps: int = 10, # Default max steps (kept for backwards compatibility but no longer enforced)
-         system_prompt: Optional[str] = None,
-         user_prompt: Optional[str] = None,
-         vision: bool = False,
+         persona: AgentPersona,
+         vision: bool,
+         tools_instance: "Tools",
+         all_tools_list: Dict[str, Callable[..., Any]],
+         max_steps: int = 5,
          debug: bool = False,
          *args,
-         **kwargs
+         **kwargs,
      ):
          # assert instead of if
          assert llm, "llm must be provided."
-         assert code_execute_fn, "code_execute_fn must be provided"
          super().__init__(*args, **kwargs)

          self.llm = llm
-         self.code_execute_fn = code_execute_fn
-         self.available_tools = available_tools or []
-         self.tools = tools
-         self.max_steps = max_steps # Kept for backwards compatibility but not enforced
-         self.tool_descriptions = self.parse_tool_descriptions() # Parse tool descriptions once at initialization
-         self.system_prompt_content = (system_prompt or DEFAULT_CODE_ACT_SYSTEM_PROMPT).format(tool_descriptions=self.tool_descriptions)
-         self.system_prompt = ChatMessage(role="system", content=self.system_prompt_content)
-         self.user_prompt = user_prompt
+         self.max_steps = max_steps
+
+         self.user_prompt = persona.user_prompt
          self.no_thoughts_prompt = None
-         self.memory = None
-         self.goal = None
-         self.steps_counter = 0 # Initialize step counter (kept for tracking purposes)
-         self.code_exec_counter = 0 # Initialize execution counter
+
          self.vision = vision
+
+         self.chat_memory = None
+         self.episodic_memory = EpisodicMemory(persona=persona)
+         self.remembered_info = None
+
+         self.goal = None
+         self.steps_counter = 0
+         self.code_exec_counter = 0
          self.debug = debug
-         logger.info("✅ CodeActAgent initialized successfully.")

-     def parse_tool_descriptions(self) -> str:
-         """Parses the available tools and their descriptions for the system prompt."""
-         logger.info("🛠️ Parsing tool descriptions...")
-         # self.available_tools is a list of functions, we need to get their docstrings, names, and signatures and display them as `def name(args) -> return_type:\n"""docstring""" ...\n`
-         tool_descriptions = []
-         excluded_tools = ["take_screenshot"] # List of tools to exclude
-
-         for tool in self.available_tools:
-             assert callable(tool), f"Tool {tool} is not callable."
-             tool_name = tool.__name__
-
-             # Skip excluded tools
-             if tool_name in excluded_tools:
-                 logger.debug(f" - Skipping excluded tool: {tool_name}")
-                 continue
-
-             tool_signature = inspect.signature(tool)
-             tool_docstring = tool.__doc__ or "No description available."
-             # Format the function signature and docstring
-             formatted_signature = f"def {tool_name}{tool_signature}:\n \"\"\"{tool_docstring}\"\"\"\n..."
-             tool_descriptions.append(formatted_signature)
-             logger.debug(f" - Parsed tool: {tool_name}")
-         # Join all tool descriptions into a single string
-         descriptions = "\n".join(tool_descriptions)
-         logger.info(f"🔩 Found {len(tool_descriptions)} tools.")
-         return descriptions
-
-     def _extract_code_and_thought(self, response_text: str) -> Tuple[Optional[str], str]:
-         """
-         Extracts code from Markdown blocks (```python ... ```) and the surrounding text (thought),
-         handling indented code blocks.
-
-         Returns:
-             Tuple[Optional[code_string], thought_string]
-         """
-         if self.debug:
-             logger.debug("✂️ Extracting code and thought from response...")
-         code_pattern = r"^\s*```python\s*\n(.*?)\n^\s*```\s*?$" # Added ^\s*, re.MULTILINE, and made closing fence match more robust
-         # Use re.DOTALL to make '.' match newlines and re.MULTILINE to make '^' match start of lines
-         code_matches = list(re.finditer(code_pattern, response_text, re.DOTALL | re.MULTILINE))
-
-         if not code_matches:
-             # No code found, the entire response is thought
-             if self.debug:
-                 logger.debug(" - No code block found. Entire response is thought.")
-             return None, response_text.strip()
-
-         extracted_code_parts = []
-         for match in code_matches:
-             # group(1) is the (.*?) part - the actual code content
-             code_content = match.group(1)
-             extracted_code_parts.append(code_content) # Keep original indentation for now
-
-         extracted_code = "\n\n".join(extracted_code_parts)
-         if self.debug:
-             logger.debug(f" - Combined extracted code:\n```python\n{extracted_code}\n```")
-
-
-         # Extract thought text (text before the first code block, between blocks, and after the last)
-         thought_parts = []
-         last_end = 0
-         for match in code_matches:
-             # Use span(0) to get the start/end of the *entire* match (including fences and indentation)
-             start, end = match.span(0)
-             thought_parts.append(response_text[last_end:start])
-             last_end = end
-         thought_parts.append(response_text[last_end:]) # Text after the last block
-
-         thought_text = "".join(thought_parts).strip()
-         # Avoid overly long debug messages for thought
-         if self.debug:
-             thought_preview = (thought_text[:100] + '...') if len(thought_text) > 100 else thought_text
-             logger.debug(f" - Extracted thought: {thought_preview}")
-
-         return extracted_code, thought_text
+         self.tools = tools_instance
+
+         self.tool_list = {}
+
+         for tool_name in persona.allowed_tools:
+             if tool_name in all_tools_list:
+                 self.tool_list[tool_name] = all_tools_list[tool_name]
+
+         self.tool_descriptions = chat_utils.parse_tool_descriptions(self.tool_list)
+
+         self.system_prompt_content = persona.system_prompt.format(
+             tool_descriptions=self.tool_descriptions
+         )
+         self.system_prompt = ChatMessage(
+             role="system", content=self.system_prompt_content
+         )
+
+         self.required_context = persona.required_context
+
+         self.executor = SimpleCodeExecutor(
+             loop=asyncio.get_event_loop(),
+             locals={},
+             tools=self.tool_list,
+             globals={"__builtins__": __builtins__},
+         )
+
+         logger.info("✅ CodeActAgent initialized successfully.")

      @step
-     async def prepare_chat(self, ev: StartEvent, ctx: Context) -> InputEvent:
+     async def prepare_chat(self, ctx: Context, ev: StartEvent) -> TaskInputEvent:
          """Prepare chat history from user input."""
          logger.info("💬 Preparing chat for task execution...")
-         # Get or create memory
-         self.memory: ChatMemoryBuffer = await ctx.get(
-             "memory", default=ChatMemoryBuffer.from_defaults(llm=self.llm)
+
+         self.chat_memory: Memory = await ctx.get(
+             "chat_memory", default=Memory.from_defaults()
          )
+
          user_input = ev.get("input", default=None)
          assert user_input, "User input cannot be empty."
-         # Add user input to memory
-         if self.debug:
-             logger.debug(" - Adding goal to memory.")
+
+         if ev.remembered_info:
+             self.remembered_info = ev.remembered_info
+
+         logger.debug(" - Adding goal to memory.")
          goal = user_input
-         self.user_message = ChatMessage(role="user", content=PromptTemplate(self.user_prompt or DEFAULT_CODE_ACT_USER_PROMPT).format(goal=goal))
-         self.no_thoughts_prompt = ChatMessage(role="user", content=PromptTemplate(DEFAULT_NO_THOUGHTS_PROMPT).format(goal=goal))
-         await self.memory.aput(self.user_message)
-         # Update context
-         await ctx.set("memory", self.memory)
-         input_messages = self.memory.get_all()
-         return InputEvent(input=input_messages)
+         self.user_message = ChatMessage(
+             role="user",
+             content=PromptTemplate(
+                 self.user_prompt or DEFAULT_CODE_ACT_USER_PROMPT
+             ).format(goal=goal),
+         )
+         self.no_thoughts_prompt = ChatMessage(
+             role="user",
+             content=PromptTemplate(DEFAULT_NO_THOUGHTS_PROMPT).format(goal=goal),
+         )
+
+
+         await self.chat_memory.aput(self.user_message)
+
+         await ctx.set("chat_memory", self.chat_memory)
+         input_messages = self.chat_memory.get_all()
+         return TaskInputEvent(input=input_messages)
+
      @step
-     async def handle_llm_input(self, ev: InputEvent, ctx: Context) -> Union[ModelOutputEvent, FinalizeEvent]:
+     async def handle_llm_input(
+         self, ctx: Context, ev: TaskInputEvent
+     ) -> TaskThinkingEvent | TaskEndEvent:
          """Handle LLM input."""
-         # Get chat history from event
          chat_history = ev.input
          assert len(chat_history) > 0, "Chat history cannot be empty."
+         ctx.write_event_to_stream(ev)
+
+         if self.steps_counter >= self.max_steps:
+             ev = TaskEndEvent(
+                 success=False,
+                 reason=f"Reached max step count of {self.max_steps} steps",
+             )
+             ctx.write_event_to_stream(ev)
+             return ev

          self.steps_counter += 1
          logger.info(f"🧠 Step {self.steps_counter}: Thinking...")
+
+         model = self.llm.class_name()

-         # Get LLM response
-         response = await self._get_llm_response(chat_history)
-         # Add response to memory
-         await self.memory.aput(response.message)
-         if self.debug:
-             logger.debug("🤖 LLM response received.")
-         code, thoughts = self._extract_code_and_thought(response.message.content)
-         if self.debug:
-             logger.debug(f" - Thoughts: {'Yes' if thoughts else 'No'}, Code: {'Yes' if code else 'No'}")
-         return ModelOutputEvent(thoughts=thoughts, code=code)
+         if "remember" in self.tool_list and self.remembered_info:
+             await ctx.set("remembered_info", self.remembered_info)
+             chat_history = await chat_utils.add_memory_block(self.remembered_info, chat_history)
+
+         for context in self.required_context:
+             if model == "DeepSeek":
+                 logger.warning(
+                     "[yellow]DeepSeek doesnt support images. Disabling screenshots[/]"
+                 )
+             elif self.vision == True and context == "screenshot":
+                 screenshot = (await self.tools.take_screenshot())[1]
+                 ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
+
+                 await ctx.set("screenshot", screenshot)
+                 chat_history = await chat_utils.add_screenshot_image_block(screenshot, chat_history)
+
+             if context == "ui_state":
+                 try:
+                     state = await self.tools.get_state()
+                     await ctx.set("ui_state", state["a11y_tree"])
+                     chat_history = await chat_utils.add_ui_text_block(
+                         state["a11y_tree"], chat_history
+                     )
+                     chat_history = await chat_utils.add_phone_state_block(state["phone_state"], chat_history)
+                 except Exception as e:
+                     logger.warning(f"⚠️ Error retrieving state from the connected device. Is the Accessibility Service enabled?")
+
+
+             if context == "packages":
+                 chat_history = await chat_utils.add_packages_block(
+                     await self.tools.list_packages(include_system_apps=True),
+                     chat_history,
+                 )
+
+         response = await self._get_llm_response(ctx, chat_history)
+         if response is None:
+             return TaskEndEvent(
+                 success=False, reason="LLM response is None. This is a critical error."
+             )
+
+         await self.chat_memory.aput(response.message)
+
+         code, thoughts = chat_utils.extract_code_and_thought(response.message.content)
+
+         event = TaskThinkingEvent(thoughts=thoughts, code=code)
+         ctx.write_event_to_stream(event)
+         return event

      @step
-     async def handle_llm_output(self, ev: ModelOutputEvent, ctx: Context) -> Union[ExecutionEvent, FinalizeEvent]:
+     async def handle_llm_output(
+         self, ctx: Context, ev: TaskThinkingEvent
+     ) -> Union[TaskExecutionEvent, TaskInputEvent]:
          """Handle LLM output."""
-         if self.debug:
-             logger.debug("⚙️ Handling LLM output...")
-         # Get code and thoughts from event
+         logger.debug("⚙️ Handling LLM output...")
          code = ev.code
          thoughts = ev.thoughts

-         # Warning if no thoughts are provided
          if not thoughts:
-             logger.warning("🤔 LLM provided code without thoughts. Adding reminder prompt.")
-             await self.memory.aput(self.no_thoughts_prompt)
+             logger.warning(
+                 "🤔 LLM provided code without thoughts. Adding reminder prompt."
+             )
+             await self.chat_memory.aput(self.no_thoughts_prompt)
          else:
-             # print thought but start with emoji at the start of the log
              logger.info(f"🤔 Reasoning: {thoughts}")

-         # If code is present, execute it
          if code:
-             return ExecutionEvent(code=code)
+             return TaskExecutionEvent(code=code)
          else:
-             message = ChatMessage(role="user", content="No code was provided. If you want to mark task as complete (whether it failed or succeeded), use complete(success:bool, reason:str) function within a code block ```pythn\n```.")
-             await self.memory.aput(message)
-             return InputEvent(input=self.memory.get_all())
+             message = ChatMessage(
+                 role="user",
+                 content="No code was provided. If you want to mark task as complete (whether it failed or succeeded), use complete(success:bool, reason:str) function within a code block ```pythn\n```.",
+             )
+             await self.chat_memory.aput(message)
+             return TaskInputEvent(input=self.chat_memory.get_all())

      @step
-     async def execute_code(self, ev: ExecutionEvent, ctx: Context) -> ExecutionResultEvent:
+     async def execute_code(
+         self, ctx: Context, ev: TaskExecutionEvent
+     ) -> Union[TaskExecutionResultEvent, TaskEndEvent]:
          """Execute the code and return the result."""
          code = ev.code
          assert code, "Code cannot be empty."
          logger.info(f"⚡ Executing action...")
-         if self.debug:
-             logger.debug(f"Code to execute:\n```python\n{code}\n```")
-         # Execute the code using the provided function
+         logger.debug(f"Code to execute:\n```python\n{code}\n```")
+
          try:
              self.code_exec_counter += 1
-             result = await self.code_execute_fn(code)
+             result = await self.executor.execute(ctx, code)
              logger.info(f"💡 Code execution successful. Result: {result}")
+
              if self.tools.finished == True:
                  logger.debug(" - Task completed.")
-                 return FinalizeEvent(result={'success': self.tools.success, 'reason': self.tools.reason})
-             return ExecutionResultEvent(output=str(result)) # Ensure output is string
+                 event = TaskEndEvent(
+                     success=self.tools.success, reason=self.tools.reason
+                 )
+                 ctx.write_event_to_stream(event)
+                 return event
+
+             self.remembered_info = self.tools.memory
+
+             event = TaskExecutionResultEvent(output=str(result))
+             ctx.write_event_to_stream(event)
+             return event
+
          except Exception as e:
              logger.error(f"💥 Action failed: {e}")
              if self.debug:
                  logger.error("Exception details:", exc_info=True)
              error_message = f"Error during execution: {e}"
-             return ExecutionResultEvent(output=error_message) # Return error message as output
+
+             event = TaskExecutionResultEvent(output=error_message)
+             ctx.write_event_to_stream(event)
+             return event

      @step
-     async def handle_execution_result(self, ev: ExecutionResultEvent, ctx: Context) -> InputEvent:
+     async def handle_execution_result(
+         self, ctx: Context, ev: TaskExecutionResultEvent
+     ) -> TaskInputEvent:
          """Handle the execution result. Currently it just returns InputEvent."""
-         if self.debug:
-             logger.debug("📊 Handling execution result...")
+         logger.debug("📊 Handling execution result...")
          # Get the output from the event
          output = ev.output
          if output is None:
              output = "Code executed, but produced no output."
              logger.warning(" - Execution produced no output.")
          else:
-             if self.debug:
-                 logger.debug(f" - Execution output: {output[:100]}..." if len(output) > 100 else f" - Execution output: {output}")
+             logger.debug(
+                 f" - Execution output: {output[:100]}..."
+                 if len(output) > 100
+                 else f" - Execution output: {output}"
+             )
          # Add the output to memory as an user message (observation)
-         observation_message = ChatMessage(role="user", content=f"Execution Result:\n```\n{output}\n```")
-         await self.memory.aput(observation_message)
-         if self.debug:
-             logger.debug(" - Added execution result to memory.")
-         return InputEvent(input=self.memory.get_all())
-
+         observation_message = ChatMessage(
+             role="user", content=f"Execution Result:\n```\n{output}\n```"
+         )
+         await self.chat_memory.aput(observation_message)
+
+         return TaskInputEvent(input=self.chat_memory.get_all())

      @step
-     async def finalize(self, ev: FinalizeEvent, ctx: Context) -> StopEvent:
+     async def finalize(self, ev: TaskEndEvent, ctx: Context) -> StopEvent:
          """Finalize the workflow."""
-         self.tools.finished = False # Reset finished flag
-         await ctx.set("memory", self.memory) # Ensure memory is set in context
+         self.tools.finished = False
+         await ctx.set("chat_memory", self.chat_memory)

-         # Include steps and code execution information in the result
-         result = ev.result or {}
-         result.update({
-             "codeact_steps": self.steps_counter,
-             "code_executions": self.code_exec_counter
-         })
+         # Add final state observation to episodic memory
+         await self._add_final_state_observation(ctx)

-         return StopEvent(result=result)
+         result = {}
+         result.update(
+             {
+                 "success": ev.success,
+                 "reason": ev.reason,
+                 "codeact_steps": self.steps_counter,
+                 "code_executions": self.code_exec_counter,
+             }
+         )

-     async def _get_llm_response(self, chat_history: List[ChatMessage]) -> ChatResponse:
-         """Get streaming response from LLM."""
-         if self.debug:
-             logger.debug(f" - Sending {len(chat_history)} messages to LLM.")
-         # Combine system prompt with chat history
-         if self.vision:
-             chat_history = await add_screenshot_image_block(self.tools, chat_history)
-         elif self.tools.last_screenshot:
-             chat_history = await add_screenshot(chat_history, self.tools.last_screenshot)
-             self.tools.last_screenshot = None # Reset last screenshot after sending it
-
-         # always add ui
-         chat_history = await add_ui_text_block(self.tools, chat_history)
-
-         # Add remembered information if available
-         if hasattr(self.tools, 'memory') and self.tools.memory:
-             memory_block = "\n### Remembered Information:\n"
-             for idx, item in enumerate(self.tools.memory, 1):
-                 memory_block += f"{idx}. {item}\n"
-
-             # Find the first user message and inject memory before it
-             for i, msg in enumerate(chat_history):
-                 if msg.role == "user":
-                     if isinstance(msg.content, str):
-                         # For text-only messages
-                         updated_content = f"{memory_block}\n\n{msg.content}"
-                         chat_history[i] = ChatMessage(role="user", content=updated_content)
-                     elif isinstance(msg.content, list):
-                         # For multimodal content
-                         memory_text_block = TextBlock(text=memory_block)
-                         # Insert memory text block at beginning
-                         content_blocks = [memory_text_block] + msg.content
-                         chat_history[i] = ChatMessage(role="user", content=content_blocks)
-                     break
-
-         messages_to_send = [self.system_prompt] + chat_history
+         ctx.write_event_to_stream(
+             EpisodicMemoryEvent(episodic_memory=self.episodic_memory)
+         )
+
+         return StopEvent(result=result)

-         messages_to_send = [message_copy(msg) for msg in messages_to_send]
+     async def _get_llm_response(
+         self, ctx: Context, chat_history: List[ChatMessage]
+     ) -> ChatResponse | None:
+         logger.debug("🔍 Getting LLM response...")
+         messages_to_send = [self.system_prompt] + chat_history
+         messages_to_send = [chat_utils.message_copy(msg) for msg in messages_to_send]
          try:
-             response = await self.llm.achat(
-                 messages=messages_to_send
+             response = await self.llm.achat(messages=messages_to_send)
+             logger.debug("🔍 Received LLM response.")
+
+             filtered_chat_history = []
+             for msg in chat_history:
+                 filtered_msg = chat_utils.message_copy(msg)
+                 if hasattr(filtered_msg, "blocks") and filtered_msg.blocks:
+                     filtered_msg.blocks = [
+                         block
+                         for block in filtered_msg.blocks
+                         if not isinstance(block, chat_utils.ImageBlock)
+                     ]
+                 filtered_chat_history.append(filtered_msg)
+
+             # Convert chat history and response to JSON strings
+             chat_history_str = json.dumps(
+                 [
+                     {"role": msg.role, "content": msg.content}
+                     for msg in filtered_chat_history
+                 ]
+             )
+             response_str = json.dumps(
+                 {"role": response.message.role, "content": response.message.content}
              )
-             assert hasattr(response, "message"), f"LLM response does not have a message attribute.\nResponse: {response}"
+
+             step = EpisodicMemoryStep(
+                 chat_history=chat_history_str,
+                 response=response_str,
+                 timestamp=time.time(),
+                 screenshot=(await ctx.get("screenshot", None))
+             )
+
+             self.episodic_memory.steps.append(step)
+
+             assert hasattr(
+                 response, "message"
+             ), f"LLM response does not have a message attribute.\nResponse: {response}"
          except Exception as e:
-             if self.llm.class_name() == "Gemini_LLM" and "You exceeded your current quota" in str(e):
-                 s = str(e._details[2])
-                 match = re.search(r'seconds:\s*(\d+)', s)
-                 if match:
-                     seconds = int(match.group(1)) + 1
-                     logger.error(f"Rate limit error. Retrying in {seconds} seconds...")
-                     time.sleep(seconds)
-                 else:
-                     logger.error(f"Rate limit error. Retrying in 5 seconds...")
-                     time.sleep(40)
-                 response = await self.llm.achat(
-                     messages=messages_to_send
-                 )
+             if (
+                 self.llm.class_name() == "Gemini_LLM"
+                 and "You exceeded your current quota" in str(e)
+             ):
+                 s = str(e._details[2])
+                 match = re.search(r"seconds:\s*(\d+)", s)
+                 if match:
+                     seconds = int(match.group(1)) + 1
+                     logger.error(f"Rate limit error. Retrying in {seconds} seconds...")
+                     time.sleep(seconds)
+                 else:
+                     logger.error(f"Rate limit error. Retrying in 5 seconds...")
+                     time.sleep(40)
+                 logger.debug("🔍 Retrying call to LLM...")
+                 response = await self.llm.achat(messages=messages_to_send)
              else:
-                 logger.error(f"Error getting LLM response: {e}")
-                 return StopEvent(result={'finished': True, 'message': f"Error getting LLM response: {e}", 'steps': self.steps_counter, 'code_executions': self.code_exec_counter}) # Return final message and steps
-         if self.debug:
-             logger.debug(" - Received response from LLM.")
+                 logger.error(f"Could not get an answer from LLM: {repr(e)}")
+                 raise e
+         logger.debug(" - Received response from LLM.")
          return response
-
+
+     async def _add_final_state_observation(self, ctx: Context) -> None:
+         """Add the current UI state and screenshot as the final observation step."""
+         try:
+             # Get current screenshot and UI state
+             screenshot = None
+             ui_state = None
+
+             try:
+                 _, screenshot_bytes = await self.tools.take_screenshot()
+                 screenshot = screenshot_bytes
+             except Exception as e:
+                 logger.warning(f"Failed to capture final screenshot: {e}")
+
+             try:
+                 (a11y_tree, phone_state) = await self.tools.get_state()
+             except Exception as e:
+                 logger.warning(f"Failed to capture final UI state: {e}")
+
+             # Create final observation chat history and response
+             final_chat_history = [{"role": "system", "content": "Final state observation after task completion"}]
+             final_response = {
+                 "role": "user",
+                 "content": f"Final State Observation:\nUI State: {a11y_tree}\nScreenshot: {'Available' if screenshot else 'Not available'}"
+             }
+
+             # Create final episodic memory step
+             final_step = EpisodicMemoryStep(
+                 chat_history=json.dumps(final_chat_history),
+                 response=json.dumps(final_response),
+                 timestamp=time.time(),
+                 screenshot=screenshot
+             )
+
+             self.episodic_memory.steps.append(final_step)
+             logger.info("Added final state observation to episodic memory")
+
+         except Exception as e:
+             logger.error(f"Failed to add final state observation: {e}")