khoj 1.41.1.dev40__py3-none-any.whl → 1.41.1.dev90__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45):
  1. khoj/database/adapters/__init__.py +1 -1
  2. khoj/interface/compiled/404/index.html +1 -1
  3. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
  4. khoj/interface/compiled/agents/index.html +1 -1
  5. khoj/interface/compiled/agents/index.txt +1 -1
  6. khoj/interface/compiled/automations/index.html +1 -1
  7. khoj/interface/compiled/automations/index.txt +1 -1
  8. khoj/interface/compiled/chat/index.html +2 -2
  9. khoj/interface/compiled/chat/index.txt +2 -2
  10. khoj/interface/compiled/index.html +1 -1
  11. khoj/interface/compiled/index.txt +1 -1
  12. khoj/interface/compiled/search/index.html +1 -1
  13. khoj/interface/compiled/search/index.txt +1 -1
  14. khoj/interface/compiled/settings/index.html +1 -1
  15. khoj/interface/compiled/settings/index.txt +1 -1
  16. khoj/interface/compiled/share/chat/index.html +2 -2
  17. khoj/interface/compiled/share/chat/index.txt +2 -2
  18. khoj/processor/conversation/anthropic/anthropic_chat.py +5 -0
  19. khoj/processor/conversation/google/gemini_chat.py +5 -0
  20. khoj/processor/conversation/google/utils.py +4 -0
  21. khoj/processor/conversation/openai/gpt.py +5 -0
  22. khoj/processor/conversation/prompts.py +12 -1
  23. khoj/processor/conversation/utils.py +13 -1
  24. khoj/processor/operator/grounding_agent.py +345 -0
  25. khoj/processor/operator/grounding_agent_uitars.py +973 -0
  26. khoj/processor/operator/operate_browser.py +152 -0
  27. khoj/processor/operator/operator_actions.py +149 -0
  28. khoj/processor/operator/operator_agent_anthropic.py +383 -0
  29. khoj/processor/operator/operator_agent_base.py +80 -0
  30. khoj/processor/operator/operator_agent_binary.py +336 -0
  31. khoj/processor/operator/operator_agent_openai.py +349 -0
  32. khoj/processor/operator/operator_environment_base.py +37 -0
  33. khoj/processor/operator/operator_environment_browser.py +395 -0
  34. khoj/routers/api_chat.py +42 -3
  35. khoj/routers/helpers.py +14 -3
  36. khoj/routers/research.py +48 -1
  37. khoj/utils/helpers.py +17 -0
  38. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/METADATA +5 -3
  39. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/RECORD +44 -34
  40. khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
  41. /khoj/interface/compiled/_next/static/{ifuY0XkcvaIiCG3xJl8zw → WLmcH2J-wz36GlS6O8HSL}/_buildManifest.js +0 -0
  42. /khoj/interface/compiled/_next/static/{ifuY0XkcvaIiCG3xJl8zw → WLmcH2J-wz36GlS6O8HSL}/_ssgManifest.js +0 -0
  43. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/WHEEL +0 -0
  44. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/entry_points.txt +0 -0
  45. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,152 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ from typing import Callable, List, Optional
6
+
7
+ import requests
8
+
9
+ from khoj.database.adapters import AgentAdapters, ConversationAdapters
10
+ from khoj.database.models import Agent, ChatModel, KhojUser
11
+ from khoj.processor.operator.operator_actions import *
12
+ from khoj.processor.operator.operator_agent_anthropic import AnthropicOperatorAgent
13
+ from khoj.processor.operator.operator_agent_base import OperatorAgent
14
+ from khoj.processor.operator.operator_agent_binary import BinaryOperatorAgent
15
+ from khoj.processor.operator.operator_agent_openai import OpenAIOperatorAgent
16
+ from khoj.processor.operator.operator_environment_base import EnvStepResult
17
+ from khoj.processor.operator.operator_environment_browser import BrowserEnvironment
18
+ from khoj.routers.helpers import ChatEvent
19
+ from khoj.utils.helpers import timer
20
+ from khoj.utils.rawconfig import LocationData
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ # --- Browser Operator Function ---
26
async def operate_browser(
    query: str,
    user: KhojUser,
    conversation_log: dict,
    location_data: LocationData,
    send_status_func: Optional[Callable] = None,
    query_images: Optional[List[str]] = None,  # TODO: Handle query images
    agent: Optional[Agent] = None,
    query_files: Optional[str] = None,  # TODO: Handle query files
    cancellation_event: Optional[asyncio.Event] = None,
    tracer: Optional[dict] = None,
):
    """Drive a browser with an operator agent to work on `query`.

    This is an async generator. While running it yields `{ChatEvent.STATUS: event}`
    dicts for every event produced by `send_status_func` (when one is given).
    Its last item is a dict with the keys "query", "result" and "webpages".

    Args:
        query: The task for the operator agent.
        user: User the chat models are looked up for.
        conversation_log: Currently unused by this function.
        location_data: Currently unused by this function.
        send_status_func: Optional callable returning an async iterable of status events.
        query_images: Currently unused (see TODO).
        agent: Optional agent whose chat model is preferred as the reasoning model.
        query_files: Currently unused (see TODO).
        cancellation_event: When set, the operator loop stops at its next check.
        tracer: Dict handed to the operator agent. A fresh dict is created per call
            when omitted, so state never leaks between calls.

    Raises:
        ValueError: If no vision enabled chat model is configured, or if the binary
            operator agent is selected but no supported grounding model is found.
    """
    # Never share one default dict between calls: the operator agents write into the tracer.
    tracer = {} if tracer is None else tracer

    response, summary_message, user_input_message = None, None, None
    environment: Optional[BrowserEnvironment] = None
    cancelled = False

    # Get the agent chat model
    agent_chat_model = await AgentAdapters.aget_agent_chat_model(agent, user) if agent else None
    reasoning_model: ChatModel = await ConversationAdapters.aget_default_chat_model(user, agent_chat_model)
    if not reasoning_model or not reasoning_model.vision_enabled:
        reasoning_model = await ConversationAdapters.aget_vision_enabled_config()
    if not reasoning_model:
        raise ValueError("No vision enabled chat model found. Configure a vision chat model to operate browser.")

    # Initialize Agent
    max_iterations = int(os.getenv("KHOJ_OPERATOR_ITERATIONS", 40))
    operator_agent: OperatorAgent
    if reasoning_model.name.startswith("gpt-4o"):
        operator_agent = OpenAIOperatorAgent(query, reasoning_model, max_iterations, tracer)
    elif reasoning_model.name.startswith("claude-3-7-sonnet"):
        operator_agent = AnthropicOperatorAgent(query, reasoning_model, max_iterations, tracer)
    else:
        # Any other reasoning model is paired with a separate visual grounding model
        grounding_model_name = "ui-tars-1.5"
        grounding_model = await ConversationAdapters.aget_chat_model_by_name(grounding_model_name)
        if (
            not grounding_model
            or not grounding_model.vision_enabled
            or grounding_model.model_type != ChatModel.ModelType.OPENAI
        ):
            raise ValueError("No supported visual grounding model for binary operator agent found.")
        operator_agent = BinaryOperatorAgent(query, reasoning_model, grounding_model, max_iterations, tracer)

    # Initialize Environment
    if send_status_func:
        async for event in send_status_func("**Launching Browser**"):
            yield {ChatEvent.STATUS: event}
    environment = BrowserEnvironment()
    await environment.start(width=1024, height=768)

    # Start Operator Loop
    try:
        summarize_prompt = f"Use the results of our research to provide a comprehensive, self-contained answer for the target query:\n{query}."
        task_completed = False
        iterations = 0

        with timer(f"Operating browser with {reasoning_model.model_type} {reasoning_model.name}", logger):
            while iterations < max_iterations and not task_completed:
                if cancellation_event and cancellation_event.is_set():
                    logger.debug("Browser operator cancelled by client disconnect")
                    cancelled = True
                    break

                iterations += 1

                # 1. Get current environment state
                browser_state = await environment.get_state()

                # 2. Agent decides action(s)
                agent_result = await operator_agent.act(browser_state)

                # 3. Execute actions in the environment
                env_steps: List[EnvStepResult] = []
                for action in agent_result.actions:
                    if cancellation_event and cancellation_event.is_set():
                        logger.debug("Browser operator cancelled by client disconnect")
                        break
                    # Handle request for user action and break the loop
                    if isinstance(action, RequestUserAction):
                        user_input_message = action.request
                        if send_status_func:
                            async for event in send_status_func(f"**Requesting User Input**:\n{action.request}"):
                                yield {ChatEvent.STATUS: event}
                        break
                    env_step = await environment.step(action)
                    env_steps.append(env_step)

                # Render status update
                # Prefer the screenshot taken after the last executed action, else the pre-action state
                latest_screenshot = f"data:image/webp;base64,{env_steps[-1].screenshot_base64 if env_steps else browser_state.screenshot}"
                render_payload = agent_result.rendered_response
                render_payload["image"] = latest_screenshot
                render_content = f"**Action**: {json.dumps(render_payload)}"
                if send_status_func:
                    async for event in send_status_func(f"**Operating Browser**:\n{render_content}"):
                        yield {ChatEvent.STATUS: event}

                # Check if termination conditions are met
                task_completed = not agent_result.actions  # No actions requested by agent
                trigger_iteration_limit = iterations == max_iterations
                if user_input_message:
                    logger.info(f"User input requested: {user_input_message}")
                    break
                if task_completed or trigger_iteration_limit:
                    # Summarize results of operator run on last iteration
                    operator_agent.add_action_results(env_steps, agent_result)
                    summary_message = await operator_agent.summarize(summarize_prompt, browser_state)
                    logger.info(f"Task completed: {task_completed}, Iteration limit: {trigger_iteration_limit}")
                    break

                # 4. Update agent on the results of its action on the environment
                operator_agent.add_action_results(env_steps, agent_result)

            # Determine final response message
            if user_input_message:
                response = user_input_message
            elif task_completed:
                response = summary_message
            elif cancelled:
                # Stopped early on request: do not misreport this as hitting the iteration limit
                response = "Operator run was cancelled before the task completed."
            else:  # Hit iteration limit
                response = f"Operator hit iteration limit ({max_iterations}). If the results seem incomplete try again, assign a smaller task or try a different approach.\nThese were the results till now:\n{summary_message}"
    finally:
        if environment and not user_input_message:  # Don't close browser if user input required
            await environment.close()
        if operator_agent:
            operator_agent.reset()

    yield {
        "query": query,
        "result": user_input_message or response,
        "webpages": [{"link": url, "snippet": ""} for url in environment.visited_urls],
    }
@@ -0,0 +1,149 @@
1
# --- Standardized Action Models ---
from typing import List, Literal, Optional, Union

from pydantic import BaseModel


class Point(BaseModel):
    """A point given by its x and y coordinates."""

    x: float
    y: float


class BaseAction(BaseModel):
    """Common base of all operator actions. Subclasses pin `type` to a literal tag."""

    type: str


class ClickAction(BaseAction):
    """Click at (x, y) with the chosen mouse button."""

    type: Literal["click"] = "click"
    x: float
    y: float
    button: Literal["left", "right", "middle", "wheel"] = "left"
    # Optional modifier string. None means no modifier; the expected format is not defined in this module.
    modifiers: Optional[str] = None


class DoubleClickAction(BaseAction):
    """Double click at (x, y)."""

    type: Literal["double_click"] = "double_click"
    x: float
    y: float


class TripleClickAction(BaseAction):
    """Triple click at (x, y)."""

    type: Literal["triple_click"] = "triple_click"
    x: float
    y: float


class ScrollAction(BaseAction):
    """Scroll action. Every field is optional; which ones are honoured is up to the consumer."""

    type: Literal["scroll"] = "scroll"
    x: Optional[float] = None
    y: Optional[float] = None
    scroll_x: Optional[float] = None
    scroll_y: Optional[float] = None
    scroll_direction: Optional[Literal["up", "down", "left", "right"]] = None
    scroll_amount: Optional[float] = 2.0


class KeypressAction(BaseAction):
    """Press the given keys."""

    type: Literal["keypress"] = "keypress"
    keys: List[str]  # Standardized on list of keys


class TypeAction(BaseAction):
    """Type the given text."""

    type: Literal["type"] = "type"
    text: str


class WaitAction(BaseAction):
    """Wait for `duration` (unit not stated in this module; presumably seconds)."""

    type: Literal["wait"] = "wait"
    duration: float = 1.0


class ScreenshotAction(BaseAction):
    """Screenshot action. Carries no parameters."""

    type: Literal["screenshot"] = "screenshot"


class MoveAction(BaseAction):
    """Move to (x, y)."""

    type: Literal["move"] = "move"
    x: float
    y: float


class DragAction(BaseAction):
    """Drag along a path of points."""

    type: Literal["drag"] = "drag"
    path: List[Point]


class MouseDownAction(BaseAction):
    """Mouse button down."""

    type: Literal["mouse_down"] = "mouse_down"
    button: Literal["left", "right", "middle"] = "left"


class MouseUpAction(BaseAction):
    """Mouse button up."""

    type: Literal["mouse_up"] = "mouse_up"
    button: Literal["left", "right", "middle"] = "left"


class HoldKeyAction(BaseAction):
    """Hold a key combination for `duration`."""

    type: Literal["hold_key"] = "hold_key"
    text: str  # xdotool style key combination string
    duration: float = 1.0


class KeyUpAction(BaseAction):
    """Key up for a single key."""

    type: Literal["key_up"] = "key_up"
    key: str


class KeyDownAction(BaseAction):
    """Key down for a single key."""

    type: Literal["key_down"] = "key_down"
    key: str


class CursorPositionAction(BaseAction):
    """Cursor position action. Carries no parameters."""

    type: Literal["cursor_position"] = "cursor_position"


class GotoAction(BaseAction):
    """Go to the given URL."""

    type: Literal["goto"] = "goto"
    url: str


class BackAction(BaseAction):
    """Back action. Carries no parameters."""

    type: Literal["back"] = "back"


class RequestUserAction(BaseAction):
    """Request user action to confirm or provide input."""

    type: Literal["request_user"] = "request_user"
    request: str


class NoopAction(BaseAction):
    """No operation action."""

    type: Literal["noop"] = "noop"


# Union of every concrete action model defined above
OperatorAction = Union[
    ClickAction,
    DoubleClickAction,
    TripleClickAction,
    ScrollAction,
    KeypressAction,
    TypeAction,
    WaitAction,
    ScreenshotAction,
    MoveAction,
    DragAction,
    MouseDownAction,
    MouseUpAction,
    HoldKeyAction,
    KeyDownAction,
    KeyUpAction,
    CursorPositionAction,
    GotoAction,
    BackAction,
    RequestUserAction,
    NoopAction,
]
@@ -0,0 +1,383 @@
1
+ import ast
2
+ import json
3
+ import logging
4
+ from copy import deepcopy
5
+ from datetime import datetime
6
+ from typing import Any, List, Optional, cast
7
+
8
+ from anthropic.types.beta import BetaContentBlock
9
+
10
+ from khoj.processor.operator.operator_actions import *
11
+ from khoj.processor.operator.operator_agent_base import (
12
+ AgentActResult,
13
+ AgentMessage,
14
+ OperatorAgent,
15
+ )
16
+ from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
17
+ from khoj.utils.helpers import get_anthropic_async_client, is_none_or_empty
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ # --- Anthropic Operator Agent ---
23
class AnthropicOperatorAgent(OperatorAgent):
    """Operator agent that asks an Anthropic model for browser actions via its 'computer' tool.

    Relies on state and helpers provided by the OperatorAgent base class, which is not
    defined in this module: `vision_model`, `messages`, `query`, `max_iterations`,
    `tracer`, `_commit_trace()` and `_update_usage()`.
    """

    async def act(self, current_state: EnvState) -> AgentActResult:
        """Ask the model what to do next given the current environment state.

        Sends the message history to the Anthropic beta messages API, records the reply in
        the history and converts every `tool_use` block of the reply into an operator action.

        Args:
            current_state: Environment state; its `url` and `screenshot` are read here.

        Returns:
            AgentActResult with the actions to run, one placeholder tool result per action
            (filled in later by `add_action_results`) and the rendered response payload.
        """
        client = get_anthropic_async_client(
            self.vision_model.ai_model_api.api_key, self.vision_model.ai_model_api.api_base_url
        )
        tool_version = "2025-01-24"
        betas = [f"computer-use-{tool_version}", "token-efficient-tools-2025-02-19"]
        temperature = 1.0
        actions: List[OperatorAction] = []
        action_results: List[dict] = []
        self._commit_trace()  # Commit trace before next action

        # Build the date without the platform specific '%-d' strftime directive
        today = datetime.today()
        current_date = f"{today:%A, %B} {today.day}, {today:%Y}"

        system_prompt = f"""<SYSTEM_CAPABILITY>
* You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
* You operate a Chromium browser using Playwright via the 'computer' tool.
* You cannot access the OS or filesystem.
* You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more.
* You can use the additional back() and goto() helper functions to ease navigating the browser. If you see nothing, try goto duckduckgo.com
* When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
* The current date is {current_date}.
* The current URL is {current_state.url}.
</SYSTEM_CAPABILITY>

<IMPORTANT>
* You are allowed upto {self.max_iterations} iterations to complete the task.
* Do not loop on wait, screenshot for too many turns without taking any action.
* After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
</IMPORTANT>
"""
        if is_none_or_empty(self.messages):
            self.messages = [AgentMessage(role="user", content=self.query)]

        tools = [
            {
                "type": "computer_20250124",
                "name": "computer",
                "display_width_px": 1024,
                "display_height_px": 768,
            },  # TODO: Get from env
            {
                "name": "back",
                "description": "Go back to the previous page.",
                "input_schema": {"type": "object", "properties": {}},
            },
            {
                "name": "goto",
                "description": "Go to a specific URL.",
                "input_schema": {
                    "type": "object",
                    "properties": {"url": {"type": "string", "description": "Fully qualified URL to navigate to."}},
                    "required": ["url"],
                },
            },
        ]

        thinking: dict[str, str | int] = {"type": "disabled"}
        if self.vision_model.name.startswith("claude-3-7"):
            thinking = {"type": "enabled", "budget_tokens": 1024}

        messages_for_api = self._format_message_for_api(self.messages)
        response = await client.beta.messages.create(
            messages=messages_for_api,
            model=self.vision_model.name,
            system=system_prompt,
            tools=tools,
            betas=betas,
            thinking=thinking,
            max_tokens=4096,  # TODO: Make configurable?
            temperature=temperature,
        )

        logger.debug(f"Anthropic response: {response.model_dump_json()}")
        self.messages.append(AgentMessage(role="assistant", content=response.content))
        rendered_response = self._render_response(response.content, current_state.screenshot)

        for block in response.content:
            if block.type != "tool_use":
                continue

            content = None
            is_error = False
            tool_input = block.input
            # The computer tool names its action inside the input; other tools are named by the block
            tool_name = block.input.get("action") if block.name == "computer" else block.name
            tool_use_id = block.id

            action_to_run: Optional[OperatorAction] = None
            try:
                action_to_run = self._to_operator_action(tool_name, tool_input)
            except Exception as e:
                error_msg = f"Error converting Anthropic action {tool_name} ({tool_input}): {e}"
                logger.error(error_msg)
                content = error_msg
                is_error = True
                action_to_run = NoopAction()

            # NOTE(review): a tool_use block that yields no action gets no tool result here.
            # Confirm the API accepts a follow up request with an unanswered tool_use block.
            if action_to_run:
                actions.append(action_to_run)
                action_results.append(
                    {
                        "type": "tool_result",
                        "tool_use_id": tool_use_id,
                        "content": content,  # Updated after environment step
                        "is_error": is_error,  # Updated after environment step
                    }
                )

        self._update_usage(
            response.usage.input_tokens,
            response.usage.output_tokens,
            response.usage.cache_read_input_tokens,
            response.usage.cache_creation_input_tokens,
        )
        self.tracer["temperature"] = temperature

        return AgentActResult(
            actions=actions,
            action_results=action_results,
            rendered_response=rendered_response,
        )

    def _to_operator_action(self, tool_name: Optional[str], tool_input: dict) -> Optional[OperatorAction]:
        """Convert one Anthropic tool call into an operator action.

        Returns None when the call is unsupported or lacks a required parameter.
        Exceptions raised while building the action propagate to the caller.
        """
        if tool_name == "mouse_move":
            coord = self.get_coordinates(tool_input)
            return MoveAction(x=coord[0], y=coord[1]) if coord else None
        if tool_name == "left_click":
            coord = self.get_coordinates(tool_input)
            if not coord:
                return None
            # The click model's field is `modifiers`. Only pass it when the tool call has one.
            modifiers = tool_input.get("text")
            if modifiers:
                return ClickAction(x=coord[0], y=coord[1], button="left", modifiers=modifiers)
            return ClickAction(x=coord[0], y=coord[1], button="left")
        if tool_name == "right_click":
            coord = self.get_coordinates(tool_input)
            return ClickAction(x=coord[0], y=coord[1], button="right") if coord else None
        if tool_name == "middle_click":
            coord = self.get_coordinates(tool_input)
            return ClickAction(x=coord[0], y=coord[1], button="middle") if coord else None
        if tool_name == "double_click":
            coord = self.get_coordinates(tool_input)
            return DoubleClickAction(x=coord[0], y=coord[1]) if coord else None
        if tool_name == "triple_click":
            coord = self.get_coordinates(tool_input)
            return TripleClickAction(x=coord[0], y=coord[1]) if coord else None
        if tool_name == "left_click_drag":
            start_coord = self.get_coordinates(tool_input, key="start_coordinate")
            end_coord = self.get_coordinates(tool_input)
            if start_coord and end_coord:
                return DragAction(path=[Point(x=p[0], y=p[1]) for p in [start_coord, end_coord]])
            return None
        if tool_name == "left_mouse_down":
            return MouseDownAction(button="left")
        if tool_name == "left_mouse_up":
            return MouseUpAction(button="left")
        if tool_name == "type":
            text = tool_input.get("text")
            return TypeAction(text=text) if text else None
        if tool_name == "scroll":
            direction = tool_input.get("scroll_direction")
            amount = int(tool_input.get("scroll_amount", 5))
            coord = self.get_coordinates(tool_input)
            if not direction:
                return None
            x = coord[0] if coord else None
            y = coord[1] if coord else None
            return ScrollAction(scroll_direction=direction, scroll_amount=amount, x=x, y=y)
        if tool_name == "key":
            text = tool_input.get("text")
            return KeypressAction(keys=text.split("+")) if text else None  # Split xdotool style
        if tool_name == "hold_key":
            text = tool_input.get("text")
            duration = tool_input.get("duration", 1.0)
            return HoldKeyAction(text=text, duration=duration) if text else None
        if tool_name == "wait":
            return WaitAction(duration=tool_input.get("duration", 1.0))
        if tool_name == "screenshot":
            return ScreenshotAction()
        if tool_name == "cursor_position":
            return CursorPositionAction()
        if tool_name == "goto":
            url = tool_input.get("url")
            if url:
                return GotoAction(url=url)
            logger.warning("Goto tool called without URL.")
            return None
        if tool_name == "back":
            return BackAction()

        logger.warning(f"Unsupported Anthropic computer action type: {tool_name}")
        return None

    def add_action_results(self, env_steps: list[EnvStepResult], agent_action: AgentActResult):
        """Fill the pending tool results with environment outcomes and add them to the history.

        Does nothing when the agent produced no tool results. Only the newest tool result
        keeps a `cache_control` marker; markers on earlier results are removed.
        """
        if not agent_action.action_results:
            return

        # Update action results with results of applying suggested actions on the environment
        for env_step, action_result in zip(env_steps, agent_action.action_results):
            result_content = env_step.error or env_step.output or "[Action completed]"
            if env_step.type == "image" and isinstance(result_content, dict):
                # Add screenshot data in anthropic message format
                action_result["content"] = [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/webp",
                            "data": result_content["image"],
                        },
                    }
                ]
            else:
                # Add text data
                action_result["content"] = result_content
            if env_step.error:
                action_result["is_error"] = True

        # Remove previous cache controls first. Doing this after marking the new result
        # would strip the marker that was just set, as the history shares these dicts.
        for msg in self.messages:
            if msg.role == "environment" and isinstance(msg.content, list):
                for block in msg.content:
                    if isinstance(block, dict) and "cache_control" in block:
                        del block["cache_control"]

        # Mark the final tool result as a cache break point
        agent_action.action_results[-1]["cache_control"] = {"type": "ephemeral"}

        # Append tool results to the message history
        self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]

    def _format_message_for_api(self, messages: list[AgentMessage]) -> list[dict]:
        """Convert agent messages into role/content dicts for the Anthropic messages API.

        The "environment" role is sent as "user". Content that is not a list is wrapped
        in a single text block.
        """
        formatted_messages = []
        for message in messages:
            role = "user" if message.role == "environment" else message.role
            if isinstance(message.content, list):
                content = message.content
            else:
                content = [{"type": "text", "text": message.content}]
            formatted_messages.append({"role": role, "content": content})
        return formatted_messages

    def compile_response(self, response_content: list[BetaContentBlock | dict] | str) -> str:
        """Compile Anthropic response into a single string."""
        if isinstance(response_content, str):
            return response_content
        elif is_none_or_empty(response_content):
            return ""
        # action results are a list dictionaries,
        # beta content blocks are objects with a type attribute
        elif isinstance(response_content[0], dict):
            return json.dumps(response_content)

        compiled_response = [""]
        for block in deepcopy(response_content):  # Work on a copy as inputs may be edited below
            block = cast(BetaContentBlock, block)  # Ensure block is of type BetaContentBlock
            if block.type == "text":
                compiled_response.append(block.text)
            elif block.type == "tool_use":
                block_input = {"action": block.name}
                if block.name == "computer":
                    block_input = block.input  # Computer action details are in input dict
                elif block.name == "goto":
                    block_input["url"] = block.input.get("url", "[Missing URL]")

                # Avoid showing large image data in compiled text log
                if isinstance(block_input, dict) and block_input.get("action") == "screenshot":
                    block_input["image"] = "[placeholder for screenshot]"
                compiled_response.append(f"**Action**: {json.dumps(block_input)}")
            elif block.type == "thinking":
                # Check if thinking content exists before appending
                thinking_content = getattr(block, "thinking", None)
                if thinking_content:
                    compiled_response.append(f"**Thought**: {thinking_content}")

        return "\n- ".join(filter(None, compiled_response))  # Filter out empty strings

    def _render_response(self, response_content: list[BetaContentBlock], screenshot: str | None) -> dict:
        """Render Anthropic response, potentially including actual screenshots.

        Returns a dict with a "text" summary of the response blocks and an "image" data
        URL. The image is None when no screenshot is given or no tool use was requested.
        """
        render_texts = []
        for block in deepcopy(response_content):  # Use deepcopy to avoid modifying original
            if block.type == "thinking":
                thinking_content = getattr(block, "thinking", None)
                if thinking_content:
                    render_texts += [f"**Thought**: {thinking_content}"]
            elif block.type == "text":
                render_texts += [block.text]
            elif block.type == "tool_use":
                if block.name == "goto":
                    render_texts += [f"Open URL: {block.input.get('url', '[Missing URL]')}"]
                elif block.name == "back":
                    render_texts += ["Go back to the previous page."]
                elif block.name == "computer":
                    block_input = block.input
                    if not isinstance(block_input, dict):
                        render_texts += [json.dumps(block_input)]
                    # Handle computer action details
                    elif "action" in block_input:
                        action = block_input["action"]
                        if action == "type":
                            text = block_input.get("text")
                            if text:
                                render_texts += [f'Type "{text}"']
                        elif action == "key":
                            text = block_input.get("text")
                            if text:
                                render_texts += [f"Press {text}"]
                        elif action == "hold_key":
                            text = block_input.get("text")
                            duration = block_input.get("duration", 1.0)
                            if text:
                                render_texts += [f"Hold {text} for {duration} seconds"]
                        else:
                            # Handle other actions
                            render_texts += [f"{action.capitalize()}"]

                # If screenshot is not available when screenshot action was requested
                if isinstance(block.input, dict) and block.input.get("action") == "screenshot" and not screenshot:
                    render_texts += ["Failed to get screenshot"]

        # Do not show screenshot if no actions requested
        if all(block.type != "tool_use" for block in response_content):
            screenshot = None

        # Create render payload
        render_payload = {
            # Combine text into a single string and filter out empty strings
            "text": "\n- ".join(filter(None, render_texts)),
            # Add screenshot data if available
            "image": f"data:image/webp;base64,{screenshot}" if screenshot else None,
        }

        return render_payload

    def get_coordinates(self, tool_input: dict, key: str = "coordinate") -> Optional[list | tuple]:
        """Get coordinates from tool input.

        Accepts a two element list/tuple, or a string holding such a literal.
        Returns None when the value is missing, unparsable or not two elements long.
        """
        raw_coord = tool_input.get(key)
        if not raw_coord:
            return None
        try:
            # literal_eval only evaluates Python literals, it does not run arbitrary code
            coord = ast.literal_eval(raw_coord) if isinstance(raw_coord, str) else raw_coord
        except (ValueError, SyntaxError):
            logger.warning(f"Could not parse coordinate from value: {raw_coord}")
            return None

        if not isinstance(coord, (list, tuple)) or len(coord) != 2:
            logger.warning(f"Parsed coordinate string '{raw_coord}' is not a 2-element list/tuple: {coord}")
            return None

        return coord