khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. khoj/database/adapters/__init__.py +17 -6
  2. khoj/interface/compiled/404/index.html +2 -2
  3. khoj/interface/compiled/_next/static/chunks/{2327-f03b2a77f67b8f8c.js → 2327-aa22697ed9c8d54a.js} +1 -1
  4. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
  5. khoj/interface/compiled/_next/static/chunks/{8515-010dd769c584b672.js → 8515-f305779d95dd5780.js} +1 -1
  6. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +1 -0
  8. khoj/interface/compiled/_next/static/chunks/app/chat/{page-14ac9d1ad5cb84c5.js → page-7e780dc11eb5e5d3.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/{webpack-1169ca6e9e7e6247.js → webpack-21f76f7f59582bc7.js} +1 -1
  10. khoj/interface/compiled/agents/index.html +2 -2
  11. khoj/interface/compiled/agents/index.txt +2 -2
  12. khoj/interface/compiled/automations/index.html +2 -2
  13. khoj/interface/compiled/automations/index.txt +2 -2
  14. khoj/interface/compiled/chat/index.html +2 -2
  15. khoj/interface/compiled/chat/index.txt +2 -2
  16. khoj/interface/compiled/index.html +2 -2
  17. khoj/interface/compiled/index.txt +2 -2
  18. khoj/interface/compiled/search/index.html +2 -2
  19. khoj/interface/compiled/search/index.txt +2 -2
  20. khoj/interface/compiled/settings/index.html +2 -2
  21. khoj/interface/compiled/settings/index.txt +2 -2
  22. khoj/interface/compiled/share/chat/index.html +2 -2
  23. khoj/interface/compiled/share/chat/index.txt +2 -2
  24. khoj/processor/conversation/anthropic/anthropic_chat.py +7 -2
  25. khoj/processor/conversation/anthropic/utils.py +37 -19
  26. khoj/processor/conversation/google/gemini_chat.py +7 -2
  27. khoj/processor/conversation/offline/chat_model.py +2 -2
  28. khoj/processor/conversation/openai/gpt.py +7 -2
  29. khoj/processor/conversation/prompts.py +13 -2
  30. khoj/processor/conversation/utils.py +34 -6
  31. khoj/processor/operator/grounding_agent.py +345 -0
  32. khoj/processor/operator/grounding_agent_uitars.py +973 -0
  33. khoj/processor/operator/operate_browser.py +165 -0
  34. khoj/processor/operator/operator_actions.py +149 -0
  35. khoj/processor/operator/operator_agent_anthropic.py +402 -0
  36. khoj/processor/operator/operator_agent_base.py +80 -0
  37. khoj/processor/operator/operator_agent_binary.py +336 -0
  38. khoj/processor/operator/operator_agent_openai.py +349 -0
  39. khoj/processor/operator/operator_environment_base.py +37 -0
  40. khoj/processor/operator/operator_environment_browser.py +395 -0
  41. khoj/routers/api_chat.py +44 -6
  42. khoj/routers/helpers.py +18 -8
  43. khoj/routers/research.py +48 -1
  44. khoj/utils/constants.py +6 -0
  45. khoj/utils/helpers.py +17 -0
  46. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/METADATA +4 -2
  47. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/RECORD +52 -42
  48. khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
  49. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +0 -1
  50. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +0 -1
  51. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → o6zlo73DbD2lS92jWHS8o}/_buildManifest.js +0 -0
  52. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → o6zlo73DbD2lS92jWHS8o}/_ssgManifest.js +0 -0
  53. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/WHEEL +0 -0
  54. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/entry_points.txt +0 -0
  55. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,402 @@
1
+ import ast
2
+ import json
3
+ import logging
4
+ from copy import deepcopy
5
+ from datetime import datetime
6
+ from typing import List, Optional, cast
7
+
8
+ from anthropic.types.beta import BetaContentBlock
9
+
10
+ from khoj.processor.conversation.anthropic.utils import is_reasoning_model
11
+ from khoj.processor.operator.operator_actions import *
12
+ from khoj.processor.operator.operator_agent_base import (
13
+ AgentActResult,
14
+ AgentMessage,
15
+ OperatorAgent,
16
+ )
17
+ from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
18
+ from khoj.utils.helpers import get_anthropic_async_client, is_none_or_empty
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ # --- Anthropic Operator Agent ---
24
+ class AnthropicOperatorAgent(OperatorAgent):
25
async def act(self, current_state: EnvState) -> AgentActResult:
    """Ask the Anthropic vision model for the next browser actions.

    Sends the message history, system prompt and tool definitions to the
    Anthropic beta messages API, records the assistant response in
    `self.messages`, and converts every `tool_use` block of the response into
    an operator action paired with a pending `tool_result` entry.

    Args:
        current_state: Current environment state. Its `url` is embedded in the
            system prompt and its `screenshot` is used to render the response.

    Returns:
        AgentActResult holding the actions to run, one placeholder tool result
        per action (filled in later by `add_action_results`) and the rendered
        response payload.
    """
    client = get_anthropic_async_client(
        self.vision_model.ai_model_api.api_key, self.vision_model.ai_model_api.api_base_url
    )
    betas = self.model_default_headers()
    temperature = 1.0
    actions: List[OperatorAction] = []
    action_results: List[dict] = []
    self._commit_trace()  # Commit trace before next action

    # Build the day of month by hand: the `%-d` strftime directive is a
    # platform specific extension and raises ValueError on Windows.
    today = datetime.today()
    current_date = f"{today.strftime('%A, %B')} {today.day}, {today.year}"

    system_prompt = f"""<SYSTEM_CAPABILITY>
* You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
* You operate a Chromium browser using Playwright via the 'computer' tool.
* You cannot access the OS or filesystem.
* You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more.
* You can use the additional back() and goto() helper functions to ease navigating the browser. If you see nothing, try goto duckduckgo.com
* When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
* The current date is {current_date}.
* The current URL is {current_state.url}.
</SYSTEM_CAPABILITY>

<IMPORTANT>
* You are allowed upto {self.max_iterations} iterations to complete the task.
* Do not loop on wait, screenshot for too many turns without taking any action.
* After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
</IMPORTANT>
"""
    if is_none_or_empty(self.messages):
        self.messages = [AgentMessage(role="user", content=self.query)]

    tools = [
        {
            "type": self.model_default_tool("computer"),
            "name": "computer",
            "display_width_px": 1024,
            "display_height_px": 768,
        },  # TODO: Get from env
        {
            "name": "back",
            "description": "Go back to the previous page.",
            "input_schema": {"type": "object", "properties": {}},
        },
        {
            "name": "goto",
            "description": "Go to a specific URL.",
            "input_schema": {
                "type": "object",
                "properties": {"url": {"type": "string", "description": "Fully qualified URL to navigate to."}},
                "required": ["url"],
            },
        },
    ]

    thinking: dict[str, str | int] = {"type": "disabled"}
    if is_reasoning_model(self.vision_model.name):
        thinking = {"type": "enabled", "budget_tokens": 1024}

    messages_for_api = self._format_message_for_api(self.messages)
    response = await client.beta.messages.create(
        messages=messages_for_api,
        model=self.vision_model.name,
        system=system_prompt,
        tools=tools,
        betas=betas,
        thinking=thinking,
        max_tokens=4096,  # TODO: Make configurable?
        temperature=temperature,
    )

    logger.debug(f"Anthropic response: {response.model_dump_json()}")
    self.messages.append(AgentMessage(role="assistant", content=response.content))
    rendered_response = self._render_response(response.content, current_state.screenshot)

    for block in response.content:
        if block.type != "tool_use":
            continue

        content = None
        is_error = False
        tool_input = block.input
        # The 'computer' tool multiplexes many actions via its "action" input,
        # the helper tools (back, goto) are identified by their own name.
        tool_name = block.input.get("action") if block.name == "computer" else block.name

        try:
            action_to_run = self._to_operator_action(tool_name, tool_input)
        except Exception as e:
            error_msg = f"Error converting Anthropic action {tool_name} ({tool_input}): {e}"
            logger.error(error_msg)
            content = error_msg
            is_error = True
            action_to_run = NoopAction()

        if action_to_run is None:
            # Every tool_use block needs a matching tool_result in the next
            # request. Report the unusable tool call back to the model
            # instead of silently dropping it.
            content = f"Could not run Anthropic action {tool_name} ({tool_input}): unsupported action or missing input."
            is_error = True
            action_to_run = NoopAction()

        actions.append(action_to_run)
        action_results.append(
            {
                "type": "tool_result",
                "tool_use_id": block.id,
                "content": content,  # Updated after environment step
                "is_error": is_error,  # Updated after environment step
            }
        )

    self._update_usage(
        response.usage.input_tokens,
        response.usage.output_tokens,
        response.usage.cache_read_input_tokens,
        response.usage.cache_creation_input_tokens,
    )
    self.tracer["temperature"] = temperature

    return AgentActResult(
        actions=actions,
        action_results=action_results,
        rendered_response=rendered_response,
    )

def _to_operator_action(self, tool_name: Optional[str], tool_input: dict) -> Optional[OperatorAction]:
    """Translate a single Anthropic tool call into an operator action.

    Returns None when the tool is not supported or a required input
    (coordinate, text, url, scroll direction) is missing. Exceptions raised
    while building the action are left for the caller to handle.
    """
    if tool_name == "mouse_move":
        coord = self.get_coordinates(tool_input)
        return MoveAction(x=coord[0], y=coord[1]) if coord else None
    if tool_name == "left_click":
        coord = self.get_coordinates(tool_input)
        if not coord:
            return None
        # Anthropic passes held modifier keys for a click in the "text" input
        return ClickAction(x=coord[0], y=coord[1], button="left", modifier=tool_input.get("text"))
    if tool_name in ("right_click", "middle_click"):
        coord = self.get_coordinates(tool_input)
        button = "right" if tool_name == "right_click" else "middle"
        return ClickAction(x=coord[0], y=coord[1], button=button) if coord else None
    if tool_name == "double_click":
        coord = self.get_coordinates(tool_input)
        return DoubleClickAction(x=coord[0], y=coord[1]) if coord else None
    if tool_name == "triple_click":
        coord = self.get_coordinates(tool_input)
        return TripleClickAction(x=coord[0], y=coord[1]) if coord else None
    if tool_name == "left_click_drag":
        start_coord = self.get_coordinates(tool_input, key="start_coordinate")
        end_coord = self.get_coordinates(tool_input)
        if start_coord and end_coord:
            return DragAction(path=[Point(x=p[0], y=p[1]) for p in [start_coord, end_coord]])
        return None
    if tool_name == "left_mouse_down":
        return MouseDownAction(button="left")
    if tool_name == "left_mouse_up":
        return MouseUpAction(button="left")
    if tool_name == "type":
        text = tool_input.get("text")
        return TypeAction(text=text) if text else None
    if tool_name == "scroll":
        direction = tool_input.get("scroll_direction")
        amount = int(tool_input.get("scroll_amount", 5))
        coord = self.get_coordinates(tool_input)
        x = coord[0] if coord else None
        y = coord[1] if coord else None
        if direction:
            return ScrollAction(scroll_direction=direction, scroll_amount=amount, x=x, y=y)
        return None
    if tool_name == "key":
        text = tool_input.get("text")
        return KeypressAction(keys=text.split("+")) if text else None  # Split xdotool style
    if tool_name == "hold_key":
        text = tool_input.get("text")
        duration = tool_input.get("duration", 1.0)
        return HoldKeyAction(text=text, duration=duration) if text else None
    if tool_name == "wait":
        return WaitAction(duration=tool_input.get("duration", 1.0))
    if tool_name == "screenshot":
        return ScreenshotAction()
    if tool_name == "cursor_position":
        return CursorPositionAction()
    if tool_name == "goto":
        url = tool_input.get("url")
        if url:
            return GotoAction(url=url)
        logger.warning("Goto tool called without URL.")
        return None
    if tool_name == "back":
        return BackAction()

    logger.warning(f"Unsupported Anthropic computer action type: {tool_name}")
    return None
216
+
217
def add_action_results(self, env_steps: list[EnvStepResult], agent_action: AgentActResult):
    """Record the environment's results for the agent's last actions.

    Fills each pending tool result in `agent_action.action_results` with the
    error, output or screenshot of the matching environment step, then appends
    the tool results to the message history as an "environment" message.
    Only the newest tool result carries the prompt cache break point.

    Args:
        env_steps: Results of running the agent's actions, in action order.
        agent_action: The act result whose `action_results` are updated in place.
    """
    if not agent_action.action_results:
        return

    # Update action results with results of applying suggested actions on the environment
    for idx, env_step in enumerate(env_steps):
        action_result = agent_action.action_results[idx]
        result_content = env_step.error or env_step.output or "[Action completed]"
        if env_step.type == "image" and isinstance(result_content, dict):
            # Add screenshot data in anthropic message format
            action_result["content"] = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/webp",
                        "data": result_content["image"],
                    },
                }
            ]
        else:
            # Add text data
            action_result["content"] = result_content
        if env_step.error:
            action_result["is_error"] = True

    # Remove cache controls left on earlier tool results. This has to happen
    # before the new break point is set and the new message is appended,
    # otherwise the fresh marker would be stripped together with the stale ones.
    for msg in self.messages:
        if msg.role != "environment" or not isinstance(msg.content, list):
            continue
        for block in msg.content:
            if isinstance(block, dict):
                block.pop("cache_control", None)

    # Mark the final tool result as a cache break point
    agent_action.action_results[-1]["cache_control"] = {"type": "ephemeral"}

    # Append tool results to the message history
    self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]
254
+
255
+ def _format_message_for_api(self, messages: list[AgentMessage]) -> list[dict]:
256
+ """Format Anthropic response into a single string."""
257
+ formatted_messages = []
258
+ for message in messages:
259
+ role = "user" if message.role == "environment" else message.role
260
+ content = (
261
+ [{"type": "text", "text": message.content}]
262
+ if not isinstance(message.content, list)
263
+ else message.content
264
+ )
265
+ formatted_messages.append(
266
+ {
267
+ "role": role,
268
+ "content": content,
269
+ }
270
+ )
271
+ return formatted_messages
272
+
273
def compile_response(self, response_content: list[BetaContentBlock | dict] | str) -> str:
    """Flatten message content into a single string for logs and traces.

    Strings are returned as is and empty content yields an empty string.
    A list of dicts (tool results) is serialized to JSON. A list of Anthropic
    content blocks is rendered as text, action and thought entries joined
    into a bulleted list.
    """
    if isinstance(response_content, str):
        return response_content
    if is_none_or_empty(response_content):
        return ""
    # Tool results are plain dicts, content blocks are objects with a type attribute
    if isinstance(response_content[0], dict):
        return json.dumps(response_content)

    entries = []
    # Work on a copy as the screenshot placeholder below mutates the tool input
    for raw_block in deepcopy(response_content):
        block = cast(BetaContentBlock, raw_block)
        if block.type == "text":
            entries.append(block.text)
        elif block.type == "thinking":
            thought = getattr(block, "thinking", None)
            if thought:
                entries.append(f"**Thought**: {thought}")
        elif block.type == "tool_use":
            if block.name == "computer":
                # Computer action details are in the input dict
                payload = block.input
            else:
                payload = {"action": block.name}
                if block.name == "goto":
                    payload["url"] = block.input.get("url", "[Missing URL]")

            # Avoid showing large image data in compiled text log
            if isinstance(payload, dict) and payload.get("action") == "screenshot":
                payload["image"] = "[placeholder for screenshot]"
            entries.append(f"**Action**: {json.dumps(payload)}")

    # Drop empty entries before joining
    return "\n- ".join(filter(None, entries))
309
+
310
+ def _render_response(self, response_content: list[BetaContentBlock], screenshot: str | None) -> dict:
311
+ """Render Anthropic response, potentially including actual screenshots."""
312
+ render_texts = []
313
+ for block in deepcopy(response_content): # Use deepcopy to avoid modifying original
314
+ if block.type == "thinking":
315
+ thinking_content = getattr(block, "thinking", None)
316
+ if thinking_content:
317
+ render_texts += [f"**Thought**: {thinking_content}"]
318
+ elif block.type == "text":
319
+ render_texts += [block.text]
320
+ elif block.type == "tool_use":
321
+ if block.name == "goto":
322
+ render_texts += [f"Open URL: {block.input.get('url', '[Missing URL]')}"]
323
+ elif block.name == "back":
324
+ render_texts += ["Go back to the previous page."]
325
+ elif block.name == "computer":
326
+ block_input = block.input
327
+ if not isinstance(block_input, dict):
328
+ render_texts += [json.dumps(block_input)]
329
+ # Handle computer action details
330
+ elif "action" in block_input:
331
+ action = block_input["action"]
332
+ if action == "type":
333
+ text: str = block_input.get("text")
334
+ if text:
335
+ render_texts += [f'Type "{text}"']
336
+ elif action == "key":
337
+ text = block_input.get("text")
338
+ if text:
339
+ render_texts += [f"Press {text}"]
340
+ elif action == "hold_key":
341
+ text = block_input.get("text")
342
+ duration = block_input.get("duration", 1.0)
343
+ if text:
344
+ render_texts += [f"Hold {text} for {duration} seconds"]
345
+ else:
346
+ # Handle other actions
347
+ render_texts += [f"{action.capitalize()}"]
348
+
349
+ # If screenshot is not available when screenshot action was requested
350
+ if isinstance(block.input, dict) and block.input.get("action") == "screenshot" and not screenshot:
351
+ render_texts += ["Failed to get screenshot"]
352
+
353
+ # Do not show screenshot if no actions requested
354
+ if all([block.type != "tool_use" for block in response_content]):
355
+ # If all blocks are not tool_use, return None
356
+ screenshot = None
357
+
358
+ # Create render payload
359
+ render_payload = {
360
+ # Combine text into a single string and filter out empty strings
361
+ "text": "\n- ".join(filter(None, render_texts)),
362
+ # Add screenshot data if available
363
+ "image": f"data:image/webp;base64,{screenshot}" if screenshot else None,
364
+ }
365
+
366
+ return render_payload
367
+
368
def get_coordinates(self, tool_input: dict, key: str = "coordinate") -> Optional[list | tuple]:
    """Read a 2-element coordinate from the tool input.

    The value under `key` may be a list/tuple or its string literal form.
    Returns None when the value is missing, cannot be parsed or is not a
    2-element list/tuple.
    """
    raw_coord = tool_input.get(key)
    if not raw_coord:
        return None

    coord = raw_coord
    if isinstance(raw_coord, str):
        # Coordinates may arrive serialized, e.g. "[100, 200]"
        try:
            coord = ast.literal_eval(raw_coord)
        except (ValueError, SyntaxError):
            logger.warning(f"Could not parse coordinate from value: {raw_coord}")
            return None

    if isinstance(coord, (list, tuple)) and len(coord) == 2:
        return coord

    logger.warning(f"Parsed coordinate string '{raw_coord}' is not a 2-element list/tuple: {coord}")
    return None
384
+
385
def model_default_tool(self, tool_type: Literal["computer", "editor", "terminal"]) -> str:
    """Return the Anthropic tool version to use for the vision model.

    Only the "computer" tool of the claude-3-7-sonnet, claude-sonnet-4 and
    claude-opus-4 model families is supported.

    Raises:
        ValueError: For any other model and tool type combination.
    """
    model_name = self.vision_model.name
    computer_use_models = ("claude-3-7-sonnet", "claude-sonnet-4", "claude-opus-4")
    if model_name.startswith(computer_use_models) and tool_type == "computer":
        return "computer_20250124"
    raise ValueError(f"Unsupported tool type for model '{model_name}': {tool_type}")
394
+
395
def model_default_headers(self) -> list[str]:
    """Return the beta feature flags to send for the vision model.

    Models outside the claude-3-7-sonnet, claude-sonnet-4 and claude-opus-4
    families get no beta flags.
    """
    model_name = self.vision_model.name
    if model_name.startswith("claude-3-7-sonnet"):
        return ["computer-use-2025-01-24", "token-efficient-tools-2025-02-19"]
    if model_name.startswith(("claude-sonnet-4", "claude-opus-4")):
        return ["computer-use-2025-01-24"]
    return []
@@ -0,0 +1,80 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from typing import List, Literal, Optional, Union
4
+
5
+ from pydantic import BaseModel
6
+
7
+ from khoj.database.models import ChatModel
8
+ from khoj.processor.conversation.utils import commit_conversation_trace
9
+ from khoj.processor.operator.operator_actions import OperatorAction
10
+ from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
11
+ from khoj.utils.helpers import get_chat_usage_metrics, is_promptrace_enabled
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class AgentActResult(BaseModel):
    """Outcome of a single operator agent step."""

    # Actions the agent wants applied to the environment
    actions: List[OperatorAction] = []
    # One result entry per action, in a model-specific format
    action_results: List[dict] = []
    # Display payload for the agent response, if any
    rendered_response: Optional[dict] = None
20
+
21
+
22
class AgentMessage(BaseModel):
    """A single entry in the operator agent's message history."""

    # Who produced the message
    role: Literal["user", "assistant", "system", "environment"]
    # Plain text, or a list of model-specific content blocks
    content: Union[str, List]
25
+
26
+
27
class OperatorAgent(ABC):
    """Base class for agents that operate an environment to answer a query.

    Subclasses implement the model-specific parts: choosing the next actions,
    tracking their results and converting messages to and from the model's
    format. The base class keeps the message history and the usage/trace
    bookkeeping.
    """

    def __init__(self, query: str, vision_model: ChatModel, max_iterations: int, tracer: dict):
        """Store the task, the model to use, the iteration budget and the tracer."""
        self.query = query
        self.vision_model = vision_model
        self.max_iterations = max_iterations
        self.tracer = tracer
        self.messages: List[AgentMessage] = []

    @abstractmethod
    async def act(self, current_state: EnvState) -> AgentActResult:
        """Decide the next actions to take given the current environment state."""
        pass

    @abstractmethod
    def add_action_results(self, env_steps: list[EnvStepResult], agent_action: AgentActResult) -> None:
        """Track results of agent actions on the environment."""
        pass

    async def summarize(self, summarize_prompt: str, current_state: EnvState) -> str:
        """Summarize the agent's actions and results.

        Appends the prompt as a user message, runs one more `act` step and
        returns the compiled content of the last message in the history.
        """
        self.messages.append(AgentMessage(role="user", content=summarize_prompt))
        await self.act(current_state)
        if self.messages:
            return self.compile_response(self.messages[-1].content)
        return "No actions to summarize."

    @abstractmethod
    def compile_response(self, response: List | str) -> str:
        """Compile message content into a single string."""
        pass

    @abstractmethod
    def _render_response(self, response: List, screenshot: Optional[str]) -> dict:
        """Render the model response, with the screenshot when provided, for display."""
        pass

    @abstractmethod
    def _format_message_for_api(self, message: AgentMessage) -> List:
        """Convert agent messages into the format expected by the model API."""
        pass

    def _update_usage(self, input_tokens: int, output_tokens: int, cache_read: int = 0, cache_write: int = 0):
        """Store updated usage metrics for the vision model in the tracer."""
        previous_usage = self.tracer.get("usage")
        self.tracer["usage"] = get_chat_usage_metrics(
            self.vision_model.name, input_tokens, output_tokens, cache_read, cache_write, usage=previous_usage
        )
        logger.debug(f"Operator usage by {self.vision_model.model_type}: {self.tracer['usage']}")

    def _commit_trace(self):
        """Record the chat model in the tracer and commit the conversation trace.

        The trace is only committed when prompt tracing is enabled and the
        history holds more than one message.
        """
        self.tracer["chat_model"] = self.vision_model.name
        if not is_promptrace_enabled() or len(self.messages) <= 1:
            return

        compiled_messages = [
            AgentMessage(role=msg.role, content=self.compile_response(msg.content)) for msg in self.messages
        ]
        # The last message is committed as the response to the ones before it
        *history, latest = compiled_messages
        commit_conversation_trace(history, latest.content, self.tracer)

    def reset(self):
        """Reset the agent state."""
        self.messages = []