khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev90__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. khoj/database/adapters/__init__.py +1 -1
  2. khoj/interface/compiled/404/index.html +2 -2
  3. khoj/interface/compiled/_next/static/chunks/{2327-f03b2a77f67b8f8c.js → 2327-aa22697ed9c8d54a.js} +1 -1
  4. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
  5. khoj/interface/compiled/_next/static/chunks/{8515-010dd769c584b672.js → 8515-f305779d95dd5780.js} +1 -1
  6. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/agents/{page-ceeb9a91edea74ce.js → page-996513ae80f8720c.js} +1 -1
  8. khoj/interface/compiled/_next/static/chunks/app/automations/{page-e3cb78747ab98cc7.js → page-2320231573aa9a49.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/app/chat/{page-14ac9d1ad5cb84c5.js → page-6257055246cdebd5.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/{page-a4053e1bb578b2ce.js → page-d9a2e44bbcf49f82.js} +1 -1
  12. khoj/interface/compiled/_next/static/chunks/app/search/layout-f5881c7ae3ba0795.js +1 -0
  13. khoj/interface/compiled/_next/static/chunks/app/search/{page-8973da2f4c076fe1.js → page-31452bbda0e0a56f.js} +1 -1
  14. khoj/interface/compiled/_next/static/chunks/app/settings/{page-375136dbb400525b.js → page-fdb72b15ca908b43.js} +1 -1
  15. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-abb6c5f4239ad7be.js +1 -0
  16. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-384b54fc953b18f2.js → page-5b7cb35d835af900.js} +1 -1
  17. khoj/interface/compiled/_next/static/chunks/{webpack-1169ca6e9e7e6247.js → webpack-e091508620cb8aef.js} +1 -1
  18. khoj/interface/compiled/_next/static/css/{fca983d49c3dd1a3.css → 0db53bacf81896f5.css} +1 -1
  19. khoj/interface/compiled/_next/static/css/55d4a822f8d94b67.css +1 -0
  20. khoj/interface/compiled/agents/index.html +2 -2
  21. khoj/interface/compiled/agents/index.txt +2 -2
  22. khoj/interface/compiled/automations/index.html +2 -2
  23. khoj/interface/compiled/automations/index.txt +3 -3
  24. khoj/interface/compiled/chat/index.html +2 -2
  25. khoj/interface/compiled/chat/index.txt +2 -2
  26. khoj/interface/compiled/index.html +2 -2
  27. khoj/interface/compiled/index.txt +2 -2
  28. khoj/interface/compiled/search/index.html +2 -2
  29. khoj/interface/compiled/search/index.txt +2 -2
  30. khoj/interface/compiled/settings/index.html +2 -2
  31. khoj/interface/compiled/settings/index.txt +4 -4
  32. khoj/interface/compiled/share/chat/index.html +2 -2
  33. khoj/interface/compiled/share/chat/index.txt +2 -2
  34. khoj/processor/conversation/anthropic/anthropic_chat.py +5 -0
  35. khoj/processor/conversation/google/gemini_chat.py +5 -0
  36. khoj/processor/conversation/openai/gpt.py +5 -0
  37. khoj/processor/conversation/prompts.py +12 -1
  38. khoj/processor/conversation/utils.py +12 -0
  39. khoj/processor/operator/grounding_agent.py +345 -0
  40. khoj/processor/operator/grounding_agent_uitars.py +973 -0
  41. khoj/processor/operator/operate_browser.py +152 -0
  42. khoj/processor/operator/operator_actions.py +149 -0
  43. khoj/processor/operator/operator_agent_anthropic.py +383 -0
  44. khoj/processor/operator/operator_agent_base.py +80 -0
  45. khoj/processor/operator/operator_agent_binary.py +336 -0
  46. khoj/processor/operator/operator_agent_openai.py +349 -0
  47. khoj/processor/operator/operator_environment_base.py +37 -0
  48. khoj/processor/operator/operator_environment_browser.py +395 -0
  49. khoj/routers/api_chat.py +42 -3
  50. khoj/routers/helpers.py +14 -3
  51. khoj/routers/research.py +48 -1
  52. khoj/utils/helpers.py +17 -0
  53. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/METADATA +3 -1
  54. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/RECORD +65 -55
  55. khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
  56. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +0 -1
  57. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +0 -1
  58. khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +0 -1
  59. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +0 -1
  60. khoj/interface/compiled/_next/static/css/f29752d6e1be7624.css +0 -1
  61. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → WLmcH2J-wz36GlS6O8HSL}/_buildManifest.js +0 -0
  62. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → WLmcH2J-wz36GlS6O8HSL}/_ssgManifest.js +0 -0
  63. /khoj/interface/compiled/_next/static/chunks/{1915-ab4353eaca76f690.js → 1915-1943ee8a628b893c.js} +0 -0
  64. /khoj/interface/compiled/_next/static/chunks/{2117-1c18aa2098982bf9.js → 2117-5a41630a2bd2eae8.js} +0 -0
  65. /khoj/interface/compiled/_next/static/chunks/{4363-4efaf12abe696251.js → 4363-e6ac2203564d1a3b.js} +0 -0
  66. /khoj/interface/compiled/_next/static/chunks/{4447-5d44807c40355b1a.js → 4447-e038b251d626c340.js} +0 -0
  67. /khoj/interface/compiled/_next/static/chunks/{8667-adbe6017a66cef10.js → 8667-8136f74e9a086fca.js} +0 -0
  68. /khoj/interface/compiled/_next/static/chunks/{9259-d8bcd9da9e80c81e.js → 9259-640fdd77408475df.js} +0 -0
  69. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/WHEEL +0 -0
  70. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/entry_points.txt +0 -0
  71. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,80 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from typing import List, Literal, Optional, Union
4
+
5
+ from pydantic import BaseModel
6
+
7
+ from khoj.database.models import ChatModel
8
+ from khoj.processor.conversation.utils import commit_conversation_trace
9
+ from khoj.processor.operator.operator_actions import OperatorAction
10
+ from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
11
+ from khoj.utils.helpers import get_chat_usage_metrics, is_promptrace_enabled
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class AgentActResult(BaseModel):
    """Result returned by an operator agent's `act` step."""

    # Operator actions produced by the agent for this step (empty by default)
    actions: List[OperatorAction] = []
    # Per-step result records; the dict layout is chosen by the concrete agent
    action_results: List[dict] = []  # Model-specific format
    # Optional payload used to display this step; None when there is nothing to render
    rendered_response: Optional[dict] = None
20
+
21
+
22
class AgentMessage(BaseModel):
    """A single entry in an operator agent's message history."""

    # Who produced the message; restricted to these four roles
    role: Literal["user", "assistant", "system", "environment"]
    # Either plain text or a list of content parts (part layout is agent-specific)
    content: Union[str, List]
25
+
26
+
27
class OperatorAgent(ABC):
    """
    Abstract base class for operator agents.

    Holds the user query, the chat model used for tracking, the iteration budget,
    the tracer dict and the running message history. Subclasses supply the
    model-specific behaviour through the abstract methods below.
    """

    def __init__(self, query: str, vision_model: ChatModel, max_iterations: int, tracer: dict):
        self.messages: List[AgentMessage] = []
        self.tracer = tracer
        self.max_iterations = max_iterations
        self.vision_model = vision_model
        self.query = query

    @abstractmethod
    async def act(self, current_state: EnvState) -> AgentActResult:
        """Produce the agent's result for the given environment state. Implemented by subclasses."""

    @abstractmethod
    def add_action_results(self, env_steps: list[EnvStepResult], agent_action: AgentActResult) -> None:
        """Track results of agent actions on the environment."""

    async def summarize(self, summarize_prompt: str, current_state: EnvState) -> str:
        """Summarize the agent's actions and results."""
        # Queue the summarize prompt as a user turn, then let the agent respond to it
        prompt_message = AgentMessage(role="user", content=summarize_prompt)
        self.messages.append(prompt_message)
        await self.act(current_state)

        if self.messages:
            latest_message = self.messages[-1]
            return self.compile_response(latest_message.content)
        return "No actions to summarize."

    @abstractmethod
    def compile_response(self, response: List | str) -> str:
        """Convert message content into a string. Implemented by subclasses."""

    @abstractmethod
    def _render_response(self, response: List, screenshot: Optional[str]) -> dict:
        """Build the render payload for a response. Implemented by subclasses."""

    @abstractmethod
    def _format_message_for_api(self, message: AgentMessage) -> List:
        """Convert agent message(s) into the model API format. Implemented by subclasses."""

    def _update_usage(self, input_tokens: int, output_tokens: int, cache_read: int = 0, cache_write: int = 0):
        """Fold the given token counts into the usage metrics stored on the tracer."""
        usage_so_far = self.tracer.get("usage")
        model_name = self.vision_model.name
        self.tracer["usage"] = get_chat_usage_metrics(
            model_name, input_tokens, output_tokens, cache_read, cache_write, usage=usage_so_far
        )
        logger.debug(f"Operator usage by {self.vision_model.model_type}: {self.tracer['usage']}")

    def _commit_trace(self):
        """Record the chat model on the tracer and, when prompt tracing is enabled, commit the conversation."""
        self.tracer["chat_model"] = self.vision_model.name
        if not is_promptrace_enabled() or len(self.messages) <= 1:
            return

        compiled_messages = []
        for msg in self.messages:
            compiled_messages.append(AgentMessage(role=msg.role, content=self.compile_response(msg.content)))

        # Everything but the last message is the session; the last message is the response
        *session, response = compiled_messages
        commit_conversation_trace(session, response.content, self.tracer)

    def reset(self):
        """Reset the agent state."""
        self.messages = []
@@ -0,0 +1,336 @@
1
+ import json
2
+ import logging
3
+ from datetime import datetime
4
+ from typing import List, Optional
5
+
6
+ from openai.types.chat import ChatCompletion
7
+
8
+ from khoj.database.models import ChatModel
9
+ from khoj.processor.conversation.utils import construct_structured_message
10
+ from khoj.processor.operator.grounding_agent import GroundingAgent
11
+ from khoj.processor.operator.grounding_agent_uitars import GroundingAgentUitars
12
+ from khoj.processor.operator.operator_actions import *
13
+ from khoj.processor.operator.operator_agent_base import (
14
+ AgentActResult,
15
+ AgentMessage,
16
+ OperatorAgent,
17
+ )
18
+ from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
19
+ from khoj.routers.helpers import send_message_to_model_wrapper
20
+ from khoj.utils.helpers import get_openai_async_client, is_none_or_empty
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ # --- Binary Operator Agent ---
26
+ class BinaryOperatorAgent(OperatorAgent):
27
+ """
28
+ An OperatorAgent that uses two LLMs:
29
+ 1. Reasoning LLM: Determines the next high-level action based on the objective and current visual reasoning trajectory.
30
+ 2. Grounding LLM: Converts the high-level action into specific, executable browser actions.
31
+ """
32
+
33
def __init__(
    self,
    query: str,
    reasoning_model: ChatModel,
    grounding_model: ChatModel,
    max_iterations: int,
    tracer: dict,
):
    """
    Set up the reasoning model and build the grounding agent.

    The grounding agent variant is chosen from the grounding model's name:
    names containing "ui-tars-1.5" get a GroundingAgentUitars, all others a GroundingAgent.
    """
    # Use reasoning model for primary tracking
    super().__init__(query, reasoning_model, max_iterations, tracer)
    self.reasoning_model = reasoning_model
    self.grounding_model = grounding_model

    # Initialize openai api compatible client for grounding model
    grounding_api = grounding_model.ai_model_api
    grounding_client = get_openai_async_client(grounding_api.api_key, grounding_api.api_base_url)

    grounding_model_name = grounding_model.name
    self.grounding_agent: GroundingAgent | GroundingAgentUitars = None
    if "ui-tars-1.5" not in grounding_model_name:
        self.grounding_agent = GroundingAgent(grounding_model_name, grounding_client, max_iterations, tracer=tracer)
    else:
        self.grounding_agent = GroundingAgentUitars(
            grounding_model_name, grounding_client, max_iterations, environment_type="web", tracer=tracer
        )
56
+
57
async def act(self, current_state: EnvState) -> AgentActResult:
    """
    Uses a two-step LLM process to determine and structure the next action.

    The reasoning step decides what to do next. When it reports an error or
    completion, an action-less result carrying its message is returned.
    Otherwise the grounding step turns the instruction into structured actions.
    """
    # Commit trace before next action
    self._commit_trace()

    # --- Step 1: Reasoning LLM determines high-level action ---
    reasoner_response = await self.act_reason(current_state)
    natural_language_action = reasoner_response["message"]
    response_type = reasoner_response["type"]

    if response_type in ("error", "done"):
        # Only errors are logged; both outcomes end the step without any actions
        if response_type == "error":
            logger.error(natural_language_action)
        return AgentActResult(
            actions=[],
            action_results=[],
            rendered_response={"text": natural_language_action, "image": None},
        )

    # --- Step 2: Grounding LLM converts NL action to structured action ---
    return await self.act_ground(natural_language_action, current_state)
82
+
83
async def act_reason(self, current_state: EnvState) -> dict[str, str]:
    """
    Uses the reasoning LLM to determine the next high-level action based on the operation trajectory.

    Returns a dict with two keys:
      - "type": "action" (message holds the instruction for the grounding step),
        "done" (the reasoner's response ended with DONE) or "error" (the model call
        failed or returned an empty / non-string response).
      - "message": the instruction, a completion note, or the error description.

    On success the query message and the reasoner's reply are appended to the history.
    On failure the history is left as it was before the call.
    """
    # Build the date without the platform-specific "%-d" directive, which raises on Windows.
    # The day is still rendered without zero padding, e.g. "Monday, June 2, 2025".
    today = datetime.today()
    current_date = f"{today:%A, %B} {today.day}, {today:%Y}"

    reasoning_system_prompt = f"""
# Introduction
* You are Khoj, a smart and resourceful web browsing assistant. You help the user accomplish their task using a web browser.
* You are given the user's query and screenshots of the browser's state transitions.
* The current date is {current_date}.
* The current URL is {current_state.url}.

# Your Task
* First look at the screenshots carefully to notice all pertinent information.
* Then instruct a tool AI to perform the next action that will help you progress towards the user's goal.
* Make sure you scroll down to see everything before deciding something isn't available.
* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
* Use your creativity to find alternate ways to make progress if you get stuck at any point.

# Tool AI Capabilities
* The tool AI only has access to the current screenshot and your instructions. It uses your instructions to perform the next action on the page.
* It can interact with the web browser with these actions: click, right click, double click, type, scroll, drag, wait, goto url and go back to previous page.
* It cannot access the OS, filesystem or application window. It just controls a single Chromium browser tab via Playwright.

# IMPORTANT
* You are allowed upto {self.max_iterations} iterations to complete the task.
* To navigate to a specific URL, put "GOTO <URL>" (without quotes) on the last line of your response.
* To navigate back to the previous page, end your response with "BACK" (without quotes).
* Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).

# Examples
## Example 1
GOTO https://example.com
## Example 2
click the blue login button located at the top right corner
## Example 3
scroll down the page
## Example 4
type the username example@email.com into the input field labeled Username
## Example 5
DONE

# Instructions
Now describe a single high-level action to take next to progress towards the user's goal in detail.
Focus on the visual action and provide all necessary context.
""".strip()

    popped_from_history = False
    if is_none_or_empty(self.messages):
        # First turn: the query is the main objective plus the current screenshot
        query_text = f"**Main Objective**: {self.query}"
        query_screenshot = [f"data:image/webp;base64,{current_state.screenshot}"]
        first_message_content = construct_structured_message(
            message=query_text,
            images=query_screenshot,
            model_type=self.reasoning_model.model_type,
            vision_enabled=True,
        )
        current_message = AgentMessage(role="user", content=first_message_content)
    else:
        # Later turns: the latest history message becomes the query, the rest is chat history
        current_message = self.messages.pop()
        popped_from_history = True
        query_text = self._get_message_text(current_message)
        query_screenshot = self._get_message_images(current_message)

    # Construct input for visual reasoner history
    visual_reasoner_history = {"chat": self._format_message_for_api(self.messages)}
    try:
        natural_language_action = await send_message_to_model_wrapper(
            query=query_text,
            query_images=query_screenshot,
            system_message=reasoning_system_prompt,
            conversation_log=visual_reasoner_history,
            agent_chat_model=self.reasoning_model,
            tracer=self.tracer,
        )

        if not isinstance(natural_language_action, str) or not natural_language_action.strip():
            raise ValueError(f"Natural language action is empty or not a string. Got {natural_language_action}")
    except Exception as e:
        # Put the popped message back so a failed call does not drop it from the history
        if popped_from_history:
            self.messages.append(current_message)
        logger.error(f"Error calling Reasoning LLM: {e}", exc_info=True)
        return {"type": "error", "message": f"Error calling Reasoning LLM: {e}"}

    self.messages.append(current_message)
    self.messages.append(AgentMessage(role="assistant", content=natural_language_action))

    if natural_language_action.strip().endswith("DONE"):
        return {"type": "done", "message": "Completed task."}

    logger.info(f"Reasoning LLM suggested action: {natural_language_action}")
    return {"type": "action", "message": natural_language_action}
172
+
173
async def act_ground(self, action_instruction: str, current_state: EnvState) -> AgentActResult:
    """
    Uses the grounding LLM to convert the high-level action into structured browser actions.

    Returns an AgentActResult with the grounded actions, a placeholder result per
    rendered action, and a render payload listing the thoughts and actions.
    If the grounding call raises, the error is rendered and no actions are returned.
    """
    actions: List[OperatorAction] = []
    action_results: List[dict] = []
    rendered_parts = [f"**Thought (Vision)**: {action_instruction}"]

    try:
        grounding_response, actions = await self.grounding_agent.act(action_instruction, current_state)
        trimmed_response = grounding_response.strip()

        # Process grounding response
        if trimmed_response.endswith("DONE"):
            # Ignore DONE response by the grounding agent. Reasoning agent handles termination.
            actions.append(WaitAction(duration=1.0))
            rendered_parts.append("Nothing to do.")
        elif trimmed_response.endswith("FAIL"):
            # Ignore FAIL response by the grounding agent. Reasoning agent handles termination.
            actions.append(WaitAction(duration=1.0))
            rendered_parts.append("Could not process response.")
        else:
            grounding_thoughts = grounding_response.rsplit("\nAction: ", 1)[0]
            rendered_parts.append(f"**Thought (Grounding)**: {grounding_thoughts}")
            for action in actions:
                action_type = action.type
                if action_type == "type":
                    rendered_action = f'**Action**: Type "{action.text}"'
                elif action_type == "keypress":
                    rendered_action = f'**Action**: Press "{action.keys}"'
                elif action_type == "hold_key":
                    rendered_action = f'**Action**: Hold "{action.text}" for {action.duration} seconds'
                elif action_type == "key_up":
                    rendered_action = f'**Action**: Release Key "{action.key}"'
                elif action_type == "key_down":
                    rendered_action = f'**Action**: Press Key "{action.key}"'
                elif action_type == "screenshot" and not current_state.screenshot:
                    rendered_action = "**Error**: Failed to take screenshot"
                elif action_type == "goto":
                    rendered_action = f"**Action**: Open URL {action.url}"
                else:
                    rendered_action = f"**Action**: {action_type}"
                rendered_parts.append(rendered_action)
                # content set after environment step
                action_results.append({"content": None})
    except Exception as e:
        logger.error(f"Error calling Grounding LLM: {e}", exc_info=True)
        rendered_parts.append(f"**Error**: Error contacting Grounding LLM: {e}")

    return AgentActResult(
        actions=actions,
        action_results=action_results,
        rendered_response=self._render_response(rendered_parts, current_state.screenshot),
    )
223
+
224
def add_action_results(self, env_steps: list[EnvStepResult], agent_action: AgentActResult) -> None:
    """
    Adds the results of executed actions back into the message history,
    formatted for the next OpenAI vision LLM call.

    Each environment step fills in the "content" of the action result at the same
    position. Screenshot steps and the final step carry the step's screenshot; other
    steps carry text only. All filled-in contents are then appended to the history
    as a single "environment" message.

    Does nothing when the agent action has no action results. Environment steps and
    action results are paired positionally; unpaired extras on either side are skipped
    with a log entry rather than raising.
    """
    if not agent_action.action_results:
        return

    if len(env_steps) != len(agent_action.action_results):
        logger.warning(
            f"Got {len(env_steps)} environment steps for {len(agent_action.action_results)} action results. "
            "Pairing them by position."
        )

    last_idx = len(env_steps) - 1
    # zip stops at the shorter sequence, so a length mismatch cannot raise IndexError
    for idx, (env_step, action_result) in enumerate(zip(env_steps, agent_action.action_results)):
        result_content = env_step.error or env_step.output or "[Action completed]"
        if env_step.type == "image":
            message = "**Action Result**: Took screenshot"
            images = [f"data:image/png;base64,{env_step.screenshot_base64}"]
        else:
            message = f"**Action Result**: {json.dumps(result_content)}"
            # Only the final step carries a screenshot of the resulting state
            images = [f"data:image/png;base64,{env_step.screenshot_base64}"] if idx == last_idx else []

        if not images:
            action_result["content"] = [{"type": "text", "text": message}]
        else:
            action_result["content"] = construct_structured_message(
                message=message,
                images=images,
                model_type=self.reasoning_model.model_type,
                vision_enabled=True,
            )

    # Append action results to history
    action_results_content = []
    for action_result in agent_action.action_results:
        content = action_result.get("content")
        if not content:
            # Skip instead of extending with None, which would raise TypeError
            logger.error(f"Action result content is empty or None: {action_result}")
            continue
        action_results_content.extend(content)
    self.messages.append(AgentMessage(role="environment", content=action_results_content))
261
+
262
async def summarize(self, summarize_prompt: str, env_state: EnvState) -> str:
    """
    Ask the reasoning model to summarize the operation so far.

    If the model call fails or returns nothing, the summary falls back to the text of
    every message in the history joined by newlines. The prompt and the summary are
    then appended to the history as a user / assistant pair and the summary is returned.
    """
    conversation_history = {"chat": self._format_message_for_api(self.messages)}
    try:
        summary = await send_message_to_model_wrapper(
            query=summarize_prompt,
            conversation_log=conversation_history,
            agent_chat_model=self.reasoning_model,
            tracer=self.tracer,
        )
        if not summary:
            raise ValueError("Summary is empty.")
    except Exception as e:
        logger.error(f"Error calling Reasoning LLM for summary: {e}")
        message_texts = [self._get_message_text(msg) for msg in self.messages]
        summary = "\n".join(message_texts)

    # Append summary messages to history
    summary_turns = [
        AgentMessage(role="user", content=summarize_prompt),
        AgentMessage(role="assistant", content=summary),
    ]
    self.messages.extend(summary_turns)

    return summary
284
+
285
+ def compile_response(self, response_content: str | List) -> str:
286
+ """Compile response content into a string, handling OpenAI message structures."""
287
+ if isinstance(response_content, str):
288
+ return response_content
289
+
290
+ if isinstance(response_content, list): # Tool results list
291
+ compiled = ["**Tool Results**:"]
292
+ for item in response_content:
293
+ if isinstance(item, dict) and item.get("role") == "tool":
294
+ compiled.append(f" - ID {item.get('tool_call_id')}: {item.get('content')}")
295
+ else:
296
+ compiled.append(f" - {str(item)}") # Fallback
297
+ return "\n".join(compiled)
298
+
299
+ # Fallback for unexpected types
300
+ return str(response_content)
301
+
302
+ def _render_response(self, response: List, screenshot: str | None) -> dict:
303
+ """Render response for display"""
304
+ render_payload = {
305
+ "text": "\n- ".join(response),
306
+ "image": f"data:image/webp;base64,{screenshot}" if screenshot else None,
307
+ }
308
+ return render_payload
309
+
310
+ def _get_message_text(self, message: AgentMessage) -> str:
311
+ if isinstance(message.content, list):
312
+ return "\n".join([item["text"] for item in message.content if item["type"] == "text"])
313
+ return message.content
314
+
315
+ def _get_message_images(self, message: AgentMessage) -> List[str]:
316
+ images = []
317
+ if isinstance(message.content, list):
318
+ images = [item["image_url"]["url"] for item in message.content if item["type"] == "image_url"]
319
+ return images
320
+
321
+ def _format_message_for_api(self, messages: list[AgentMessage]) -> List[dict]:
322
+ """Format operator agent messages into the Khoj conversation history format."""
323
+ formatted_messages = [
324
+ {
325
+ "message": self._get_message_text(message),
326
+ "images": self._get_message_images(message),
327
+ "by": "you" if message.role in ["user", "environment"] else message.role,
328
+ }
329
+ for message in messages
330
+ ]
331
+ return formatted_messages
332
+
333
def reset(self):
    """Reset the agent state."""
    # Let the base class reset its own state first
    super().reset()
    self.grounding_agent.reset()  # Reset grounding agent state