khoj 1.41.1.dev40__py3-none-any.whl → 1.41.1.dev90__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. khoj/database/adapters/__init__.py +1 -1
  2. khoj/interface/compiled/404/index.html +1 -1
  3. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
  4. khoj/interface/compiled/agents/index.html +1 -1
  5. khoj/interface/compiled/agents/index.txt +1 -1
  6. khoj/interface/compiled/automations/index.html +1 -1
  7. khoj/interface/compiled/automations/index.txt +1 -1
  8. khoj/interface/compiled/chat/index.html +2 -2
  9. khoj/interface/compiled/chat/index.txt +2 -2
  10. khoj/interface/compiled/index.html +1 -1
  11. khoj/interface/compiled/index.txt +1 -1
  12. khoj/interface/compiled/search/index.html +1 -1
  13. khoj/interface/compiled/search/index.txt +1 -1
  14. khoj/interface/compiled/settings/index.html +1 -1
  15. khoj/interface/compiled/settings/index.txt +1 -1
  16. khoj/interface/compiled/share/chat/index.html +2 -2
  17. khoj/interface/compiled/share/chat/index.txt +2 -2
  18. khoj/processor/conversation/anthropic/anthropic_chat.py +5 -0
  19. khoj/processor/conversation/google/gemini_chat.py +5 -0
  20. khoj/processor/conversation/google/utils.py +4 -0
  21. khoj/processor/conversation/openai/gpt.py +5 -0
  22. khoj/processor/conversation/prompts.py +12 -1
  23. khoj/processor/conversation/utils.py +13 -1
  24. khoj/processor/operator/grounding_agent.py +345 -0
  25. khoj/processor/operator/grounding_agent_uitars.py +973 -0
  26. khoj/processor/operator/operate_browser.py +152 -0
  27. khoj/processor/operator/operator_actions.py +149 -0
  28. khoj/processor/operator/operator_agent_anthropic.py +383 -0
  29. khoj/processor/operator/operator_agent_base.py +80 -0
  30. khoj/processor/operator/operator_agent_binary.py +336 -0
  31. khoj/processor/operator/operator_agent_openai.py +349 -0
  32. khoj/processor/operator/operator_environment_base.py +37 -0
  33. khoj/processor/operator/operator_environment_browser.py +395 -0
  34. khoj/routers/api_chat.py +42 -3
  35. khoj/routers/helpers.py +14 -3
  36. khoj/routers/research.py +48 -1
  37. khoj/utils/helpers.py +17 -0
  38. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/METADATA +5 -3
  39. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/RECORD +44 -34
  40. khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
  41. /khoj/interface/compiled/_next/static/{ifuY0XkcvaIiCG3xJl8zw → WLmcH2J-wz36GlS6O8HSL}/_buildManifest.js +0 -0
  42. /khoj/interface/compiled/_next/static/{ifuY0XkcvaIiCG3xJl8zw → WLmcH2J-wz36GlS6O8HSL}/_ssgManifest.js +0 -0
  43. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/WHEEL +0 -0
  44. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/entry_points.txt +0 -0
  45. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/licenses/LICENSE +0 -0
khoj/processor/operator/operator_agent_openai.py
@@ -0,0 +1,349 @@
+ import json
+ import logging
+ from copy import deepcopy
+ from datetime import datetime
+ from typing import List, Optional, cast
+
+ from openai.types.responses import Response, ResponseOutputItem
+
+ from khoj.processor.operator.operator_actions import *
+ from khoj.processor.operator.operator_agent_base import (
+     AgentActResult,
+     AgentMessage,
+     OperatorAgent,
+ )
+ from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
+ from khoj.utils.helpers import get_openai_async_client, is_none_or_empty
+
+ logger = logging.getLogger(__name__)
+
+
+ # --- OpenAI Operator Agent ---
+ class OpenAIOperatorAgent(OperatorAgent):
+     async def act(self, current_state: EnvState) -> AgentActResult:
+         client = get_openai_async_client(
+             self.vision_model.ai_model_api.api_key, self.vision_model.ai_model_api.api_base_url
+         )
+         safety_check_prefix = "Say 'continue' after resolving the following safety checks to proceed:"
+         safety_check_message = None
+         actions: List[OperatorAction] = []
+         action_results: List[dict] = []
+         self._commit_trace()  # Commit trace before next action
+         system_prompt = f"""<SYSTEM_CAPABILITY>
+ * You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
+ * You operate a single Chromium browser page using Playwright.
+ * You cannot access the OS or filesystem.
+ * You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more using the computer_use_preview tool.
+ * You can use the additional back() and goto() functions to navigate the browser.
+ * Always use the goto() function to navigate to a specific URL. If you see nothing, try goto duckduckgo.com
+ * When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
+ * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls into one function call request.
+ * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
+ * The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
+ * The current URL is {current_state.url}.
+ </SYSTEM_CAPABILITY>
+
+ <IMPORTANT>
+ * You are allowed up to {self.max_iterations} iterations to complete the task.
+ * After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
+ </IMPORTANT>
+ """
+         tools = [
+             {
+                 "type": "computer_use_preview",
+                 "display_width": 1024,  # TODO: Get from env
+                 "display_height": 768,  # TODO: Get from env
+                 "environment": "browser",
+             },
+             {
+                 "type": "function",
+                 "name": "back",
+                 "description": "Go back to the previous page.",
+                 "parameters": {},
+             },
+             {
+                 "type": "function",
+                 "name": "goto",
+                 "description": "Go to a specific URL.",
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "url": {
+                             "type": "string",
+                             "description": "Fully qualified URL to navigate to.",
+                         },
+                     },
+                     "additionalProperties": False,
+                     "required": ["url"],
+                 },
+             },
+         ]
+
+         if is_none_or_empty(self.messages):
+             self.messages = [AgentMessage(role="user", content=self.query)]
+
+         messages_for_api = self._format_message_for_api(self.messages)
+         response: Response = await client.responses.create(
+             model="computer-use-preview",
+             input=messages_for_api,
+             instructions=system_prompt,
+             tools=tools,
+             parallel_tool_calls=False,  # Keep sequential for now
+             max_output_tokens=4096,  # TODO: Make configurable?
+             truncation="auto",
+         )
+
+         logger.debug(f"OpenAI response: {response.model_dump_json()}")
+         self.messages += [AgentMessage(role="environment", content=response.output)]
+         rendered_response = self._render_response(response.output, current_state.screenshot)
+
+         last_call_id = None
+         content = None
+         for block in response.output:
+             action_to_run: Optional[OperatorAction] = None
+             if block.type == "function_call":
+                 last_call_id = block.call_id
+                 if block.name == "goto":
+                     try:
+                         args = json.loads(block.arguments)
+                         url = args.get("url")
+                         if url:
+                             action_to_run = GotoAction(url=url)
+                         else:
+                             logger.warning("Goto function called without URL argument.")
+                     except json.JSONDecodeError:
+                         logger.warning(f"Failed to parse arguments for goto: {block.arguments}")
+                 elif block.name == "back":
+                     action_to_run = BackAction()
+
+             elif block.type == "computer_call":
+                 last_call_id = block.call_id
+                 if block.pending_safety_checks:
+                     safety_check_body = "\n- ".join([check.message for check in block.pending_safety_checks])
+                     safety_check_message = f"{safety_check_prefix}\n- {safety_check_body}"
+                     action_to_run = RequestUserAction(request=safety_check_message)
+                     actions.append(action_to_run)
+                     break  # Stop processing actions if safety check needed
+
+                 # Convert OpenAI action to standardized BrowserAction
+                 openai_action = block.action
+                 action_type = openai_action.type
+                 try:
+                     if action_type == "click":
+                         action_to_run = ClickAction(x=openai_action.x, y=openai_action.y, button=openai_action.button)
+                     elif action_type == "double_click":
+                         action_to_run = DoubleClickAction(x=openai_action.x, y=openai_action.y)
+                     elif action_type == "scroll":
+                         action_to_run = ScrollAction(
+                             x=openai_action.x,
+                             y=openai_action.y,
+                             scroll_x=openai_action.scroll_x,
+                             scroll_y=openai_action.scroll_y,
+                         )
+                     elif action_type == "keypress":
+                         action_to_run = KeypressAction(keys=openai_action.keys)
+                     elif action_type == "type":
+                         action_to_run = TypeAction(text=openai_action.text)
+                     elif action_type == "wait":
+                         action_to_run = WaitAction(duration=2.0)  # OpenAI doesn't specify duration, default?
+                     elif action_type == "screenshot":
+                         action_to_run = ScreenshotAction()
+                     elif action_type == "move":
+                         action_to_run = MoveAction(x=openai_action.x, y=openai_action.y)
+                     elif action_type == "drag":
+                         action_to_run = DragAction(path=[Point(x=p.x, y=p.y) for p in openai_action.path])
+                     else:
+                         raise ValueError(f"Unsupported OpenAI computer action type: {action_type}")
+                 except ValueError as ve:
+                     logger.error(f"Error converting OpenAI action {action_type}: {ve}")
+                     content = f"ValueError: {action_type}: {ve}"
+                 except Exception as e:
+                     logger.error(f"Error converting OpenAI action {action_type}: {e}")
+                     content = f"Error: {action_type}: {e}"
+             elif block.type == "message":
+                 rendered_response["text"] = response.output_text
+             elif block.type == "reasoning":
+                 actions.append(NoopAction())
+                 # Add placeholder action result for reasoning
+                 # This is to prevent run termination.
+                 # It will be removed later by add_action_results func
+                 action_results.append(
+                     {
+                         "type": block.type,
+                         "id": block.id,
+                         "summary": [],
+                     }
+                 )
+             if action_to_run or content:
+                 actions.append(action_to_run)
+             if action_to_run or content:
+                 # Prepare the action result
+                 action_results.append(
+                     {
+                         "type": f"{block.type}_output",
+                         "output": content,  # Updated after environment step
+                         "call_id": last_call_id,
+                     }
+                 )
+
+         self._update_usage(response.usage.input_tokens, response.usage.output_tokens)
+
+         return AgentActResult(
+             actions=actions,
+             action_results=action_results,
+             rendered_response=rendered_response,
+         )
+
+     def add_action_results(self, env_steps: list[EnvStepResult], agent_action: AgentActResult) -> None:
+         if not agent_action.action_results:
+             return
+
+         # Update action results with results of applying suggested actions on the environment
+         items_to_pop = []
+         for idx, env_step in enumerate(env_steps):
+             action_result = agent_action.action_results[idx]
+             result_content = env_step.error or env_step.output or "[Action completed]"
+             if env_step.type == "image" and isinstance(result_content, dict):
+                 # Add screenshot data in openai message format
+                 action_result["output"] = {
+                     "type": "input_image",
+                     "image_url": f'data:image/webp;base64,{result_content["image"]}',
+                     "current_url": result_content["url"],
+                 }
+             elif action_result["type"] == "computer_call_output" and idx == len(env_steps) - 1:
+                 # Always add screenshot, current url to last action result, when computer tool used
+                 action_result["output"] = {
+                     "type": "input_image",
+                     "image_url": f"data:image/webp;base64,{env_step.screenshot_base64}",
+                     "current_url": env_step.current_url,
+                 }
+             elif action_result["type"] == "reasoning":
+                 items_to_pop.append(idx)  # Mark placeholder reasoning action result for removal
+                 continue
+             else:
+                 # Add text data
+                 action_result["output"] = result_content
+
+         for idx in reversed(items_to_pop):
+             agent_action.action_results.pop(idx)
+
+         self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]
+
+     def _format_message_for_api(self, messages: list[AgentMessage]) -> list:
+         """Format the message for OpenAI API."""
+         formatted_messages: list = []
+         for message in messages:
+             if message.role == "environment":
+                 if isinstance(message.content, list):
+                     # Remove reasoning message if not followed by computer call
+                     if (
+                         len(message.content) > 1
+                         and all(hasattr(item, "type") for item in message.content)
+                         and message.content[0].type == "reasoning"
+                         and message.content[1].type != "computer_call"
+                     ) or (
+                         len(message.content) == 1
+                         and all(hasattr(item, "type") for item in message.content)
+                         and message.content[0].type == "reasoning"
+                     ):
+                         logger.warning(
+                             f"Removing reasoning message not followed by a computer call action: {message.content}"
+                         )
+                         message.content.pop(0)
+                     formatted_messages.extend(message.content)
+                 else:
+                     logger.warning(f"Expected message content list from environment, got {type(message.content)}")
+             else:
+                 formatted_messages.append(
+                     {
+                         "role": message.role,
+                         "content": message.content,
+                     }
+                 )
+         return formatted_messages
+
+     def compile_response(self, response_content: str | list[dict | ResponseOutputItem]) -> str:
+         """Compile the response from model into a single string."""
+         # Handle case where response content is a string.
+         # This is the case when response content is a user query
+         if isinstance(response_content, str):
+             return response_content
+         elif is_none_or_empty(response_content):
+             return ""
+         # Handle case where response_content is a dictionary and not ResponseOutputItem
+         # This is the case when response_content contains action results
+         if not hasattr(response_content[0], "type"):
+             return "**Action**: " + json.dumps(response_content[0].get("output", "Noop"))
+
+         compiled_response = [""]
+         for block in deepcopy(response_content):
+             block = cast(ResponseOutputItem, block)  # Ensure block is of type ResponseOutputItem
+             # Handle different block types
+             if block.type == "message":
+                 # Extract text content if available
+                 for content in block.content:
+                     text_content = ""
+                     if hasattr(content, "text"):
+                         text_content += content.text
+                     elif hasattr(content, "refusal"):
+                         text_content += f"Refusal: {content.refusal}"
+                     else:
+                         text_content += content.model_dump_json()
+                     compiled_response.append(text_content)
+             elif block.type == "function_call":
+                 block_function_input = {"action": block.name}
+                 if block.name == "goto":
+                     try:
+                         args = json.loads(block.arguments)
+                         block_function_input["url"] = args.get("url", "[Missing URL]")
+                     except json.JSONDecodeError:
+                         block_function_input["arguments"] = block.arguments  # Show raw args on error
+                 compiled_response.append(f"**Action**: {json.dumps(block_function_input)}")
+             elif block.type == "computer_call":
+                 block_computer_input = block.action
+                 # If it's a screenshot action
+                 if block_computer_input.type == "screenshot":
+                     # Use a placeholder for screenshot data
+                     block_input_render = block_computer_input.model_dump()
+                     block_input_render["image"] = "[placeholder for screenshot]"
+                     compiled_response.append(f"**Action**: {json.dumps(block_input_render)}")
+                 else:
+                     compiled_response.append(f"**Action**: {block_computer_input.model_dump_json()}")
+             elif block.type == "reasoning" and block.summary:
+                 compiled_response.append(f"**Thought**: {block.summary}")
+         return "\n- ".join(filter(None, compiled_response))  # Filter out empty strings
+
+     def _render_response(self, response_content: list[ResponseOutputItem], screenshot: str | None) -> dict:
+         """Render OpenAI response for display, potentially including screenshots."""
+         render_texts = []
+         for block in deepcopy(response_content):  # Use deepcopy to avoid modifying original
+             if block.type == "message":
+                 text_content = block.text if hasattr(block, "text") else block.model_dump_json()
+                 render_texts += [text_content]
+             elif block.type == "function_call":
+                 if block.name == "goto":
+                     args = json.loads(block.arguments)
+                     render_texts = [f'Open URL: {args.get("url", "[Missing URL]")}']
+                 else:
+                     render_texts += [block.name]
+             elif block.type == "computer_call":
+                 block_input = block.action
+                 if block_input.type == "screenshot" and not screenshot:
+                     render_texts += ["Failed to get screenshot"]
+                 elif block_input.type == "type":
+                     render_texts += [f'Type "{block_input.text}"']
+                 elif block_input.type == "keypress":
+                     render_texts += [f"Press {'+'.join(block_input.keys)}"]
+                 else:
+                     render_texts += [f"{block_input.type.capitalize()}"]
+             elif block.type == "reasoning" and block.summary:
+                 render_texts += [f"**Thought**: {block.summary}"]
+
+         render_payload = {
+             # Combine text into a single string and filter out empty strings
+             "text": "\n- ".join(filter(None, render_texts)),
+             # Add screenshot data if available
+             "image": f"data:image/webp;base64,{screenshot}" if screenshot else None,
+         }
+
+         return render_payload
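
The new agent is one half of a plan/act loop: act() asks the computer-use model for actions, an environment executes them, and add_action_results() feeds the outcomes back into the message history. The actual orchestration lives in the newly added khoj/processor/operator/operate_browser.py; the sketch below is only a hypothetical driver, assuming imports of the classes shown in this diff, an Environment implementation (interface in the next hunk), an already constructed OpenAIOperatorAgent, and an invented run_operator name and max_iterations parameter.

# Hypothetical driver loop (not the shipped operate_browser.py): pairs the agent
# with an Environment implementation and relays step results back to the agent.
async def run_operator(agent: OpenAIOperatorAgent, env: Environment, max_iterations: int = 10) -> None:
    await env.start(width=1024, height=768)  # match the display size advertised in the tool spec
    try:
        for _ in range(max_iterations):
            state = await env.get_state()          # EnvState with current url and screenshot
            result = await agent.act(state)        # model proposes OperatorActions
            if not result.actions:
                break                               # model returned no actions; stop
            env_steps = [await env.step(action) for action in result.actions]
            agent.add_action_results(env_steps, result)  # record outcomes as environment messages
    finally:
        await env.close()
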
khoj/processor/operator/operator_environment_base.py
@@ -0,0 +1,37 @@
+ from abc import ABC, abstractmethod
+ from typing import Literal, Optional
+
+ from pydantic import BaseModel
+
+ from khoj.processor.operator.operator_actions import OperatorAction
+
+
+ class EnvState(BaseModel):
+     url: str
+     screenshot: Optional[str] = None
+
+
+ class EnvStepResult(BaseModel):
+     type: Literal["text", "image"] = "text"
+     output: Optional[str | dict] = None
+     error: Optional[str] = None
+     current_url: Optional[str] = None
+     screenshot_base64: Optional[str] = None
+
+
+ class Environment(ABC):
+     @abstractmethod
+     async def start(self, width: int, height: int) -> None:
+         pass
+
+     @abstractmethod
+     async def step(self, action: OperatorAction) -> EnvStepResult:
+         pass
+
+     @abstractmethod
+     async def close(self) -> None:
+         pass
+
+     @abstractmethod
+     async def get_state(self) -> EnvState:
+         pass
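
The shipped implementation of this interface is the Playwright-based khoj/processor/operator/operator_environment_browser.py listed above. Purely as an illustration of the contract, a minimal hypothetical environment could look like the following; the class name and behavior are invented for the example and assume the EnvState, EnvStepResult, Environment, and OperatorAction definitions above.

# Toy Environment that records actions instead of driving a real browser.
class EchoEnvironment(Environment):
    async def start(self, width: int, height: int) -> None:
        self._url = "about:blank"

    async def step(self, action: OperatorAction) -> EnvStepResult:
        # A real environment would execute the click/type/goto/... action here.
        return EnvStepResult(type="text", output=f"Executed {type(action).__name__}", current_url=self._url)

    async def close(self) -> None:
        pass

    async def get_state(self) -> EnvState:
        return EnvState(url=self._url, screenshot=None)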