khoj 1.41.1.dev107__py3-none-any.whl → 1.41.1.dev142__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. khoj/database/adapters/__init__.py +20 -0
  2. khoj/database/models/__init__.py +1 -1
  3. khoj/interface/compiled/404/index.html +2 -2
  4. khoj/interface/compiled/_next/static/chunks/{2327-aa22697ed9c8d54a.js → 2327-f03b2a77f67b8f8c.js} +1 -1
  5. khoj/interface/compiled/_next/static/chunks/{8515-f305779d95dd5780.js → 5138-81457f7f59956b56.js} +9 -9
  6. khoj/interface/compiled/_next/static/chunks/7127-d3199617463d45f0.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
  8. khoj/interface/compiled/_next/static/chunks/app/agents/{page-c9ceb9b94e24b94a.js → page-774c78ff0f55a228.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/app/automations/{page-3dc59a0df3827dc7.js → page-4454891c5007b870.js} +1 -1
  10. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
  11. khoj/interface/compiled/_next/static/chunks/app/chat/{page-2b27c7118d8d5a16.js → page-5a2559825b4d5def.js} +1 -1
  12. khoj/interface/compiled/_next/static/chunks/app/{page-38f1f125d7aeb4c7.js → page-f7a0286dfc31ad6b.js} +1 -1
  13. khoj/interface/compiled/_next/static/chunks/app/search/{page-26d4492fb1200e0e.js → page-f1a7f278c89e09b6.js} +1 -1
  14. khoj/interface/compiled/_next/static/chunks/app/settings/{page-bf1a4e488b29fceb.js → page-5d9134d4a97f8834.js} +1 -1
  15. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-abb6c5f4239ad7be.js +1 -0
  16. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-a1f10c96366c3a4f.js → page-32cd0ceb9ffbd777.js} +1 -1
  17. khoj/interface/compiled/_next/static/chunks/{webpack-c6bde5961098facd.js → webpack-952bc0d41769db77.js} +1 -1
  18. khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +1 -0
  19. khoj/interface/compiled/_next/static/css/93eeacc43e261162.css +1 -0
  20. khoj/interface/compiled/agents/index.html +2 -2
  21. khoj/interface/compiled/agents/index.txt +2 -2
  22. khoj/interface/compiled/automations/index.html +2 -2
  23. khoj/interface/compiled/automations/index.txt +3 -3
  24. khoj/interface/compiled/chat/index.html +2 -2
  25. khoj/interface/compiled/chat/index.txt +2 -2
  26. khoj/interface/compiled/index.html +2 -2
  27. khoj/interface/compiled/index.txt +2 -2
  28. khoj/interface/compiled/search/index.html +2 -2
  29. khoj/interface/compiled/search/index.txt +2 -2
  30. khoj/interface/compiled/settings/index.html +2 -2
  31. khoj/interface/compiled/settings/index.txt +4 -4
  32. khoj/interface/compiled/share/chat/index.html +2 -2
  33. khoj/interface/compiled/share/chat/index.txt +2 -2
  34. khoj/processor/conversation/anthropic/anthropic_chat.py +8 -9
  35. khoj/processor/conversation/anthropic/utils.py +30 -7
  36. khoj/processor/conversation/google/gemini_chat.py +10 -10
  37. khoj/processor/conversation/google/utils.py +20 -12
  38. khoj/processor/conversation/offline/chat_model.py +2 -7
  39. khoj/processor/conversation/openai/gpt.py +8 -9
  40. khoj/processor/conversation/utils.py +132 -21
  41. khoj/processor/operator/README.md +59 -0
  42. khoj/processor/operator/{operate_browser.py → __init__.py} +98 -34
  43. khoj/processor/operator/grounding_agent.py +229 -175
  44. khoj/processor/operator/grounding_agent_uitars.py +59 -48
  45. khoj/processor/operator/operator_actions.py +48 -0
  46. khoj/processor/operator/operator_agent_anthropic.py +298 -90
  47. khoj/processor/operator/operator_agent_base.py +45 -14
  48. khoj/processor/operator/operator_agent_binary.py +125 -57
  49. khoj/processor/operator/operator_agent_openai.py +183 -75
  50. khoj/processor/operator/operator_environment_base.py +11 -1
  51. khoj/processor/operator/operator_environment_browser.py +5 -3
  52. khoj/processor/operator/operator_environment_computer.py +658 -0
  53. khoj/routers/api_chat.py +36 -25
  54. khoj/routers/helpers.py +8 -17
  55. khoj/routers/research.py +43 -20
  56. khoj/utils/constants.py +4 -4
  57. khoj/utils/helpers.py +12 -15
  58. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/METADATA +3 -1
  59. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/RECORD +70 -68
  60. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +0 -1
  61. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e49165209d2e406c.js +0 -1
  62. khoj/interface/compiled/_next/static/chunks/app/chat/layout-d5ae861e1ade9d08.js +0 -1
  63. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-64a53f8ec4afa6b3.js +0 -1
  64. khoj/interface/compiled/_next/static/css/bb7ea98028b368f3.css +0 -1
  65. khoj/interface/compiled/_next/static/css/ee66643a6a5bf71c.css +0 -1
  66. /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → 4CIEX6Ko-Qehhb7L-ymZw}/_buildManifest.js +0 -0
  67. /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → 4CIEX6Ko-Qehhb7L-ymZw}/_ssgManifest.js +0 -0
  68. /khoj/interface/compiled/_next/static/chunks/{1915-ab4353eaca76f690.js → 1915-1943ee8a628b893c.js} +0 -0
  69. /khoj/interface/compiled/_next/static/chunks/{2117-1c18aa2098982bf9.js → 2117-5a41630a2bd2eae8.js} +0 -0
  70. /khoj/interface/compiled/_next/static/chunks/{4363-4efaf12abe696251.js → 4363-e6ac2203564d1a3b.js} +0 -0
  71. /khoj/interface/compiled/_next/static/chunks/{4447-5d44807c40355b1a.js → 4447-e038b251d626c340.js} +0 -0
  72. /khoj/interface/compiled/_next/static/chunks/{8667-adbe6017a66cef10.js → 8667-8136f74e9a086fca.js} +0 -0
  73. /khoj/interface/compiled/_next/static/chunks/{9259-d8bcd9da9e80c81e.js → 9259-640fdd77408475df.js} +0 -0
  74. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/WHEEL +0 -0
  75. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/entry_points.txt +0 -0
  76. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/licenses/LICENSE +0 -0
@@ -1,21 +1,24 @@
1
1
  import json
2
2
  import logging
3
3
  from datetime import datetime
4
+ from textwrap import dedent
4
5
  from typing import List, Optional
5
6
 
6
- from openai.types.chat import ChatCompletion
7
-
8
7
  from khoj.database.models import ChatModel
9
- from khoj.processor.conversation.utils import construct_structured_message
8
+ from khoj.processor.conversation.utils import (
9
+ AgentMessage,
10
+ OperatorRun,
11
+ construct_structured_message,
12
+ )
10
13
  from khoj.processor.operator.grounding_agent import GroundingAgent
11
14
  from khoj.processor.operator.grounding_agent_uitars import GroundingAgentUitars
12
15
  from khoj.processor.operator.operator_actions import *
13
- from khoj.processor.operator.operator_agent_base import (
14
- AgentActResult,
15
- AgentMessage,
16
- OperatorAgent,
16
+ from khoj.processor.operator.operator_agent_base import AgentActResult, OperatorAgent
17
+ from khoj.processor.operator.operator_environment_base import (
18
+ EnvironmentType,
19
+ EnvState,
20
+ EnvStepResult,
17
21
  )
18
- from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
19
22
  from khoj.routers.helpers import send_message_to_model_wrapper
20
23
  from khoj.utils.helpers import get_openai_async_client, is_none_or_empty
21
24
 
@@ -27,7 +30,7 @@ class BinaryOperatorAgent(OperatorAgent):
27
30
  """
28
31
  An OperatorAgent that uses two LLMs:
29
32
  1. Reasoning LLM: Determines the next high-level action based on the objective and current visual reasoning trajectory.
30
- 2. Grounding LLM: Converts the high-level action into specific, executable browser actions.
33
+ 2. Grounding LLM: Converts the high-level action into specific actions executable on the environment.
31
34
  """
32
35
 
33
36
  def __init__(
@@ -35,10 +38,23 @@ class BinaryOperatorAgent(OperatorAgent):
35
38
  query: str,
36
39
  reasoning_model: ChatModel,
37
40
  grounding_model: ChatModel,
41
+ environment_type: EnvironmentType,
38
42
  max_iterations: int,
39
- tracer: dict,
43
+ max_context: int,
44
+ chat_history: List[AgentMessage] = [],
45
+ previous_trajectory: Optional[OperatorRun] = None,
46
+ tracer: dict = {},
40
47
  ):
41
- super().__init__(query, reasoning_model, max_iterations, tracer) # Use reasoning model for primary tracking
48
+ super().__init__(
49
+ query,
50
+ reasoning_model,
51
+ environment_type,
52
+ max_iterations,
53
+ max_context,
54
+ chat_history,
55
+ previous_trajectory,
56
+ tracer,
57
+ ) # Use reasoning model for primary tracking
42
58
  self.reasoning_model = reasoning_model
43
59
  self.grounding_model = grounding_model
44
60
  # Initialize openai api compatible client for grounding model
@@ -49,10 +65,12 @@ class BinaryOperatorAgent(OperatorAgent):
49
65
  self.grounding_agent: GroundingAgent | GroundingAgentUitars = None
50
66
  if "ui-tars-1.5" in grounding_model.name:
51
67
  self.grounding_agent = GroundingAgentUitars(
52
- grounding_model.name, grounding_client, max_iterations, environment_type="web", tracer=tracer
68
+ grounding_model.name, self.environment_type, grounding_client, max_iterations, tracer=tracer
53
69
  )
54
70
  else:
55
- self.grounding_agent = GroundingAgent(grounding_model.name, grounding_client, max_iterations, tracer=tracer)
71
+ self.grounding_agent = GroundingAgent(
72
+ grounding_model.name, self.environment_type, grounding_client, max_iterations, tracer=tracer
73
+ )
56
74
 
57
75
  async def act(self, current_state: EnvState) -> AgentActResult:
58
76
  """
@@ -84,48 +102,7 @@ class BinaryOperatorAgent(OperatorAgent):
84
102
  """
85
103
  Uses the reasoning LLM to determine the next high-level action based on the operation trajectory.
86
104
  """
87
- reasoning_system_prompt = f"""
88
- # Introduction
89
- * You are Khoj, a smart and resourceful web browsing assistant. You help the user accomplish their task using a web browser.
90
- * You are given the user's query and screenshots of the browser's state transitions.
91
- * The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
92
- * The current URL is {current_state.url}.
93
-
94
- # Your Task
95
- * First look at the screenshots carefully to notice all pertinent information.
96
- * Then instruct a tool AI to perform the next action that will help you progress towards the user's goal.
97
- * Make sure you scroll down to see everything before deciding something isn't available.
98
- * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
99
- * Use your creativity to find alternate ways to make progress if you get stuck at any point.
100
-
101
- # Tool AI Capabilities
102
- * The tool AI only has access to the current screenshot and your instructions. It uses your instructions to perform the next action on the page.
103
- * It can interact with the web browser with these actions: click, right click, double click, type, scroll, drag, wait, goto url and go back to previous page.
104
- * It cannot access the OS, filesystem or application window. It just controls a single Chromium browser tab via Playwright.
105
-
106
- # IMPORTANT
107
- * You are allowed upto {self.max_iterations} iterations to complete the task.
108
- * To navigate to a specific URL, put "GOTO <URL>" (without quotes) on the last line of your response.
109
- * To navigate back to the previous page, end your response with "BACK" (without quotes).
110
- * Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).
111
-
112
- # Examples
113
- ## Example 1
114
- GOTO https://example.com
115
- ## Example 2
116
- click the blue login button located at the top right corner
117
- ## Example 3
118
- scroll down the page
119
- ## Example 4
120
- type the username example@email.com into the input field labeled Username
121
- ## Example 5
122
- DONE
123
-
124
- # Instructions
125
- Now describe a single high-level action to take next to progress towards the user's goal in detail.
126
- Focus on the visual action and provide all necessary context.
127
- """.strip()
128
-
105
+ reasoning_system_prompt = self.get_instruction(self.environment_type, current_state)
129
106
  if is_none_or_empty(self.messages):
130
107
  query_text = f"**Main Objective**: {self.query}"
131
108
  query_screenshot = [f"data:image/webp;base64,{current_state.screenshot}"]
@@ -259,7 +236,8 @@ Focus on the visual action and provide all necessary context.
259
236
  action_results_content.extend(action_result["content"])
260
237
  self.messages.append(AgentMessage(role="environment", content=action_results_content))
261
238
 
262
- async def summarize(self, summarize_prompt: str, env_state: EnvState) -> str:
239
+ async def summarize(self, env_state: EnvState, summarize_prompt: str = None) -> str:
240
+ summarize_prompt = summarize_prompt or self.summarize_prompt
263
241
  conversation_history = {"chat": self._format_message_for_api(self.messages)}
264
242
  try:
265
243
  summary = await send_message_to_model_wrapper(
@@ -282,7 +260,7 @@ Focus on the visual action and provide all necessary context.
282
260
 
283
261
  return summary
284
262
 
285
- def compile_response(self, response_content: str | List) -> str:
263
+ def _compile_response(self, response_content: str | List) -> str:
286
264
  """Compile response content into a string, handling OpenAI message structures."""
287
265
  if isinstance(response_content, str):
288
266
  return response_content
@@ -330,6 +308,96 @@ Focus on the visual action and provide all necessary context.
330
308
  ]
331
309
  return formatted_messages
332
310
 
311
def get_instruction(self, environment_type: EnvironmentType, env_state: EnvState) -> str:
    """Build the system instruction for the reasoning agent.

    Args:
        environment_type: Kind of environment being operated (browser or computer).
        env_state: Current environment state. Its ``url`` is included in the browser prompt.

    Returns:
        The dedented, stripped system prompt for the given environment type.

    Raises:
        ValueError: If ``environment_type`` is neither browser nor computer.
    """
    # Build the date without the "%-d" directive. It is a platform-specific
    # strftime extension that is rejected on Windows.
    today = datetime.today()
    current_date = f"{today.strftime('%A, %B')} {today.day}, {today.year}"

    if environment_type == EnvironmentType.BROWSER:
        return dedent(
            f"""
            # Introduction
            * You are Khoj, a smart and resourceful web browsing assistant. You help the user accomplish their task using a web browser.
            * You are given the user's query and screenshots of the browser's state transitions.
            * The current date is {current_date}.
            * The current URL is {env_state.url}.

            # Your Task
            * First look at the screenshots carefully to notice all pertinent information.
            * Then instruct a tool AI to perform the next action that will help you progress towards the user's goal.
            * Make sure you scroll down to see everything before deciding something isn't available.
            * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
            * Use your creativity to find alternate ways to make progress if you get stuck at any point.

            # Tool AI Capabilities
            * The tool AI only has access to the current screenshot and your instructions. It uses your instructions to perform the next action on the page.
            * It can interact with the web browser with these actions: click, right click, double click, type, scroll, drag, wait, goto url and go back to previous page.
            * It cannot access the OS, filesystem or application window. It just controls a single Chromium browser tab via Playwright.

            # IMPORTANT
            * You are allowed upto {self.max_iterations} iterations to complete the task.
            * To navigate to a specific URL, put "GOTO <URL>" (without quotes) on the last line of your response.
            * To navigate back to the previous page, end your response with "BACK" (without quotes).
            * Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).

            # Examples
            ## Example 1
            GOTO https://example.com
            ## Example 2
            click the blue login button located at the top right corner
            ## Example 3
            scroll down the page
            ## Example 4
            type the username example@email.com into the input field labeled Username
            ## Example 5
            DONE

            # Instructions
            Now describe a single high-level action to take next to progress towards the user's goal in detail.
            Focus on the visual action and provide all necessary context.
            """
        ).strip()

    elif environment_type == EnvironmentType.COMPUTER:
        # The computer prompt lists no navigation actions (goto/back), so its
        # capability list ends at "wait".
        return dedent(
            f"""
            # Introduction
            * You are Khoj, a smart and resourceful computer assistant. You help the user accomplish their task using a computer.
            * You are given the user's query and screenshots of the computer's state transitions.
            * The current date is {current_date}.

            # Your Task
            * First look at the screenshots carefully to notice all pertinent information.
            * Then instruct a tool AI to perform the next action that will help you progress towards the user's goal.
            * Make sure you scroll down to see everything before deciding something isn't available.
            * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
            * Use your creativity to find alternate ways to make progress if you get stuck at any point.

            # Tool AI Capabilities
            * The tool AI only has access to the current screenshot and your instructions. It uses your instructions to perform the next action on the page.
            * It can interact with the computer with these actions: click, right click, double click, type, scroll, drag and wait.

            # IMPORTANT
            * You are allowed upto {self.max_iterations} iterations to complete the task.
            * Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).

            # Examples
            ## Example 1
            type https://example.com into the address bar and press Enter
            ## Example 2
            click the blue login button located at the top right corner
            ## Example 3
            scroll down the page
            ## Example 4
            type the username example@email.com into the input field labeled Username
            ## Example 5
            DONE

            # Instructions
            Now describe a single high-level action to take next to progress towards the user's goal in detail.
            Focus on the visual action and provide all necessary context.
            """
        ).strip()
    else:
        raise ValueError(f"Expected environment type: Computer or Browser. Got {environment_type}.")
400
+
333
401
  def reset(self):
334
402
  """Reset the agent state."""
335
403
  super().reset()
@@ -1,18 +1,22 @@
1
1
  import json
2
2
  import logging
3
+ import platform
3
4
  from copy import deepcopy
4
5
  from datetime import datetime
6
+ from textwrap import dedent
5
7
  from typing import List, Optional, cast
6
8
 
7
9
  from openai.types.responses import Response, ResponseOutputItem
8
10
 
11
+ from khoj.database.models import ChatModel
12
+ from khoj.processor.conversation.utils import AgentMessage
9
13
  from khoj.processor.operator.operator_actions import *
10
- from khoj.processor.operator.operator_agent_base import (
11
- AgentActResult,
12
- AgentMessage,
13
- OperatorAgent,
14
+ from khoj.processor.operator.operator_agent_base import AgentActResult, OperatorAgent
15
+ from khoj.processor.operator.operator_environment_base import (
16
+ EnvironmentType,
17
+ EnvState,
18
+ EnvStepResult,
14
19
  )
15
- from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
16
20
  from khoj.utils.helpers import get_openai_async_client, is_none_or_empty
17
21
 
18
22
  logger = logging.getLogger(__name__)
@@ -21,80 +25,18 @@ logger = logging.getLogger(__name__)
21
25
  # --- Anthropic Operator Agent ---
22
26
  class OpenAIOperatorAgent(OperatorAgent):
23
27
  async def act(self, current_state: EnvState) -> AgentActResult:
24
- client = get_openai_async_client(
25
- self.vision_model.ai_model_api.api_key, self.vision_model.ai_model_api.api_base_url
26
- )
27
28
  safety_check_prefix = "Say 'continue' after resolving the following safety checks to proceed:"
28
29
  safety_check_message = None
29
30
  actions: List[OperatorAction] = []
30
31
  action_results: List[dict] = []
31
32
  self._commit_trace() # Commit trace before next action
32
- system_prompt = f"""<SYSTEM_CAPABILITY>
33
- * You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
34
- * You operate a single Chromium browser page using Playwright.
35
- * You cannot access the OS or filesystem.
36
- * You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more using the computer_use_preview tool.
37
- * You can use the additional back() and goto() functions to navigate the browser.
38
- * Always use the goto() function to navigate to a specific URL. If you see nothing, try goto duckduckgo.com
39
- * When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
40
- * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
41
- * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
42
- * The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
43
- * The current URL is {current_state.url}.
44
- </SYSTEM_CAPABILITY>
45
-
46
- <IMPORTANT>
47
- * You are allowed upto {self.max_iterations} iterations to complete the task.
48
- * After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
49
- </IMPORTANT>
50
- """
51
- tools = [
52
- {
53
- "type": "computer_use_preview",
54
- "display_width": 1024, # TODO: Get from env
55
- "display_height": 768, # TODO: Get from env
56
- "environment": "browser",
57
- },
58
- {
59
- "type": "function",
60
- "name": "back",
61
- "description": "Go back to the previous page.",
62
- "parameters": {},
63
- },
64
- {
65
- "type": "function",
66
- "name": "goto",
67
- "description": "Go to a specific URL.",
68
- "parameters": {
69
- "type": "object",
70
- "properties": {
71
- "url": {
72
- "type": "string",
73
- "description": "Fully qualified URL to navigate to.",
74
- },
75
- },
76
- "additionalProperties": False,
77
- "required": ["url"],
78
- },
79
- },
80
- ]
81
-
33
+ system_prompt = self.get_instructions(self.environment_type, current_state)
34
+ tools = self.get_tools(self.environment_type, current_state)
82
35
  if is_none_or_empty(self.messages):
83
36
  self.messages = [AgentMessage(role="user", content=self.query)]
84
37
 
85
- messages_for_api = self._format_message_for_api(self.messages)
86
- response: Response = await client.responses.create(
87
- model="computer-use-preview",
88
- input=messages_for_api,
89
- instructions=system_prompt,
90
- tools=tools,
91
- parallel_tool_calls=False, # Keep sequential for now
92
- max_output_tokens=4096, # TODO: Make configurable?
93
- truncation="auto",
94
- )
95
-
96
- logger.debug(f"Openai response: {response.model_dump_json()}")
97
- self.messages += [AgentMessage(role="environment", content=response.output)]
38
+ response = await self._call_model(self.vision_model, system_prompt, tools)
39
+ self.messages += [AgentMessage(role="assistant", content=response.output)]
98
40
  rendered_response = self._render_response(response.output, current_state.screenshot)
99
41
 
100
42
  last_call_id = None
@@ -174,6 +116,9 @@ class OpenAIOperatorAgent(OperatorAgent):
174
116
  "summary": [],
175
117
  }
176
118
  )
119
+ else:
120
+ logger.warning(f"Unsupported response block type: {block.type}")
121
+ content = f"Unsupported response block type: {block.type}"
177
122
  if action_to_run or content:
178
123
  actions.append(action_to_run)
179
124
  if action_to_run or content:
@@ -220,6 +165,10 @@ class OpenAIOperatorAgent(OperatorAgent):
220
165
  elif action_result["type"] == "reasoning":
221
166
  items_to_pop.append(idx) # Mark placeholder reasoning action result for removal
222
167
  continue
168
+ elif action_result["type"] == "computer_call" and action_result["status"] == "in_progress":
169
+ if isinstance(result_content, dict):
170
+ result_content["status"] = "completed" # Mark in-progress actions as completed
171
+ action_result["output"] = result_content
223
172
  else:
224
173
  # Add text data
225
174
  action_result["output"] = result_content
@@ -229,11 +178,45 @@ class OpenAIOperatorAgent(OperatorAgent):
229
178
 
230
179
  self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]
231
180
 
181
async def summarize(self, current_state: EnvState, summarize_prompt: Optional[str] = None) -> str:
    """Ask the model to summarize the operator run so far.

    Args:
        current_state: Current environment state. Not read by this method.
        summarize_prompt: Prompt to summarize with. Falls back to
            ``self.summarize_prompt`` when not provided.

    Returns:
        Text compiled from the model's response, or a fixed notice when
        there is no message history to summarize.
    """
    # Guard before touching the history. Once the prompt is appended the
    # history can never be empty, so a later check would be unreachable.
    if not self.messages:
        return "No actions to summarize."

    summarize_prompt = summarize_prompt or self.summarize_prompt
    self.messages.append(AgentMessage(role="user", content=summarize_prompt))
    # No tools are passed for the summary call
    response = await self._call_model(self.vision_model, summarize_prompt, [])
    self.messages += [AgentMessage(role="assistant", content=response.output)]
    return self._compile_response(self.messages[-1].content)
189
+
190
async def _call_model(self, model: ChatModel, system_prompt, tools) -> Response:
    """Send the agent's message history to the OpenAI Responses API.

    Args:
        model: Chat model whose API key and base URL configure the client.
        system_prompt: Passed to the API as ``instructions``.
        tools: Tool definitions for the call. When non-empty the request is
            sent to the computer use model instead of ``model.name``.

    Returns:
        The response returned by the Responses API.
    """
    computer_use_model = "computer-use-preview"
    client = get_openai_async_client(model.ai_model_api.api_key, model.ai_model_api.api_base_url)

    # Calls with tools go to the computer use model, the rest to the configured model
    model_name = computer_use_model if tools else model.name

    # Format messages for OpenAI API
    api_messages = self._format_message_for_api(self.messages)
    if model_name != computer_use_model:
        # Reformat the history for models that are not the computer use model
        api_messages = self._format_messages_for_summary(api_messages)

    request_args = dict(
        model=model_name,
        input=api_messages,
        instructions=system_prompt,
        tools=tools,
        parallel_tool_calls=False,
        truncation="auto",
    )
    response: Response = await client.responses.create(**request_args)

    logger.debug(f"Openai response: {response.model_dump_json()}")
    return response
214
+
232
215
  def _format_message_for_api(self, messages: list[AgentMessage]) -> list:
233
216
  """Format the message for OpenAI API."""
234
217
  formatted_messages: list = []
235
218
  for message in messages:
236
- if message.role == "environment":
219
+ if message.role == "assistant":
237
220
  if isinstance(message.content, list):
238
221
  # Remove reasoning message if not followed by computer call
239
222
  if (
@@ -252,18 +235,23 @@ class OpenAIOperatorAgent(OperatorAgent):
252
235
  message.content.pop(0)
253
236
  formatted_messages.extend(message.content)
254
237
  else:
255
- logger.warning(f"Expected message content list from environment, got {type(message.content)}")
238
+ logger.warning(f"Expected message content list from assistant, got {type(message.content)}")
239
+ elif message.role == "environment":
240
+ formatted_messages.extend(message.content)
256
241
  else:
242
+ if isinstance(message.content, list):
243
+ message.content = "\n".join([part["text"] for part in message.content if part["type"] == "text"])
257
244
  formatted_messages.append(
258
245
  {
259
246
  "role": message.role,
260
247
  "content": message.content,
261
248
  }
262
249
  )
250
+
263
251
  return formatted_messages
264
252
 
265
- def compile_response(self, response_content: str | list[dict | ResponseOutputItem]) -> str:
266
- """Compile the response from model into a single string."""
253
+ def _compile_response(self, response_content: str | list[dict | ResponseOutputItem]) -> str:
254
+ """Compile the response from model into a single string for prompt tracing."""
267
255
  # Handle case where response content is a string.
268
256
  # This is the case when response content is a user query
269
257
  if isinstance(response_content, str):
@@ -347,3 +335,123 @@ class OpenAIOperatorAgent(OperatorAgent):
347
335
  }
348
336
 
349
337
  return render_payload
338
+
339
def get_instructions(self, environment_type: EnvironmentType, current_state: EnvState) -> str:
    """Return system instructions for the OpenAI operator.

    Args:
        environment_type: Kind of environment being operated (browser or computer).
        current_state: Current environment state. Its ``url`` is included in the browser prompt.

    Returns:
        The dedented system prompt with leading whitespace stripped.

    Raises:
        ValueError: If ``environment_type`` is neither browser nor computer.
    """
    # Build the date without the "%-d" directive. It is a platform-specific
    # strftime extension that is rejected on Windows.
    today = datetime.today()
    current_date = f"{today.strftime('%A, %B')} {today.day}, {today.year}"

    if environment_type == EnvironmentType.BROWSER:
        return dedent(
            f"""
            <SYSTEM_CAPABILITY>
            * You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
            * You operate a single Chromium browser page using Playwright.
            * You cannot access the OS or filesystem.
            * You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more using the computer_use_preview tool.
            * You can use the additional back() and goto() functions to navigate the browser.
            * Always use the goto() function to navigate to a specific URL. If you see nothing, try goto duckduckgo.com
            * When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
            * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
            * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
            * The current date is {current_date}.
            * The current URL is {current_state.url}.
            </SYSTEM_CAPABILITY>

            <IMPORTANT>
            * You are allowed upto {self.max_iterations} iterations to complete the task.
            * After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
            </IMPORTANT>
            """
        ).lstrip()
    elif environment_type == EnvironmentType.COMPUTER:
        return dedent(
            f"""
            <SYSTEM_CAPABILITY>
            * You are Khoj, a smart computer operating assistant. You help the users accomplish their tasks using a computer.
            * You can interact with the computer to perform tasks like clicking, typing, scrolling, and more using the computer_use_preview tool.
            * When viewing a document or webpage it can be helpful to zoom out or scroll down to ensure you see everything before deciding something isn't available.
            * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
            * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
            * You are allowed upto {self.max_iterations} iterations to complete the task.
            </SYSTEM_CAPABILITY>

            <CONTEXT>
            * The current date is {current_date}.
            </CONTEXT>
            """
        ).lstrip()
    else:
        raise ValueError(f"Unsupported environment type: {environment_type}")
383
+
384
def get_tools(self, environment_type: EnvironmentType, current_state: EnvState) -> list[dict]:
    """Return the tools available for the OpenAI operator.

    Args:
        environment_type: Kind of environment being operated.
        current_state: Current environment state. Its ``width`` and ``height``
            set the display size reported to the computer use tool.

    Returns:
        The computer use tool definition, followed by the ``back`` and
        ``goto`` function tools when operating a browser.
    """
    # TODO: Get OS info from the environment (e.g. via platform.system()).
    # For now, assume Linux as the OS of computer environments.
    environment_os = "linux" if environment_type == EnvironmentType.COMPUTER else "browser"

    computer_tool = {
        "type": "computer_use_preview",
        "display_width": current_state.width,
        "display_height": current_state.height,
        "environment": environment_os,
    }
    if environment_type != EnvironmentType.BROWSER:
        return [computer_tool]

    # Browser environments additionally expose page navigation functions
    back_tool = {
        "type": "function",
        "name": "back",
        "description": "Go back to the previous page.",
        "parameters": {},
    }
    goto_tool = {
        "type": "function",
        "name": "goto",
        "description": "Go to a specific URL.",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "Fully qualified URL to navigate to.",
                },
            },
            "additionalProperties": False,
            "required": ["url"],
        },
    }
    return [computer_tool, back_tool, goto_tool]
428
+
429
+ def _format_messages_for_summary(self, formatted_messages: List[dict]) -> List[dict]:
430
+ """Format messages for summary."""
431
+ # Format messages to interact with non computer use AI models
432
+ items_to_drop = [] # Track indices to drop reasoning messages
433
+ for idx, msg in enumerate(formatted_messages):
434
+ if isinstance(msg, dict) and "content" in msg:
435
+ continue
436
+ elif isinstance(msg, dict) and "output" in msg:
437
+ # Drop current_url from output as not supported for non computer operations
438
+ if "current_url" in msg["output"]:
439
+ del msg["output"]["current_url"]
440
+ formatted_messages[idx] = {"role": "user", "content": [msg["output"]]}
441
+ elif isinstance(msg, str):
442
+ formatted_messages[idx] = {"role": "user", "content": [{"type": "input_text", "text": msg}]}
443
+ else:
444
+ text = self._compile_response([msg])
445
+ if not text:
446
+ items_to_drop.append(idx) # Track index to drop reasoning message
447
+ else:
448
+ formatted_messages[idx] = {
449
+ "role": "assistant",
450
+ "content": [{"type": "output_text", "text": text}],
451
+ }
452
+
453
+ # Remove reasoning messages for non-computer use models
454
+ for idx in reversed(items_to_drop):
455
+ formatted_messages.pop(idx)
456
+
457
+ return formatted_messages
@@ -1,4 +1,5 @@
1
1
  from abc import ABC, abstractmethod
2
+ from enum import Enum
2
3
  from typing import Literal, Optional
3
4
 
4
5
  from pydantic import BaseModel
@@ -6,9 +7,18 @@ from pydantic import BaseModel
6
7
  from khoj.processor.operator.operator_actions import OperatorAction
7
8
 
8
9
 
10
class EnvironmentType(Enum):
    """Kinds of environment an operator agent can operate."""

    # Operate a computer
    COMPUTER = "computer"
    # Operate a web browser
    BROWSER = "browser"
15
+
16
+
9
17
  class EnvState(BaseModel):
10
- url: str
18
+ height: int
19
+ width: int
11
20
  screenshot: Optional[str] = None
21
+ url: Optional[str] = None
12
22
 
13
23
 
14
24
  class EnvStepResult(BaseModel):
@@ -5,7 +5,7 @@ import logging
5
5
  import os
6
6
  from typing import Optional, Set, Union
7
7
 
8
- from khoj.processor.operator.operator_actions import OperatorAction, Point
8
+ from khoj.processor.operator.operator_actions import DragAction, OperatorAction, Point
9
9
  from khoj.processor.operator.operator_environment_base import (
10
10
  Environment,
11
11
  EnvState,
@@ -124,10 +124,10 @@ class BrowserEnvironment(Environment):
124
124
 
125
125
  async def get_state(self) -> EnvState:
126
126
  if not self.page or self.page.is_closed():
127
- return EnvState(url="about:blank", screenshot=None)
127
+ return EnvState(url="about:blank", screenshot=None, height=self.height, width=self.width)
128
128
  url = self.page.url
129
129
  screenshot = await self._get_screenshot()
130
- return EnvState(url=url, screenshot=screenshot)
130
+ return EnvState(url=url, screenshot=screenshot, height=self.height, width=self.width)
131
131
 
132
132
  async def step(self, action: OperatorAction) -> EnvStepResult:
133
133
  if not self.page or self.page.is_closed():
@@ -246,6 +246,8 @@ class BrowserEnvironment(Environment):
246
246
  logger.debug(f"Action: {action.type} to ({x},{y})")
247
247
 
248
248
  case "drag":
249
+ if not isinstance(action, DragAction):
250
+ raise TypeError(f"Invalid action type for drag")
249
251
  path = action.path
250
252
  if not path:
251
253
  error = "Missing path for drag action"