khoj 1.41.1.dev107__py3-none-any.whl → 1.41.1.dev142__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/database/adapters/__init__.py +20 -0
- khoj/database/models/__init__.py +1 -1
- khoj/interface/compiled/404/index.html +2 -2
- khoj/interface/compiled/_next/static/chunks/{2327-aa22697ed9c8d54a.js → 2327-f03b2a77f67b8f8c.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{8515-f305779d95dd5780.js → 5138-81457f7f59956b56.js} +9 -9
- khoj/interface/compiled/_next/static/chunks/7127-d3199617463d45f0.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/{page-c9ceb9b94e24b94a.js → page-774c78ff0f55a228.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-3dc59a0df3827dc7.js → page-4454891c5007b870.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-2b27c7118d8d5a16.js → page-5a2559825b4d5def.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-38f1f125d7aeb4c7.js → page-f7a0286dfc31ad6b.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/{page-26d4492fb1200e0e.js → page-f1a7f278c89e09b6.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{page-bf1a4e488b29fceb.js → page-5d9134d4a97f8834.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-abb6c5f4239ad7be.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-a1f10c96366c3a4f.js → page-32cd0ceb9ffbd777.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-c6bde5961098facd.js → webpack-952bc0d41769db77.js} +1 -1
- khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +1 -0
- khoj/interface/compiled/_next/static/css/93eeacc43e261162.css +1 -0
- khoj/interface/compiled/agents/index.html +2 -2
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +2 -2
- khoj/interface/compiled/automations/index.txt +3 -3
- khoj/interface/compiled/chat/index.html +2 -2
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +2 -2
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +2 -2
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +2 -2
- khoj/interface/compiled/settings/index.txt +4 -4
- khoj/interface/compiled/share/chat/index.html +2 -2
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/processor/conversation/anthropic/anthropic_chat.py +8 -9
- khoj/processor/conversation/anthropic/utils.py +30 -7
- khoj/processor/conversation/google/gemini_chat.py +10 -10
- khoj/processor/conversation/google/utils.py +20 -12
- khoj/processor/conversation/offline/chat_model.py +2 -7
- khoj/processor/conversation/openai/gpt.py +8 -9
- khoj/processor/conversation/utils.py +132 -21
- khoj/processor/operator/README.md +59 -0
- khoj/processor/operator/{operate_browser.py → __init__.py} +98 -34
- khoj/processor/operator/grounding_agent.py +229 -175
- khoj/processor/operator/grounding_agent_uitars.py +59 -48
- khoj/processor/operator/operator_actions.py +48 -0
- khoj/processor/operator/operator_agent_anthropic.py +298 -90
- khoj/processor/operator/operator_agent_base.py +45 -14
- khoj/processor/operator/operator_agent_binary.py +125 -57
- khoj/processor/operator/operator_agent_openai.py +183 -75
- khoj/processor/operator/operator_environment_base.py +11 -1
- khoj/processor/operator/operator_environment_browser.py +5 -3
- khoj/processor/operator/operator_environment_computer.py +658 -0
- khoj/routers/api_chat.py +36 -25
- khoj/routers/helpers.py +8 -17
- khoj/routers/research.py +43 -20
- khoj/utils/constants.py +4 -4
- khoj/utils/helpers.py +12 -15
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/METADATA +3 -1
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/RECORD +70 -68
- khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e49165209d2e406c.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-d5ae861e1ade9d08.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-64a53f8ec4afa6b3.js +0 -1
- khoj/interface/compiled/_next/static/css/bb7ea98028b368f3.css +0 -1
- khoj/interface/compiled/_next/static/css/ee66643a6a5bf71c.css +0 -1
- /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → 4CIEX6Ko-Qehhb7L-ymZw}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → 4CIEX6Ko-Qehhb7L-ymZw}/_ssgManifest.js +0 -0
- /khoj/interface/compiled/_next/static/chunks/{1915-ab4353eaca76f690.js → 1915-1943ee8a628b893c.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{2117-1c18aa2098982bf9.js → 2117-5a41630a2bd2eae8.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{4363-4efaf12abe696251.js → 4363-e6ac2203564d1a3b.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{4447-5d44807c40355b1a.js → 4447-e038b251d626c340.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{8667-adbe6017a66cef10.js → 8667-8136f74e9a086fca.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{9259-d8bcd9da9e80c81e.js → 9259-640fdd77408475df.js} +0 -0
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/WHEEL +0 -0
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/entry_points.txt +0 -0
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev142.dist-info}/licenses/LICENSE +0 -0
@@ -1,21 +1,24 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
3
|
from datetime import datetime
|
4
|
+
from textwrap import dedent
|
4
5
|
from typing import List, Optional
|
5
6
|
|
6
|
-
from openai.types.chat import ChatCompletion
|
7
|
-
|
8
7
|
from khoj.database.models import ChatModel
|
9
|
-
from khoj.processor.conversation.utils import
|
8
|
+
from khoj.processor.conversation.utils import (
|
9
|
+
AgentMessage,
|
10
|
+
OperatorRun,
|
11
|
+
construct_structured_message,
|
12
|
+
)
|
10
13
|
from khoj.processor.operator.grounding_agent import GroundingAgent
|
11
14
|
from khoj.processor.operator.grounding_agent_uitars import GroundingAgentUitars
|
12
15
|
from khoj.processor.operator.operator_actions import *
|
13
|
-
from khoj.processor.operator.operator_agent_base import
|
14
|
-
|
15
|
-
|
16
|
-
|
16
|
+
from khoj.processor.operator.operator_agent_base import AgentActResult, OperatorAgent
|
17
|
+
from khoj.processor.operator.operator_environment_base import (
|
18
|
+
EnvironmentType,
|
19
|
+
EnvState,
|
20
|
+
EnvStepResult,
|
17
21
|
)
|
18
|
-
from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
|
19
22
|
from khoj.routers.helpers import send_message_to_model_wrapper
|
20
23
|
from khoj.utils.helpers import get_openai_async_client, is_none_or_empty
|
21
24
|
|
@@ -27,7 +30,7 @@ class BinaryOperatorAgent(OperatorAgent):
|
|
27
30
|
"""
|
28
31
|
An OperatorAgent that uses two LLMs:
|
29
32
|
1. Reasoning LLM: Determines the next high-level action based on the objective and current visual reasoning trajectory.
|
30
|
-
2. Grounding LLM: Converts the high-level action into specific, executable
|
33
|
+
2. Grounding LLM: Converts the high-level action into specific actions executable on the environment.
|
31
34
|
"""
|
32
35
|
|
33
36
|
def __init__(
|
@@ -35,10 +38,23 @@ class BinaryOperatorAgent(OperatorAgent):
|
|
35
38
|
query: str,
|
36
39
|
reasoning_model: ChatModel,
|
37
40
|
grounding_model: ChatModel,
|
41
|
+
environment_type: EnvironmentType,
|
38
42
|
max_iterations: int,
|
39
|
-
|
43
|
+
max_context: int,
|
44
|
+
chat_history: List[AgentMessage] = [],
|
45
|
+
previous_trajectory: Optional[OperatorRun] = None,
|
46
|
+
tracer: dict = {},
|
40
47
|
):
|
41
|
-
super().__init__(
|
48
|
+
super().__init__(
|
49
|
+
query,
|
50
|
+
reasoning_model,
|
51
|
+
environment_type,
|
52
|
+
max_iterations,
|
53
|
+
max_context,
|
54
|
+
chat_history,
|
55
|
+
previous_trajectory,
|
56
|
+
tracer,
|
57
|
+
) # Use reasoning model for primary tracking
|
42
58
|
self.reasoning_model = reasoning_model
|
43
59
|
self.grounding_model = grounding_model
|
44
60
|
# Initialize openai api compatible client for grounding model
|
@@ -49,10 +65,12 @@ class BinaryOperatorAgent(OperatorAgent):
|
|
49
65
|
self.grounding_agent: GroundingAgent | GroundingAgentUitars = None
|
50
66
|
if "ui-tars-1.5" in grounding_model.name:
|
51
67
|
self.grounding_agent = GroundingAgentUitars(
|
52
|
-
grounding_model.name, grounding_client, max_iterations,
|
68
|
+
grounding_model.name, self.environment_type, grounding_client, max_iterations, tracer=tracer
|
53
69
|
)
|
54
70
|
else:
|
55
|
-
self.grounding_agent = GroundingAgent(
|
71
|
+
self.grounding_agent = GroundingAgent(
|
72
|
+
grounding_model.name, self.environment_type, grounding_client, max_iterations, tracer=tracer
|
73
|
+
)
|
56
74
|
|
57
75
|
async def act(self, current_state: EnvState) -> AgentActResult:
|
58
76
|
"""
|
@@ -84,48 +102,7 @@ class BinaryOperatorAgent(OperatorAgent):
|
|
84
102
|
"""
|
85
103
|
Uses the reasoning LLM to determine the next high-level action based on the operation trajectory.
|
86
104
|
"""
|
87
|
-
reasoning_system_prompt =
|
88
|
-
# Introduction
|
89
|
-
* You are Khoj, a smart and resourceful web browsing assistant. You help the user accomplish their task using a web browser.
|
90
|
-
* You are given the user's query and screenshots of the browser's state transitions.
|
91
|
-
* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
|
92
|
-
* The current URL is {current_state.url}.
|
93
|
-
|
94
|
-
# Your Task
|
95
|
-
* First look at the screenshots carefully to notice all pertinent information.
|
96
|
-
* Then instruct a tool AI to perform the next action that will help you progress towards the user's goal.
|
97
|
-
* Make sure you scroll down to see everything before deciding something isn't available.
|
98
|
-
* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
|
99
|
-
* Use your creativity to find alternate ways to make progress if you get stuck at any point.
|
100
|
-
|
101
|
-
# Tool AI Capabilities
|
102
|
-
* The tool AI only has access to the current screenshot and your instructions. It uses your instructions to perform the next action on the page.
|
103
|
-
* It can interact with the web browser with these actions: click, right click, double click, type, scroll, drag, wait, goto url and go back to previous page.
|
104
|
-
* It cannot access the OS, filesystem or application window. It just controls a single Chromium browser tab via Playwright.
|
105
|
-
|
106
|
-
# IMPORTANT
|
107
|
-
* You are allowed upto {self.max_iterations} iterations to complete the task.
|
108
|
-
* To navigate to a specific URL, put "GOTO <URL>" (without quotes) on the last line of your response.
|
109
|
-
* To navigate back to the previous page, end your response with "BACK" (without quotes).
|
110
|
-
* Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).
|
111
|
-
|
112
|
-
# Examples
|
113
|
-
## Example 1
|
114
|
-
GOTO https://example.com
|
115
|
-
## Example 2
|
116
|
-
click the blue login button located at the top right corner
|
117
|
-
## Example 3
|
118
|
-
scroll down the page
|
119
|
-
## Example 4
|
120
|
-
type the username example@email.com into the input field labeled Username
|
121
|
-
## Example 5
|
122
|
-
DONE
|
123
|
-
|
124
|
-
# Instructions
|
125
|
-
Now describe a single high-level action to take next to progress towards the user's goal in detail.
|
126
|
-
Focus on the visual action and provide all necessary context.
|
127
|
-
""".strip()
|
128
|
-
|
105
|
+
reasoning_system_prompt = self.get_instruction(self.environment_type, current_state)
|
129
106
|
if is_none_or_empty(self.messages):
|
130
107
|
query_text = f"**Main Objective**: {self.query}"
|
131
108
|
query_screenshot = [f"data:image/webp;base64,{current_state.screenshot}"]
|
@@ -259,7 +236,8 @@ Focus on the visual action and provide all necessary context.
|
|
259
236
|
action_results_content.extend(action_result["content"])
|
260
237
|
self.messages.append(AgentMessage(role="environment", content=action_results_content))
|
261
238
|
|
262
|
-
async def summarize(self,
|
239
|
+
async def summarize(self, env_state: EnvState, summarize_prompt: str = None) -> str:
|
240
|
+
summarize_prompt = summarize_prompt or self.summarize_prompt
|
263
241
|
conversation_history = {"chat": self._format_message_for_api(self.messages)}
|
264
242
|
try:
|
265
243
|
summary = await send_message_to_model_wrapper(
|
@@ -282,7 +260,7 @@ Focus on the visual action and provide all necessary context.
|
|
282
260
|
|
283
261
|
return summary
|
284
262
|
|
285
|
-
def
|
263
|
+
def _compile_response(self, response_content: str | List) -> str:
|
286
264
|
"""Compile response content into a string, handling OpenAI message structures."""
|
287
265
|
if isinstance(response_content, str):
|
288
266
|
return response_content
|
@@ -330,6 +308,96 @@ Focus on the visual action and provide all necessary context.
|
|
330
308
|
]
|
331
309
|
return formatted_messages
|
332
310
|
|
311
|
+
def get_instruction(self, environment_type: EnvironmentType, env_state: EnvState) -> str:
|
312
|
+
"""Get the system instruction for the reasoning agent."""
|
313
|
+
if environment_type == EnvironmentType.BROWSER:
|
314
|
+
return dedent(
|
315
|
+
f"""
|
316
|
+
# Introduction
|
317
|
+
* You are Khoj, a smart and resourceful web browsing assistant. You help the user accomplish their task using a web browser.
|
318
|
+
* You are given the user's query and screenshots of the browser's state transitions.
|
319
|
+
* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
|
320
|
+
* The current URL is {env_state.url}.
|
321
|
+
|
322
|
+
# Your Task
|
323
|
+
* First look at the screenshots carefully to notice all pertinent information.
|
324
|
+
* Then instruct a tool AI to perform the next action that will help you progress towards the user's goal.
|
325
|
+
* Make sure you scroll down to see everything before deciding something isn't available.
|
326
|
+
* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
|
327
|
+
* Use your creativity to find alternate ways to make progress if you get stuck at any point.
|
328
|
+
|
329
|
+
# Tool AI Capabilities
|
330
|
+
* The tool AI only has access to the current screenshot and your instructions. It uses your instructions to perform the next action on the page.
|
331
|
+
* It can interact with the web browser with these actions: click, right click, double click, type, scroll, drag, wait, goto url and go back to previous page.
|
332
|
+
* It cannot access the OS, filesystem or application window. It just controls a single Chromium browser tab via Playwright.
|
333
|
+
|
334
|
+
# IMPORTANT
|
335
|
+
* You are allowed upto {self.max_iterations} iterations to complete the task.
|
336
|
+
* To navigate to a specific URL, put "GOTO <URL>" (without quotes) on the last line of your response.
|
337
|
+
* To navigate back to the previous page, end your response with "BACK" (without quotes).
|
338
|
+
* Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).
|
339
|
+
|
340
|
+
# Examples
|
341
|
+
## Example 1
|
342
|
+
GOTO https://example.com
|
343
|
+
## Example 2
|
344
|
+
click the blue login button located at the top right corner
|
345
|
+
## Example 3
|
346
|
+
scroll down the page
|
347
|
+
## Example 4
|
348
|
+
type the username example@email.com into the input field labeled Username
|
349
|
+
## Example 5
|
350
|
+
DONE
|
351
|
+
|
352
|
+
# Instructions
|
353
|
+
Now describe a single high-level action to take next to progress towards the user's goal in detail.
|
354
|
+
Focus on the visual action and provide all necessary context.
|
355
|
+
"""
|
356
|
+
).strip()
|
357
|
+
|
358
|
+
elif environment_type == EnvironmentType.COMPUTER:
|
359
|
+
return dedent(
|
360
|
+
f"""
|
361
|
+
# Introduction
|
362
|
+
* You are Khoj, a smart and resourceful computer assistant. You help the user accomplish their task using a computer.
|
363
|
+
* You are given the user's query and screenshots of the computer's state transitions.
|
364
|
+
* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
|
365
|
+
|
366
|
+
# Your Task
|
367
|
+
* First look at the screenshots carefully to notice all pertinent information.
|
368
|
+
* Then instruct a tool AI to perform the next action that will help you progress towards the user's goal.
|
369
|
+
* Make sure you scroll down to see everything before deciding something isn't available.
|
370
|
+
* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
|
371
|
+
* Use your creativity to find alternate ways to make progress if you get stuck at any point.
|
372
|
+
|
373
|
+
# Tool AI Capabilities
|
374
|
+
* The tool AI only has access to the current screenshot and your instructions. It uses your instructions to perform the next action on the page.
|
375
|
+
* It can interact with the computer with these actions: click, right click, double click, type, scroll, drag, wait to previous page.
|
376
|
+
|
377
|
+
# IMPORTANT
|
378
|
+
* You are allowed upto {self.max_iterations} iterations to complete the task.
|
379
|
+
* Once you've verified that the main objective has been achieved, end your response with "DONE" (without quotes).
|
380
|
+
|
381
|
+
# Examples
|
382
|
+
## Example 1
|
383
|
+
type https://example.com into the address bar and press Enter
|
384
|
+
## Example 2
|
385
|
+
click the blue login button located at the top right corner
|
386
|
+
## Example 3
|
387
|
+
scroll down the page
|
388
|
+
## Example 4
|
389
|
+
type the username example@email.com into the input field labeled Username
|
390
|
+
## Example 5
|
391
|
+
DONE
|
392
|
+
|
393
|
+
# Instructions
|
394
|
+
Now describe a single high-level action to take next to progress towards the user's goal in detail.
|
395
|
+
Focus on the visual action and provide all necessary context.
|
396
|
+
"""
|
397
|
+
).strip()
|
398
|
+
else:
|
399
|
+
raise ValueError(f"Expected environment type: Computer or Browser. Got {environment_type}.")
|
400
|
+
|
333
401
|
def reset(self):
|
334
402
|
"""Reset the agent state."""
|
335
403
|
super().reset()
|
@@ -1,18 +1,22 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
|
+
import platform
|
3
4
|
from copy import deepcopy
|
4
5
|
from datetime import datetime
|
6
|
+
from textwrap import dedent
|
5
7
|
from typing import List, Optional, cast
|
6
8
|
|
7
9
|
from openai.types.responses import Response, ResponseOutputItem
|
8
10
|
|
11
|
+
from khoj.database.models import ChatModel
|
12
|
+
from khoj.processor.conversation.utils import AgentMessage
|
9
13
|
from khoj.processor.operator.operator_actions import *
|
10
|
-
from khoj.processor.operator.operator_agent_base import
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
+
from khoj.processor.operator.operator_agent_base import AgentActResult, OperatorAgent
|
15
|
+
from khoj.processor.operator.operator_environment_base import (
|
16
|
+
EnvironmentType,
|
17
|
+
EnvState,
|
18
|
+
EnvStepResult,
|
14
19
|
)
|
15
|
-
from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
|
16
20
|
from khoj.utils.helpers import get_openai_async_client, is_none_or_empty
|
17
21
|
|
18
22
|
logger = logging.getLogger(__name__)
|
@@ -21,80 +25,18 @@ logger = logging.getLogger(__name__)
|
|
21
25
|
# --- OpenAI Operator Agent ---
|
22
26
|
class OpenAIOperatorAgent(OperatorAgent):
|
23
27
|
async def act(self, current_state: EnvState) -> AgentActResult:
|
24
|
-
client = get_openai_async_client(
|
25
|
-
self.vision_model.ai_model_api.api_key, self.vision_model.ai_model_api.api_base_url
|
26
|
-
)
|
27
28
|
safety_check_prefix = "Say 'continue' after resolving the following safety checks to proceed:"
|
28
29
|
safety_check_message = None
|
29
30
|
actions: List[OperatorAction] = []
|
30
31
|
action_results: List[dict] = []
|
31
32
|
self._commit_trace() # Commit trace before next action
|
32
|
-
system_prompt =
|
33
|
-
|
34
|
-
* You operate a single Chromium browser page using Playwright.
|
35
|
-
* You cannot access the OS or filesystem.
|
36
|
-
* You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more using the computer_use_preview tool.
|
37
|
-
* You can use the additional back() and goto() functions to navigate the browser.
|
38
|
-
* Always use the goto() function to navigate to a specific URL. If you see nothing, try goto duckduckgo.com
|
39
|
-
* When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
|
40
|
-
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
|
41
|
-
* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
|
42
|
-
* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
|
43
|
-
* The current URL is {current_state.url}.
|
44
|
-
</SYSTEM_CAPABILITY>
|
45
|
-
|
46
|
-
<IMPORTANT>
|
47
|
-
* You are allowed upto {self.max_iterations} iterations to complete the task.
|
48
|
-
* After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
|
49
|
-
</IMPORTANT>
|
50
|
-
"""
|
51
|
-
tools = [
|
52
|
-
{
|
53
|
-
"type": "computer_use_preview",
|
54
|
-
"display_width": 1024, # TODO: Get from env
|
55
|
-
"display_height": 768, # TODO: Get from env
|
56
|
-
"environment": "browser",
|
57
|
-
},
|
58
|
-
{
|
59
|
-
"type": "function",
|
60
|
-
"name": "back",
|
61
|
-
"description": "Go back to the previous page.",
|
62
|
-
"parameters": {},
|
63
|
-
},
|
64
|
-
{
|
65
|
-
"type": "function",
|
66
|
-
"name": "goto",
|
67
|
-
"description": "Go to a specific URL.",
|
68
|
-
"parameters": {
|
69
|
-
"type": "object",
|
70
|
-
"properties": {
|
71
|
-
"url": {
|
72
|
-
"type": "string",
|
73
|
-
"description": "Fully qualified URL to navigate to.",
|
74
|
-
},
|
75
|
-
},
|
76
|
-
"additionalProperties": False,
|
77
|
-
"required": ["url"],
|
78
|
-
},
|
79
|
-
},
|
80
|
-
]
|
81
|
-
|
33
|
+
system_prompt = self.get_instructions(self.environment_type, current_state)
|
34
|
+
tools = self.get_tools(self.environment_type, current_state)
|
82
35
|
if is_none_or_empty(self.messages):
|
83
36
|
self.messages = [AgentMessage(role="user", content=self.query)]
|
84
37
|
|
85
|
-
|
86
|
-
|
87
|
-
model="computer-use-preview",
|
88
|
-
input=messages_for_api,
|
89
|
-
instructions=system_prompt,
|
90
|
-
tools=tools,
|
91
|
-
parallel_tool_calls=False, # Keep sequential for now
|
92
|
-
max_output_tokens=4096, # TODO: Make configurable?
|
93
|
-
truncation="auto",
|
94
|
-
)
|
95
|
-
|
96
|
-
logger.debug(f"Openai response: {response.model_dump_json()}")
|
97
|
-
self.messages += [AgentMessage(role="environment", content=response.output)]
|
38
|
+
response = await self._call_model(self.vision_model, system_prompt, tools)
|
39
|
+
self.messages += [AgentMessage(role="assistant", content=response.output)]
|
98
40
|
rendered_response = self._render_response(response.output, current_state.screenshot)
|
99
41
|
|
100
42
|
last_call_id = None
|
@@ -174,6 +116,9 @@ class OpenAIOperatorAgent(OperatorAgent):
|
|
174
116
|
"summary": [],
|
175
117
|
}
|
176
118
|
)
|
119
|
+
else:
|
120
|
+
logger.warning(f"Unsupported response block type: {block.type}")
|
121
|
+
content = f"Unsupported response block type: {block.type}"
|
177
122
|
if action_to_run or content:
|
178
123
|
actions.append(action_to_run)
|
179
124
|
if action_to_run or content:
|
@@ -220,6 +165,10 @@ class OpenAIOperatorAgent(OperatorAgent):
|
|
220
165
|
elif action_result["type"] == "reasoning":
|
221
166
|
items_to_pop.append(idx) # Mark placeholder reasoning action result for removal
|
222
167
|
continue
|
168
|
+
elif action_result["type"] == "computer_call" and action_result["status"] == "in_progress":
|
169
|
+
if isinstance(result_content, dict):
|
170
|
+
result_content["status"] = "completed" # Mark in-progress actions as completed
|
171
|
+
action_result["output"] = result_content
|
223
172
|
else:
|
224
173
|
# Add text data
|
225
174
|
action_result["output"] = result_content
|
@@ -229,11 +178,45 @@ class OpenAIOperatorAgent(OperatorAgent):
|
|
229
178
|
|
230
179
|
self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]
|
231
180
|
|
181
|
+
async def summarize(self, current_state: EnvState, summarize_prompt: str = None) -> str:
|
182
|
+
summarize_prompt = summarize_prompt or self.summarize_prompt
|
183
|
+
self.messages.append(AgentMessage(role="user", content=summarize_prompt))
|
184
|
+
response = await self._call_model(self.vision_model, summarize_prompt, [])
|
185
|
+
self.messages += [AgentMessage(role="assistant", content=response.output)]
|
186
|
+
if not self.messages:
|
187
|
+
return "No actions to summarize."
|
188
|
+
return self._compile_response(self.messages[-1].content)
|
189
|
+
|
190
|
+
async def _call_model(self, model: ChatModel, system_prompt, tools) -> Response:
|
191
|
+
client = get_openai_async_client(model.ai_model_api.api_key, model.ai_model_api.api_base_url)
|
192
|
+
if tools:
|
193
|
+
model_name = "computer-use-preview"
|
194
|
+
else:
|
195
|
+
model_name = model.name
|
196
|
+
|
197
|
+
# Format messages for OpenAI API
|
198
|
+
messages_for_api = self._format_message_for_api(self.messages)
|
199
|
+
# format messages for summary if model is not computer-use-preview
|
200
|
+
if model_name != "computer-use-preview":
|
201
|
+
messages_for_api = self._format_messages_for_summary(messages_for_api)
|
202
|
+
|
203
|
+
response: Response = await client.responses.create(
|
204
|
+
model=model_name,
|
205
|
+
input=messages_for_api,
|
206
|
+
instructions=system_prompt,
|
207
|
+
tools=tools,
|
208
|
+
parallel_tool_calls=False,
|
209
|
+
truncation="auto",
|
210
|
+
)
|
211
|
+
|
212
|
+
logger.debug(f"Openai response: {response.model_dump_json()}")
|
213
|
+
return response
|
214
|
+
|
232
215
|
def _format_message_for_api(self, messages: list[AgentMessage]) -> list:
|
233
216
|
"""Format the message for OpenAI API."""
|
234
217
|
formatted_messages: list = []
|
235
218
|
for message in messages:
|
236
|
-
if message.role == "
|
219
|
+
if message.role == "assistant":
|
237
220
|
if isinstance(message.content, list):
|
238
221
|
# Remove reasoning message if not followed by computer call
|
239
222
|
if (
|
@@ -252,18 +235,23 @@ class OpenAIOperatorAgent(OperatorAgent):
|
|
252
235
|
message.content.pop(0)
|
253
236
|
formatted_messages.extend(message.content)
|
254
237
|
else:
|
255
|
-
logger.warning(f"Expected message content list from
|
238
|
+
logger.warning(f"Expected message content list from assistant, got {type(message.content)}")
|
239
|
+
elif message.role == "environment":
|
240
|
+
formatted_messages.extend(message.content)
|
256
241
|
else:
|
242
|
+
if isinstance(message.content, list):
|
243
|
+
message.content = "\n".join([part["text"] for part in message.content if part["type"] == "text"])
|
257
244
|
formatted_messages.append(
|
258
245
|
{
|
259
246
|
"role": message.role,
|
260
247
|
"content": message.content,
|
261
248
|
}
|
262
249
|
)
|
250
|
+
|
263
251
|
return formatted_messages
|
264
252
|
|
265
|
-
def
|
266
|
-
"""Compile the response from model into a single string."""
|
253
|
+
def _compile_response(self, response_content: str | list[dict | ResponseOutputItem]) -> str:
|
254
|
+
"""Compile the response from model into a single string for prompt tracing."""
|
267
255
|
# Handle case where response content is a string.
|
268
256
|
# This is the case when response content is a user query
|
269
257
|
if isinstance(response_content, str):
|
@@ -347,3 +335,123 @@ class OpenAIOperatorAgent(OperatorAgent):
|
|
347
335
|
}
|
348
336
|
|
349
337
|
return render_payload
|
338
|
+
|
339
|
+
def get_instructions(self, environment_type: EnvironmentType, current_state: EnvState) -> str:
|
340
|
+
"""Return system instructions for the OpenAI operator."""
|
341
|
+
if environment_type == EnvironmentType.BROWSER:
|
342
|
+
return dedent(
|
343
|
+
f"""
|
344
|
+
<SYSTEM_CAPABILITY>
|
345
|
+
* You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
|
346
|
+
* You operate a single Chromium browser page using Playwright.
|
347
|
+
* You cannot access the OS or filesystem.
|
348
|
+
* You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more using the computer_use_preview tool.
|
349
|
+
* You can use the additional back() and goto() functions to navigate the browser.
|
350
|
+
* Always use the goto() function to navigate to a specific URL. If you see nothing, try goto duckduckgo.com
|
351
|
+
* When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
|
352
|
+
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
|
353
|
+
* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
|
354
|
+
* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
|
355
|
+
* The current URL is {current_state.url}.
|
356
|
+
</SYSTEM_CAPABILITY>
|
357
|
+
|
358
|
+
<IMPORTANT>
|
359
|
+
* You are allowed upto {self.max_iterations} iterations to complete the task.
|
360
|
+
* After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
|
361
|
+
</IMPORTANT>
|
362
|
+
"""
|
363
|
+
).lstrip()
|
364
|
+
elif environment_type == EnvironmentType.COMPUTER:
|
365
|
+
return dedent(
|
366
|
+
f"""
|
367
|
+
<SYSTEM_CAPABILITY>
|
368
|
+
* You are Khoj, a smart computer operating assistant. You help the users accomplish their tasks using a computer.
|
369
|
+
* You can interact with the computer to perform tasks like clicking, typing, scrolling, and more using the computer_use_preview tool.
|
370
|
+
* When viewing a document or webpage it can be helpful to zoom out or scroll down to ensure you see everything before deciding something isn't available.
|
371
|
+
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
|
372
|
+
* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
|
373
|
+
* You are allowed upto {self.max_iterations} iterations to complete the task.
|
374
|
+
</SYSTEM_CAPABILITY>
|
375
|
+
|
376
|
+
<CONTEXT>
|
377
|
+
* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
|
378
|
+
</CONTEXT>
|
379
|
+
"""
|
380
|
+
).lstrip()
|
381
|
+
else:
|
382
|
+
raise ValueError(f"Unsupported environment type: {environment_type}")
|
383
|
+
|
384
|
+
def get_tools(self, environment_type: EnvironmentType, current_state: EnvState) -> list[dict]:
    """Build the tool definitions offered to the OpenAI operator.

    A single `computer_use_preview` tool is always included, sized from the
    width and height of `current_state`. Browser environments additionally
    get the `back` and `goto` function tools.

    Args:
        environment_type: Kind of environment being operated.
        current_state: Environment state; only `width` and `height` are read.

    Returns:
        A new list of tool definition dicts.
    """
    # TODO: Get OS info from the environment.
    # Until then a computer environment is assumed to run Linux. Every other
    # environment type is reported to the model as a "browser".
    environment_os = "linux" if environment_type == EnvironmentType.COMPUTER else "browser"

    computer_tool = {
        "type": "computer_use_preview",
        "display_width": current_state.width,
        "display_height": current_state.height,
        "environment": environment_os,
    }
    if environment_type != EnvironmentType.BROWSER:
        return [computer_tool]

    # Navigation helpers are only offered when operating a browser.
    back_tool = {
        "type": "function",
        "name": "back",
        "description": "Go back to the previous page.",
        "parameters": {},
    }
    goto_tool = {
        "type": "function",
        "name": "goto",
        "description": "Go to a specific URL.",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "Fully qualified URL to navigate to.",
                },
            },
            "additionalProperties": False,
            "required": ["url"],
        },
    }
    return [computer_tool, back_tool, goto_tool]
|
428
|
+
|
429
|
+
def _format_messages_for_summary(self, formatted_messages: List[dict]) -> List[dict]:
    """Rewrite operator messages as plain chat messages for non computer use models.

    The list is updated in place and also returned. Each entry is handled by shape:

    * dict with a "content" key: left untouched.
    * dict with an "output" key: replaced by a user message wrapping the output.
      When the output is a dict, its "current_url" key is dropped, as it is not
      supported for non computer operations.
    * str: replaced by a user message with a single "input_text" part.
    * anything else: passed to `self._compile_response`; a non-empty result
      becomes an assistant "output_text" message, an empty result (e.g. a
      reasoning item) removes the entry.

    Args:
        formatted_messages: Messages to convert. Mutated in place.

    Returns:
        The same list object, after conversion.
    """
    items_to_drop = []  # Indices of entries that compile to no text
    for idx, msg in enumerate(formatted_messages):
        if isinstance(msg, dict) and "content" in msg:
            continue
        elif isinstance(msg, dict) and "output" in msg:
            output = msg["output"]
            # Only dict outputs can carry a "current_url" key. Checking the type first
            # avoids a substring match followed by a TypeError on string outputs.
            # Filter into a copy so the caller's original output dict stays intact.
            if isinstance(output, dict):
                output = {key: value for key, value in output.items() if key != "current_url"}
            # NOTE(review): non-dict outputs are passed through unchanged - confirm
            # the receiving model accepts that content shape.
            formatted_messages[idx] = {"role": "user", "content": [output]}
        elif isinstance(msg, str):
            formatted_messages[idx] = {"role": "user", "content": [{"type": "input_text", "text": msg}]}
        else:
            text = self._compile_response([msg])
            if not text:
                items_to_drop.append(idx)
            else:
                formatted_messages[idx] = {
                    "role": "assistant",
                    "content": [{"type": "output_text", "text": text}],
                }

    # Pop from the end so the remaining indices stay valid
    for idx in reversed(items_to_drop):
        formatted_messages.pop(idx)

    return formatted_messages
|
@@ -1,4 +1,5 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
|
+
from enum import Enum
|
2
3
|
from typing import Literal, Optional
|
3
4
|
|
4
5
|
from pydantic import BaseModel
|
@@ -6,9 +7,18 @@ from pydantic import BaseModel
|
|
6
7
|
from khoj.processor.operator.operator_actions import OperatorAction
|
7
8
|
|
8
9
|
|
10
|
+
class EnvironmentType(Enum):
    """Kinds of environment an operator can act on."""

    # Operate a computer
    COMPUTER = "computer"
    # Operate a web browser
    BROWSER = "browser"
|
15
|
+
|
16
|
+
|
9
17
|
class EnvState(BaseModel):
    """Snapshot of an environment's state."""

    # Height of the environment's display (required)
    height: int
    # Width of the environment's display (required)
    width: int
    # Screenshot of the environment, when one is available
    screenshot: Optional[str] = None
    # Current URL; presumably left unset for environments without one - verify against callers
    url: Optional[str] = None
|
12
22
|
|
13
23
|
|
14
24
|
class EnvStepResult(BaseModel):
|
@@ -5,7 +5,7 @@ import logging
|
|
5
5
|
import os
|
6
6
|
from typing import Optional, Set, Union
|
7
7
|
|
8
|
-
from khoj.processor.operator.operator_actions import OperatorAction, Point
|
8
|
+
from khoj.processor.operator.operator_actions import DragAction, OperatorAction, Point
|
9
9
|
from khoj.processor.operator.operator_environment_base import (
|
10
10
|
Environment,
|
11
11
|
EnvState,
|
@@ -124,10 +124,10 @@ class BrowserEnvironment(Environment):
|
|
124
124
|
|
125
125
|
async def get_state(self) -> EnvState:
    """Report the current state of the browser environment.

    Returns:
        EnvState carrying this environment's `height` and `width`, together
        with the page URL and a fresh screenshot. When there is no open page
        the URL is "about:blank" and no screenshot is taken.
    """
    page = self.page
    if page and not page.is_closed():
        current_url = page.url
        screenshot = await self._get_screenshot()
        return EnvState(url=current_url, screenshot=screenshot, height=self.height, width=self.width)

    # No page, or the page was closed: report a blank state
    return EnvState(url="about:blank", screenshot=None, height=self.height, width=self.width)
|
131
131
|
|
132
132
|
async def step(self, action: OperatorAction) -> EnvStepResult:
|
133
133
|
if not self.page or self.page.is_closed():
|
@@ -246,6 +246,8 @@ class BrowserEnvironment(Environment):
|
|
246
246
|
logger.debug(f"Action: {action.type} to ({x},{y})")
|
247
247
|
|
248
248
|
case "drag":
|
249
|
+
if not isinstance(action, DragAction):
|
250
|
+
raise TypeError(f"Invalid action type for drag")
|
249
251
|
path = action.path
|
250
252
|
if not path:
|
251
253
|
error = "Missing path for drag action"
|