khoj 1.41.1.dev107__py3-none-any.whl → 1.41.1.dev144__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/database/adapters/__init__.py +20 -0
- khoj/database/models/__init__.py +1 -1
- khoj/interface/compiled/404/index.html +2 -2
- khoj/interface/compiled/_next/static/chunks/{8515-f305779d95dd5780.js → 5138-2cce449fd2454abf.js} +9 -9
- khoj/interface/compiled/_next/static/chunks/7127-d3199617463d45f0.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/{page-c9ceb9b94e24b94a.js → page-e18e67cff45758c8.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-3dc59a0df3827dc7.js → page-768a0903c4b5b06d.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-2b27c7118d8d5a16.js → page-1153981cb9c4907f.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-38f1f125d7aeb4c7.js → page-a4b97dd0c2a70cfb.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/{page-26d4492fb1200e0e.js → page-44072d929427ee56.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{page-bf1a4e488b29fceb.js → page-4e8fdd30a3238357.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-b3f7ae1ef8871d30.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-a1f10c96366c3a4f.js → page-6a4a9050c8bddae9.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-c6bde5961098facd.js → webpack-34ac812e4e4e9a50.js} +1 -1
- khoj/interface/compiled/_next/static/css/1e9b757ee2a2b34b.css +1 -0
- khoj/interface/compiled/agents/index.html +2 -2
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +2 -2
- khoj/interface/compiled/automations/index.txt +2 -2
- khoj/interface/compiled/chat/index.html +2 -2
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +2 -2
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +2 -2
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +2 -2
- khoj/interface/compiled/settings/index.txt +2 -2
- khoj/interface/compiled/share/chat/index.html +2 -2
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/processor/conversation/anthropic/anthropic_chat.py +8 -9
- khoj/processor/conversation/anthropic/utils.py +30 -7
- khoj/processor/conversation/google/gemini_chat.py +10 -10
- khoj/processor/conversation/google/utils.py +20 -12
- khoj/processor/conversation/offline/chat_model.py +2 -7
- khoj/processor/conversation/openai/gpt.py +8 -9
- khoj/processor/conversation/utils.py +132 -21
- khoj/processor/operator/README.md +59 -0
- khoj/processor/operator/{operate_browser.py → __init__.py} +98 -34
- khoj/processor/operator/grounding_agent.py +229 -175
- khoj/processor/operator/grounding_agent_uitars.py +59 -48
- khoj/processor/operator/operator_actions.py +48 -0
- khoj/processor/operator/operator_agent_anthropic.py +298 -90
- khoj/processor/operator/operator_agent_base.py +45 -14
- khoj/processor/operator/operator_agent_binary.py +125 -57
- khoj/processor/operator/operator_agent_openai.py +183 -75
- khoj/processor/operator/operator_environment_base.py +11 -1
- khoj/processor/operator/operator_environment_browser.py +5 -3
- khoj/processor/operator/operator_environment_computer.py +658 -0
- khoj/routers/api_chat.py +36 -25
- khoj/routers/helpers.py +8 -17
- khoj/routers/research.py +43 -20
- khoj/utils/constants.py +4 -4
- khoj/utils/helpers.py +12 -15
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/METADATA +3 -1
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/RECORD +61 -59
- khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e49165209d2e406c.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-64a53f8ec4afa6b3.js +0 -1
- khoj/interface/compiled/_next/static/css/bb7ea98028b368f3.css +0 -1
- /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → aJZTO0gnTwX0Dca_dPw4r}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → aJZTO0gnTwX0Dca_dPw4r}/_ssgManifest.js +0 -0
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/WHEEL +0 -0
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/entry_points.txt +0 -0
- {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/licenses/LICENSE +0 -0
@@ -125,6 +125,49 @@ class NoopAction(BaseAction):
|
|
125
125
|
type: Literal["noop"] = "noop"
|
126
126
|
|
127
127
|
|
128
|
+
# --- Text Editor Actions ---
|
129
|
+
class TextEditorViewAction(BaseAction):
    """Operator action asking the environment to show the contents of a file."""

    # Discriminator value identifying this action kind.
    type: Literal["text_editor_view"] = "text_editor_view"
    # Path of the file to view.
    path: str
    # Optional [start_line, end_line] pair; left as None when no range is requested.
    view_range: Optional[List[int]] = None
|
135
|
+
|
136
|
+
|
137
|
+
class TextEditorCreateAction(BaseAction):
    """Operator action asking the environment to create a file with the given contents."""

    # Discriminator value identifying this action kind.
    type: Literal["text_editor_create"] = "text_editor_create"
    # Path of the file to create.
    path: str
    # Full text to write into the new file.
    file_text: str
|
143
|
+
|
144
|
+
|
145
|
+
class TextEditorStrReplaceAction(BaseAction):
    """Operator action asking the environment to replace an exact string match in a file."""

    # Discriminator value identifying this action kind.
    type: Literal["text_editor_str_replace"] = "text_editor_str_replace"
    # Path of the file to edit.
    path: str
    # Exact text to look for in the file.
    old_str: str
    # Text to put in place of `old_str`.
    new_str: str
|
152
|
+
|
153
|
+
|
154
|
+
class TextEditorInsertAction(BaseAction):
    """Operator action asking the environment to insert text after a given line of a file."""

    # Discriminator value identifying this action kind.
    type: Literal["text_editor_insert"] = "text_editor_insert"
    # Path of the file to edit.
    path: str
    # Line number after which `new_str` is inserted.
    insert_line: int
    # Text to insert.
    new_str: str
|
161
|
+
|
162
|
+
|
163
|
+
class TerminalAction(BaseAction):
    """Run a command in the terminal."""

    # Discriminator value identifying this action kind.
    type: Literal["terminal"] = "terminal"
    # Command line to execute.
    command: str
    # Mirrors the terminal tool's "restart" input.
    # NOTE(review): presumably requests a fresh terminal session before running - confirm against the environment.
    restart: bool = False
|
169
|
+
|
170
|
+
|
128
171
|
OperatorAction = Union[
|
129
172
|
ClickAction,
|
130
173
|
DoubleClickAction,
|
@@ -146,4 +189,9 @@ OperatorAction = Union[
|
|
146
189
|
BackAction,
|
147
190
|
RequestUserAction,
|
148
191
|
NoopAction,
|
192
|
+
TextEditorViewAction,
|
193
|
+
TextEditorCreateAction,
|
194
|
+
TextEditorStrReplaceAction,
|
195
|
+
TextEditorInsertAction,
|
196
|
+
TerminalAction,
|
149
197
|
]
|
@@ -3,18 +3,21 @@ import json
|
|
3
3
|
import logging
|
4
4
|
from copy import deepcopy
|
5
5
|
from datetime import datetime
|
6
|
-
from
|
6
|
+
from textwrap import dedent
|
7
|
+
from typing import List, Literal, Optional, cast
|
7
8
|
|
8
|
-
from anthropic.types.beta import BetaContentBlock
|
9
|
+
from anthropic.types.beta import BetaContentBlock, BetaTextBlock, BetaToolUseBlock
|
9
10
|
|
11
|
+
from khoj.database.models import ChatModel
|
10
12
|
from khoj.processor.conversation.anthropic.utils import is_reasoning_model
|
13
|
+
from khoj.processor.conversation.utils import AgentMessage
|
11
14
|
from khoj.processor.operator.operator_actions import *
|
12
|
-
from khoj.processor.operator.operator_agent_base import
|
13
|
-
|
14
|
-
|
15
|
-
|
15
|
+
from khoj.processor.operator.operator_agent_base import AgentActResult, OperatorAgent
|
16
|
+
from khoj.processor.operator.operator_environment_base import (
|
17
|
+
EnvironmentType,
|
18
|
+
EnvState,
|
19
|
+
EnvStepResult,
|
16
20
|
)
|
17
|
-
from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
|
18
21
|
from khoj.utils.helpers import get_anthropic_async_client, is_none_or_empty
|
19
22
|
|
20
23
|
logger = logging.getLogger(__name__)
|
@@ -23,81 +26,34 @@ logger = logging.getLogger(__name__)
|
|
23
26
|
# --- Anthropic Operator Agent ---
|
24
27
|
class AnthropicOperatorAgent(OperatorAgent):
|
25
28
|
async def act(self, current_state: EnvState) -> AgentActResult:
|
26
|
-
client = get_anthropic_async_client(
|
27
|
-
self.vision_model.ai_model_api.api_key, self.vision_model.ai_model_api.api_base_url
|
28
|
-
)
|
29
|
-
betas = self.model_default_headers()
|
30
|
-
temperature = 1.0
|
31
29
|
actions: List[OperatorAction] = []
|
32
30
|
action_results: List[dict] = []
|
33
31
|
self._commit_trace() # Commit trace before next action
|
34
32
|
|
35
|
-
system_prompt =
|
36
|
-
|
37
|
-
|
38
|
-
* You cannot access the OS or filesystem.
|
39
|
-
* You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more.
|
40
|
-
* You can use the additional back() and goto() helper functions to ease navigating the browser. If you see nothing, try goto duckduckgo.com
|
41
|
-
* When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
|
42
|
-
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
|
43
|
-
* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
|
44
|
-
* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
|
45
|
-
* The current URL is {current_state.url}.
|
46
|
-
</SYSTEM_CAPABILITY>
|
47
|
-
|
48
|
-
<IMPORTANT>
|
49
|
-
* You are allowed upto {self.max_iterations} iterations to complete the task.
|
50
|
-
* Do not loop on wait, screenshot for too many turns without taking any action.
|
51
|
-
* After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
|
52
|
-
</IMPORTANT>
|
53
|
-
"""
|
33
|
+
system_prompt = self.get_instructions(self.environment_type, current_state)
|
34
|
+
tools = self.get_tools(self.environment_type, current_state)
|
35
|
+
|
54
36
|
if is_none_or_empty(self.messages):
|
55
37
|
self.messages = [AgentMessage(role="user", content=self.query)]
|
56
38
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
"display_width_px": 1024,
|
62
|
-
"display_height_px": 768,
|
63
|
-
}, # TODO: Get from env
|
64
|
-
{
|
65
|
-
"name": "back",
|
66
|
-
"description": "Go back to the previous page.",
|
67
|
-
"input_schema": {"type": "object", "properties": {}},
|
68
|
-
},
|
69
|
-
{
|
70
|
-
"name": "goto",
|
71
|
-
"description": "Go to a specific URL.",
|
72
|
-
"input_schema": {
|
73
|
-
"type": "object",
|
74
|
-
"properties": {"url": {"type": "string", "description": "Fully qualified URL to navigate to."}},
|
75
|
-
"required": ["url"],
|
76
|
-
},
|
77
|
-
},
|
78
|
-
]
|
39
|
+
# Trigger trajectory compression if exceed size limit
|
40
|
+
if len(self.messages) > self.message_limit:
|
41
|
+
logger.debug("Compacting operator trajectory.")
|
42
|
+
await self._compress()
|
79
43
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
messages_for_api = self._format_message_for_api(self.messages)
|
85
|
-
response = await client.beta.messages.create(
|
86
|
-
messages=messages_for_api,
|
87
|
-
model=self.vision_model.name,
|
88
|
-
system=system_prompt,
|
44
|
+
response_content = await self._call_model(
|
45
|
+
messages=self.messages,
|
46
|
+
model=self.vision_model,
|
47
|
+
system_prompt=system_prompt,
|
89
48
|
tools=tools,
|
90
|
-
|
91
|
-
thinking=thinking,
|
92
|
-
max_tokens=4096, # TODO: Make configurable?
|
93
|
-
temperature=temperature,
|
49
|
+
headers=self.model_default_headers(),
|
94
50
|
)
|
95
51
|
|
96
|
-
|
97
|
-
self.
|
98
|
-
rendered_response = self._render_response(response.content, current_state.screenshot)
|
52
|
+
self.messages.append(AgentMessage(role="assistant", content=response_content))
|
53
|
+
rendered_response = self._render_response(response_content, current_state.screenshot)
|
99
54
|
|
100
|
-
|
55
|
+
# Parse actions from response
|
56
|
+
for block in response_content:
|
101
57
|
if block.type == "tool_use":
|
102
58
|
content = None
|
103
59
|
is_error = False
|
@@ -179,6 +135,40 @@ class AnthropicOperatorAgent(OperatorAgent):
|
|
179
135
|
logger.warning("Goto tool called without URL.")
|
180
136
|
elif tool_name == "back":
|
181
137
|
action_to_run = BackAction()
|
138
|
+
elif tool_name == self.model_default_tool("terminal")["name"]:
|
139
|
+
command = tool_input.get("command")
|
140
|
+
restart = tool_input.get("restart", False)
|
141
|
+
if command:
|
142
|
+
action_to_run = TerminalAction(command=command, restart=restart)
|
143
|
+
elif tool_name == "str_replace_based_edit_tool":
|
144
|
+
# Handle text editor tool calls
|
145
|
+
command = tool_input.get("command")
|
146
|
+
if command == "view":
|
147
|
+
path = tool_input.get("path")
|
148
|
+
view_range = tool_input.get("view_range")
|
149
|
+
if path:
|
150
|
+
action_to_run = TextEditorViewAction(path=path, view_range=view_range)
|
151
|
+
elif command == "create":
|
152
|
+
path = tool_input.get("path")
|
153
|
+
file_text = tool_input.get("file_text", "")
|
154
|
+
if path:
|
155
|
+
action_to_run = TextEditorCreateAction(path=path, file_text=file_text)
|
156
|
+
elif command == "str_replace":
|
157
|
+
path = tool_input.get("path")
|
158
|
+
old_str = tool_input.get("old_str")
|
159
|
+
new_str = tool_input.get("new_str")
|
160
|
+
if path and old_str is not None and new_str is not None:
|
161
|
+
action_to_run = TextEditorStrReplaceAction(path=path, old_str=old_str, new_str=new_str)
|
162
|
+
elif command == "insert":
|
163
|
+
path = tool_input.get("path")
|
164
|
+
insert_line = tool_input.get("insert_line")
|
165
|
+
new_str = tool_input.get("new_str")
|
166
|
+
if path and insert_line is not None and new_str is not None:
|
167
|
+
action_to_run = TextEditorInsertAction(
|
168
|
+
path=path, insert_line=insert_line, new_str=new_str
|
169
|
+
)
|
170
|
+
else:
|
171
|
+
logger.warning(f"Unsupported text editor command: {command}")
|
182
172
|
else:
|
183
173
|
logger.warning(f"Unsupported Anthropic computer action type: {tool_name}")
|
184
174
|
|
@@ -200,14 +190,6 @@ class AnthropicOperatorAgent(OperatorAgent):
|
|
200
190
|
}
|
201
191
|
)
|
202
192
|
|
203
|
-
self._update_usage(
|
204
|
-
response.usage.input_tokens,
|
205
|
-
response.usage.output_tokens,
|
206
|
-
response.usage.cache_read_input_tokens,
|
207
|
-
response.usage.cache_creation_input_tokens,
|
208
|
-
)
|
209
|
-
self.tracer["temperature"] = temperature
|
210
|
-
|
211
193
|
return AgentActResult(
|
212
194
|
actions=actions,
|
213
195
|
action_results=action_results,
|
@@ -240,18 +222,19 @@ class AnthropicOperatorAgent(OperatorAgent):
|
|
240
222
|
if env_step.error:
|
241
223
|
action_result["is_error"] = True
|
242
224
|
|
243
|
-
# Append tool results to the message history
|
244
|
-
self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]
|
245
|
-
|
246
|
-
# Mark the final tool result as a cache break point
|
247
|
-
agent_action.action_results[-1]["cache_control"] = {"type": "ephemeral"}
|
248
225
|
# Remove previous cache controls
|
249
226
|
for msg in self.messages:
|
250
|
-
if
|
227
|
+
if isinstance(msg.content, list):
|
251
228
|
for block in msg.content:
|
252
229
|
if isinstance(block, dict) and "cache_control" in block:
|
253
230
|
del block["cache_control"]
|
254
231
|
|
232
|
+
# Mark the final tool result as a cache break point
|
233
|
+
agent_action.action_results[-1]["cache_control"] = {"type": "ephemeral"}
|
234
|
+
|
235
|
+
# Append tool results to the message history
|
236
|
+
self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]
|
237
|
+
|
255
238
|
def _format_message_for_api(self, messages: list[AgentMessage]) -> list[dict]:
|
256
239
|
"""Format Anthropic response into a single string."""
|
257
240
|
formatted_messages = []
|
@@ -270,7 +253,7 @@ class AnthropicOperatorAgent(OperatorAgent):
|
|
270
253
|
)
|
271
254
|
return formatted_messages
|
272
255
|
|
273
|
-
def
|
256
|
+
def _compile_response(self, response_content: list[BetaContentBlock | dict] | str) -> str:
|
274
257
|
"""Compile Anthropic response into a single string."""
|
275
258
|
if isinstance(response_content, str):
|
276
259
|
return response_content
|
@@ -288,7 +271,11 @@ class AnthropicOperatorAgent(OperatorAgent):
|
|
288
271
|
compiled_response.append(block.text)
|
289
272
|
elif block.type == "tool_use":
|
290
273
|
block_input = {"action": block.name}
|
291
|
-
if block.name
|
274
|
+
if block.name in (
|
275
|
+
self.model_default_tool("computer")["name"],
|
276
|
+
self.model_default_tool("editor")["name"],
|
277
|
+
self.model_default_tool("terminal")["name"],
|
278
|
+
):
|
292
279
|
block_input = block.input # Computer action details are in input dict
|
293
280
|
elif block.name == "goto":
|
294
281
|
block_input["url"] = block.input.get("url", "[Missing URL]")
|
@@ -345,7 +332,34 @@ class AnthropicOperatorAgent(OperatorAgent):
|
|
345
332
|
else:
|
346
333
|
# Handle other actions
|
347
334
|
render_texts += [f"{action.capitalize()}"]
|
348
|
-
|
335
|
+
elif block.name == self.model_default_tool("editor")["name"]:
|
336
|
+
# Handle text editor actions
|
337
|
+
command = block.input.get("command")
|
338
|
+
if command == "view":
|
339
|
+
path = block.input.get("path")
|
340
|
+
view_range = block.input.get("view_range")
|
341
|
+
if path:
|
342
|
+
render_texts += [f"View file: {path} (lines {view_range})"]
|
343
|
+
elif command == "create":
|
344
|
+
path = block.input.get("path")
|
345
|
+
file_text = block.input.get("file_text", "")
|
346
|
+
if path:
|
347
|
+
render_texts += [f"Create file: {path} with content:\n{file_text}"]
|
348
|
+
elif command == "str_replace":
|
349
|
+
path = block.input.get("path")
|
350
|
+
old_str = block.input.get("old_str")
|
351
|
+
new_str = block.input.get("new_str")
|
352
|
+
if path and old_str is not None and new_str is not None:
|
353
|
+
render_texts += [f"File: {path}\n**Find**\n{old_str}\n**Replace**\n{new_str}'"]
|
354
|
+
elif command == "insert":
|
355
|
+
path = block.input.get("path")
|
356
|
+
insert_line = block.input.get("insert_line")
|
357
|
+
new_str = block.input.get("new_str")
|
358
|
+
if path and insert_line is not None and new_str is not None:
|
359
|
+
render_texts += [f"In file: {path} at line {insert_line} insert\n{new_str}"]
|
360
|
+
render_texts += [f"Edit file: {block.input['path']}"]
|
361
|
+
elif block.name == self.model_default_tool("terminal")["name"]:
|
362
|
+
render_texts += [f"Run command:\n{block.input['command']}"]
|
349
363
|
# If screenshot is not available when screenshot action was requested
|
350
364
|
if isinstance(block.input, dict) and block.input.get("action") == "screenshot" and not screenshot:
|
351
365
|
render_texts += ["Failed to get screenshot"]
|
@@ -365,6 +379,107 @@ class AnthropicOperatorAgent(OperatorAgent):
|
|
365
379
|
|
366
380
|
return render_payload
|
367
381
|
|
382
|
+
async def _call_model(
    self,
    messages: list[AgentMessage],
    model: ChatModel,
    system_prompt: str,
    tools: Optional[list[dict]] = None,
    headers: Optional[list[str]] = None,
    temperature: float = 1.0,
    max_tokens: int = 4096,
) -> list[BetaContentBlock]:
    """Send the conversation to the Anthropic beta messages API and return the response content blocks.

    Args:
        messages: Conversation to send. Formatted via `_format_message_for_api` before the call.
        model: Chat model whose name and API credentials are used for the request.
        system_prompt: System instructions. Sent as a single cacheable text block.
        tools: Optional tool definitions. The last one is sent with a cache break point.
        headers: Optional beta feature flags, forwarded as the `betas` request argument.
        temperature: Sampling temperature. Also recorded in the tracer.
        max_tokens: Maximum tokens to generate.

    Returns:
        The response content blocks. When the API call fails, a single text block
        holding a "Communication Error: ..." message is returned instead of raising.
    """
    client = get_anthropic_async_client(model.ai_model_api.api_key, model.ai_model_api.api_base_url)
    system = [{"type": "text", "text": system_prompt, "cache_control": {"type": "ephemeral"}}]

    thinking: dict[str, str | int] = {"type": "disabled"}
    if is_reasoning_model(model.name):
        thinking = {"type": "enabled", "budget_tokens": 1024}

    kwargs: dict = {}
    if headers:
        kwargs["betas"] = headers
    if tools:
        # Mark last tool as cache break point. Done on a copy so the caller's tool definitions stay untouched.
        last_tool = {**tools[-1], "cache_control": {"type": "ephemeral"}}
        kwargs["tools"] = [*tools[:-1], last_tool]

    messages_for_api = self._format_message_for_api(messages)
    response = None
    try:
        response = await client.beta.messages.create(
            messages=messages_for_api,
            model=model.name,
            system=system,
            thinking=thinking,
            max_tokens=max_tokens,
            temperature=temperature,
            **kwargs,
        )
        response_content = response.content
    except Exception as e:
        # Surface the failure to the agent loop as a text block instead of raising
        logger.error(f"Error during Anthropic API call: {e}")
        error_str = e.message if hasattr(e, "message") else str(e)
        response = None
        response_content = [BetaTextBlock(text=f"Communication Error: {error_str}", type="text")]

    if response:
        logger.debug(f"Anthropic response: {response.model_dump_json()}")
        self._update_usage(
            response.usage.input_tokens,
            response.usage.output_tokens,
            response.usage.cache_read_input_tokens,
            response.usage.cache_creation_input_tokens,
        )
    self.tracer["temperature"] = temperature
    return response_content
|
433
|
+
|
434
|
+
async def _compress(self):
    """Condense the oldest part of the message history into a model-written summary.

    The first `self.compress_length` messages are summarized by the model. The history
    is then rebuilt as: the first (primary task) message, the summary request and the
    summary response, followed by the messages that were not summarized.
    """

    def _has_block(content, block_type: str) -> bool:
        """Check if message content holds a block of the given type, as an SDK object or a plain dict."""
        if not isinstance(content, list):
            # Plain string content carries no tool blocks
            return False
        for block in content:
            kind = block.get("type") if isinstance(block, dict) else getattr(block, "type", None)
            if kind == block_type:
                return True
        return False

    # 1. Prepare messages for compression
    original_messages = list(self.messages)
    messages_to_summarize = self.messages[: self.compress_length]
    # ensure last message isn't a tool call request
    if (
        messages_to_summarize
        and messages_to_summarize[-1].role == "assistant"
        and _has_block(messages_to_summarize[-1].content, "tool_use")
    ):
        messages_to_summarize.pop()

    summarize_prompt = f"Summarize your research and computer use till now to help answer my query:\n{self.query}"
    summarize_message = AgentMessage(role="user", content=summarize_prompt)
    system_prompt = dedent(
        """
        You are a computer operator with meticulous communication skills. You can condense your partial computer use traces and research into an appropriately detailed summary.
        When requested summarize your key actions, results and findings until now to achieve the user specified task.
        Your summary should help you remember the key information required to both complete the task and later generate a final report.
        """
    )

    # 2. Get summary of operation trajectory
    try:
        response_content = await self._call_model(
            messages=messages_to_summarize + [summarize_message],
            model=self.vision_model,
            system_prompt=system_prompt,
            max_tokens=8192,
        )
    except Exception as e:
        # create a response block with error message
        logger.error(f"Error during Anthropic API call: {e}")
        error_str = e.message if hasattr(e, "message") else str(e)
        response_content = [BetaTextBlock(text=f"Communication Error: {error_str}", type="text")]

    summary_message = AgentMessage(role="assistant", content=response_content)

    # 3. Rebuild message history with condensed trajectory
    primary_task = original_messages[:1]
    condensed_trajectory = [summarize_message, summary_message]
    recent_trajectory = original_messages[self.compress_length :]
    # ensure first message isn't a tool result, as its tool call request was condensed away
    if (
        recent_trajectory
        and recent_trajectory[0].role == "environment"
        and _has_block(recent_trajectory[0].content, "tool_result")
    ):
        recent_trajectory.pop(0)

    self.messages = primary_task + condensed_trajectory + recent_trajectory
|
482
|
+
|
368
483
|
def get_coordinates(self, tool_input: dict, key: str = "coordinate") -> Optional[list | tuple]:
|
369
484
|
"""Get coordinates from tool input."""
|
370
485
|
raw_coord = tool_input.get(key)
|
@@ -382,14 +497,22 @@ class AnthropicOperatorAgent(OperatorAgent):
|
|
382
497
|
|
383
498
|
return coord
|
384
499
|
|
385
|
-
def model_default_tool(self, tool_type: Literal["computer", "editor", "terminal"]) -> str:
|
500
|
+
def model_default_tool(self, tool_type: Literal["computer", "editor", "terminal"]) -> dict[str, str]:
    """Get the default tool of specified type for the given model.

    Args:
        tool_type: Kind of built-in tool to look up.

    Returns:
        A dict with the tool's "name" and its versioned "type" identifier.

    Raises:
        ValueError: If the vision model or the tool type is not supported.
    """
    model_name = self.vision_model.name
    default_tools: dict[str, dict[str, str]] = {}
    if model_name.startswith("claude-3-7-sonnet"):
        default_tools = {
            "computer": {"name": "computer", "type": "computer_20250124"},
            "editor": {"name": "str_replace_editor", "type": "text_editor_20250124"},
            # "bash" is the tool name; the dated identifier is the tool type
            "terminal": {"name": "bash", "type": "bash_20250124"},
        }
    elif model_name.startswith(("claude-sonnet-4", "claude-opus-4")):
        default_tools = {
            "computer": {"name": "computer", "type": "computer_20250124"},
            "editor": {"name": "str_replace_based_edit_tool", "type": "text_editor_20250429"},
            "terminal": {"name": "bash", "type": "bash_20250124"},
        }

    if tool_type in default_tools:
        # Hand out a copy so callers cannot alter the lookup result for one another
        return dict(default_tools[tool_type])
    raise ValueError(f"Unsupported tool type for model '{self.vision_model.name}': {tool_type}")
|
394
517
|
|
395
518
|
def model_default_headers(self) -> list[str]:
|
@@ -400,3 +523,88 @@ class AnthropicOperatorAgent(OperatorAgent):
|
|
400
523
|
return ["computer-use-2025-01-24"]
|
401
524
|
else:
|
402
525
|
return []
|
526
|
+
|
527
|
+
def get_instructions(self, environment_type: EnvironmentType, current_state: EnvState) -> str:
    """Return system instructions for the Anthropic operator.

    Args:
        environment_type: Environment being operated. Browser and computer are supported.
        current_state: Current environment state. Its URL is included in the browser instructions.

    Returns:
        The system prompt for the given environment type.

    Raises:
        ValueError: If the environment type is not supported.
    """
    # Build the date without the platform-specific "%-d" strftime directive
    today = datetime.today()
    current_date = f"{today:%A, %B} {today.day}, {today.year}"

    if environment_type == EnvironmentType.BROWSER:
        return dedent(
            f"""
            <SYSTEM_CAPABILITY>
            * You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
            * You operate a Chromium browser using Playwright via the 'computer' tool.
            * You cannot access the OS or filesystem.
            * You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more.
            * You can use the additional back() and goto() helper functions to ease navigating the browser. If you see nothing, try goto duckduckgo.com
            * When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
            * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
            * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
            * The current date is {current_date}.
            * The current URL is {current_state.url}.
            </SYSTEM_CAPABILITY>

            <IMPORTANT>
            * You are allowed upto {self.max_iterations} iterations to complete the task.
            * Do not loop on wait, screenshot for too many turns without taking any action.
            * After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
            </IMPORTANT>
            """
        ).lstrip()
    elif environment_type == EnvironmentType.COMPUTER:
        return dedent(
            f"""
            <SYSTEM_CAPABILITY>
            * You are Khoj, a smart computer operating assistant. You help the users accomplish tasks using a computer.
            * You can interact with the computer to perform tasks like clicking, typing, scrolling, and more.
            * When viewing a document or webpage it can be helpful to zoom out or scroll down to ensure you see everything before deciding something isn't available.
            * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
            * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
            * Do not loop on wait, screenshot for too many turns without taking any action.
            * You are allowed upto {self.max_iterations} iterations to complete the task.
            </SYSTEM_CAPABILITY>

            <CONTEXT>
            * The current date is {current_date}.
            </CONTEXT>
            """
        ).lstrip()
    else:
        raise ValueError(f"Unsupported environment type for Anthropic operator: {environment_type}")
|
572
|
+
|
573
|
+
def get_tools(self, environment: EnvironmentType, current_state: EnvState) -> list[dict]:
    """Return the tools available for the Anthropic operator.

    Args:
        environment: Environment being operated. The browser environment gets extra navigation tools.
        current_state: Current environment state. Its width and height size the computer tool's display.

    Returns:
        Tool definitions to pass to the model: the computer, editor and terminal tools,
        plus the `back` and `goto` helpers when operating a browser.
    """
    computer_tool = self.model_default_tool("computer")
    editor_tool = self.model_default_tool("editor")
    terminal_tool = self.model_default_tool("terminal")

    tools: list[dict] = [
        {
            "type": computer_tool["type"],
            "name": computer_tool["name"],
            "display_width_px": current_state.width,
            "display_height_px": current_state.height,
        },
        {"type": editor_tool["type"], "name": editor_tool["name"]},
        {"type": terminal_tool["type"], "name": terminal_tool["name"]},
    ]

    # Accept both the enum member and its raw string value.
    # NOTE(review): assumes the browser member's value is "browser" - confirm against EnvironmentType.
    if getattr(environment, "value", environment) == "browser":
        tools += [
            {
                "name": "back",
                "description": "Go back to the previous page.",
                "input_schema": {"type": "object", "properties": {}},
            },
            {
                "name": "goto",
                "description": "Go to a specific URL.",
                "input_schema": {
                    "type": "object",
                    "properties": {"url": {"type": "string", "description": "Fully qualified URL to navigate to."}},
                    "required": ["url"],
                },
            },
        ]

    return tools
|
@@ -5,9 +5,17 @@ from typing import List, Literal, Optional, Union
|
|
5
5
|
from pydantic import BaseModel
|
6
6
|
|
7
7
|
from khoj.database.models import ChatModel
|
8
|
-
from khoj.processor.conversation.utils import
|
8
|
+
from khoj.processor.conversation.utils import (
|
9
|
+
AgentMessage,
|
10
|
+
OperatorRun,
|
11
|
+
commit_conversation_trace,
|
12
|
+
)
|
9
13
|
from khoj.processor.operator.operator_actions import OperatorAction
|
10
|
-
from khoj.processor.operator.operator_environment_base import
|
14
|
+
from khoj.processor.operator.operator_environment_base import (
|
15
|
+
EnvironmentType,
|
16
|
+
EnvState,
|
17
|
+
EnvStepResult,
|
18
|
+
)
|
11
19
|
from khoj.utils.helpers import get_chat_usage_metrics, is_promptrace_enabled
|
12
20
|
|
13
21
|
logger = logging.getLogger(__name__)
|
@@ -19,18 +27,41 @@ class AgentActResult(BaseModel):
|
|
19
27
|
rendered_response: Optional[dict] = None
|
20
28
|
|
21
29
|
|
22
|
-
class AgentMessage(BaseModel):
|
23
|
-
role: Literal["user", "assistant", "system", "environment"]
|
24
|
-
content: Union[str, List]
|
25
|
-
|
26
|
-
|
27
30
|
class OperatorAgent(ABC):
|
28
|
-
def __init__(
|
31
|
+
def __init__(
    self,
    query: str,
    vision_model: ChatModel,
    environment_type: EnvironmentType,
    max_iterations: int,
    max_context: int,
    chat_history: Optional[List[AgentMessage]] = None,
    previous_trajectory: Optional[OperatorRun] = None,
    tracer: Optional[dict] = None,
):
    """Set up the operator agent's task, message history and context compression limits.

    Args:
        query: Task the operator should accomplish.
        vision_model: Chat model used to drive the operator.
        environment_type: Kind of environment the agent operates.
        max_iterations: Maximum number of iterations allowed for the task.
        max_context: Max context size of the model. Scales how many turns are kept before compression.
        chat_history: Optional earlier messages to start from. The passed list is copied, not modified.
        previous_trajectory: Optional earlier operator run to continue from. It is not modified.
        tracer: Optional dict used to record tracing data. The passed dict is kept by reference.
    """
    self.query = query
    self.vision_model = vision_model
    self.environment_type = environment_type
    self.max_iterations = max_iterations
    # Keep the caller's tracer object (even when empty) so updates stay visible to them
    self.tracer = tracer if tracer is not None else {}
    self.summarize_prompt = f"Use the results of our research to provide a comprehensive, self-contained answer for the target query:\n{query}."

    # Work on a copy so neither a shared default nor the caller's history gets extended in place
    self.messages: List[AgentMessage] = list(chat_history or [])
    if previous_trajectory:
        trajectory = list(previous_trajectory.trajectory or [])
        # Remove tool call from previous trajectory as tool call w/o result not supported
        if trajectory and trajectory[-1].role == "assistant":
            trajectory.pop()
        self.messages += trajectory
    self.messages += [AgentMessage(role="user", content=query)]

    # Context compression parameters
    self.context_compress_trigger = 2e3  # heuristic to determine compression trigger
    # turns after which compression triggered. scales with model max context size. Minimum 5 turns.
    self.message_limit = 2 * max(5, int(max_context / self.context_compress_trigger))
    # compression ratio determines how many messages to compress down to one
    # e.g. if 5 messages, a compress ratio of 4/5 means compress 5 messages into 1 + keep 1 uncompressed
    self.message_compress_ratio = 4 / 5
    self.compress_length = int(self.message_limit * self.message_compress_ratio)
|
34
65
|
|
35
66
|
@abstractmethod
|
36
67
|
async def act(self, current_state: EnvState) -> AgentActResult:
|
@@ -41,16 +72,17 @@ class OperatorAgent(ABC):
|
|
41
72
|
"""Track results of agent actions on the environment."""
|
42
73
|
pass
|
43
74
|
|
44
|
-
async def summarize(self,
|
75
|
+
async def summarize(self, current_state: EnvState, summarize_prompt: str = None) -> str:
    """Ask the agent for a summary and return its compiled response.

    The given prompt, or `self.summarize_prompt` when none is passed, is appended as a
    user message. The agent then acts once on the current state and its latest message
    is compiled into a string.
    """
    prompt = summarize_prompt if summarize_prompt else self.summarize_prompt
    self.messages.append(AgentMessage(role="user", content=prompt))
    await self.act(current_state)
    if self.messages:
        latest_message = self.messages[-1]
        return self._compile_response(latest_message.content)
    return "No actions to summarize."
|
51
83
|
|
52
84
|
@abstractmethod
|
53
|
-
def
|
85
|
+
def _compile_response(self, response: List | str) -> str:
|
54
86
|
pass
|
55
87
|
|
56
88
|
@abstractmethod
|
@@ -65,13 +97,12 @@ class OperatorAgent(ABC):
|
|
65
97
|
self.tracer["usage"] = get_chat_usage_metrics(
|
66
98
|
self.vision_model.name, input_tokens, output_tokens, cache_read, cache_write, usage=self.tracer.get("usage")
|
67
99
|
)
|
68
|
-
logger.debug(f"Operator usage by {self.vision_model.model_type}: {self.tracer['usage']}")
|
69
100
|
|
70
101
|
def _commit_trace(self):
|
71
102
|
self.tracer["chat_model"] = self.vision_model.name
|
72
103
|
if is_promptrace_enabled() and len(self.messages) > 1:
|
73
104
|
compiled_messages = [
|
74
|
-
AgentMessage(role=msg.role, content=self.
|
105
|
+
AgentMessage(role=msg.role, content=self._compile_response(msg.content)) for msg in self.messages
|
75
106
|
]
|
76
107
|
commit_conversation_trace(compiled_messages[:-1], compiled_messages[-1].content, self.tracer)
|
77
108
|
|