khoj 1.41.1.dev107__py3-none-any.whl → 1.41.1.dev144__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. khoj/database/adapters/__init__.py +20 -0
  2. khoj/database/models/__init__.py +1 -1
  3. khoj/interface/compiled/404/index.html +2 -2
  4. khoj/interface/compiled/_next/static/chunks/{8515-f305779d95dd5780.js → 5138-2cce449fd2454abf.js} +9 -9
  5. khoj/interface/compiled/_next/static/chunks/7127-d3199617463d45f0.js +1 -0
  6. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/agents/{page-c9ceb9b94e24b94a.js → page-e18e67cff45758c8.js} +1 -1
  8. khoj/interface/compiled/_next/static/chunks/app/automations/{page-3dc59a0df3827dc7.js → page-768a0903c4b5b06d.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/app/chat/{page-2b27c7118d8d5a16.js → page-1153981cb9c4907f.js} +1 -1
  10. khoj/interface/compiled/_next/static/chunks/app/{page-38f1f125d7aeb4c7.js → page-a4b97dd0c2a70cfb.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/search/{page-26d4492fb1200e0e.js → page-44072d929427ee56.js} +1 -1
  12. khoj/interface/compiled/_next/static/chunks/app/settings/{page-bf1a4e488b29fceb.js → page-4e8fdd30a3238357.js} +1 -1
  13. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-b3f7ae1ef8871d30.js +1 -0
  14. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-a1f10c96366c3a4f.js → page-6a4a9050c8bddae9.js} +1 -1
  15. khoj/interface/compiled/_next/static/chunks/{webpack-c6bde5961098facd.js → webpack-34ac812e4e4e9a50.js} +1 -1
  16. khoj/interface/compiled/_next/static/css/1e9b757ee2a2b34b.css +1 -0
  17. khoj/interface/compiled/agents/index.html +2 -2
  18. khoj/interface/compiled/agents/index.txt +2 -2
  19. khoj/interface/compiled/automations/index.html +2 -2
  20. khoj/interface/compiled/automations/index.txt +2 -2
  21. khoj/interface/compiled/chat/index.html +2 -2
  22. khoj/interface/compiled/chat/index.txt +2 -2
  23. khoj/interface/compiled/index.html +2 -2
  24. khoj/interface/compiled/index.txt +2 -2
  25. khoj/interface/compiled/search/index.html +2 -2
  26. khoj/interface/compiled/search/index.txt +2 -2
  27. khoj/interface/compiled/settings/index.html +2 -2
  28. khoj/interface/compiled/settings/index.txt +2 -2
  29. khoj/interface/compiled/share/chat/index.html +2 -2
  30. khoj/interface/compiled/share/chat/index.txt +2 -2
  31. khoj/processor/conversation/anthropic/anthropic_chat.py +8 -9
  32. khoj/processor/conversation/anthropic/utils.py +30 -7
  33. khoj/processor/conversation/google/gemini_chat.py +10 -10
  34. khoj/processor/conversation/google/utils.py +20 -12
  35. khoj/processor/conversation/offline/chat_model.py +2 -7
  36. khoj/processor/conversation/openai/gpt.py +8 -9
  37. khoj/processor/conversation/utils.py +132 -21
  38. khoj/processor/operator/README.md +59 -0
  39. khoj/processor/operator/{operate_browser.py → __init__.py} +98 -34
  40. khoj/processor/operator/grounding_agent.py +229 -175
  41. khoj/processor/operator/grounding_agent_uitars.py +59 -48
  42. khoj/processor/operator/operator_actions.py +48 -0
  43. khoj/processor/operator/operator_agent_anthropic.py +298 -90
  44. khoj/processor/operator/operator_agent_base.py +45 -14
  45. khoj/processor/operator/operator_agent_binary.py +125 -57
  46. khoj/processor/operator/operator_agent_openai.py +183 -75
  47. khoj/processor/operator/operator_environment_base.py +11 -1
  48. khoj/processor/operator/operator_environment_browser.py +5 -3
  49. khoj/processor/operator/operator_environment_computer.py +658 -0
  50. khoj/routers/api_chat.py +36 -25
  51. khoj/routers/helpers.py +8 -17
  52. khoj/routers/research.py +43 -20
  53. khoj/utils/constants.py +4 -4
  54. khoj/utils/helpers.py +12 -15
  55. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/METADATA +3 -1
  56. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/RECORD +61 -59
  57. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +0 -1
  58. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e49165209d2e406c.js +0 -1
  59. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-64a53f8ec4afa6b3.js +0 -1
  60. khoj/interface/compiled/_next/static/css/bb7ea98028b368f3.css +0 -1
  61. /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → aJZTO0gnTwX0Dca_dPw4r}/_buildManifest.js +0 -0
  62. /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → aJZTO0gnTwX0Dca_dPw4r}/_ssgManifest.js +0 -0
  63. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/WHEEL +0 -0
  64. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/entry_points.txt +0 -0
  65. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/licenses/LICENSE +0 -0
@@ -125,6 +125,49 @@ class NoopAction(BaseAction):
125
125
  type: Literal["noop"] = "noop"
126
126
 
127
127
 
128
+ # --- Text Editor Actions ---
129
+ class TextEditorViewAction(BaseAction):
130
+ """View contents of a file."""
131
+
132
+ type: Literal["text_editor_view"] = "text_editor_view"
133
+ path: str
134
+ view_range: Optional[List[int]] = None # [start_line, end_line]
135
+
136
+
137
+ class TextEditorCreateAction(BaseAction):
138
+ """Create a new file with specified contents."""
139
+
140
+ type: Literal["text_editor_create"] = "text_editor_create"
141
+ path: str
142
+ file_text: str
143
+
144
+
145
+ class TextEditorStrReplaceAction(BaseAction):
146
+ """Execute an exact string match replacement on a file."""
147
+
148
+ type: Literal["text_editor_str_replace"] = "text_editor_str_replace"
149
+ path: str
150
+ old_str: str
151
+ new_str: str
152
+
153
+
154
+ class TextEditorInsertAction(BaseAction):
155
+ """Insert new text after a specified line number."""
156
+
157
+ type: Literal["text_editor_insert"] = "text_editor_insert"
158
+ path: str
159
+ insert_line: int
160
+ new_str: str
161
+
162
+
163
class TerminalAction(BaseAction):
    """Run a command in the terminal."""

    type: Literal["terminal"] = "terminal"
    # Command line to execute
    command: str
    # NOTE(review): presumably asks the environment to restart the terminal session; confirm against the environment implementation
    restart: bool = False
169
+
170
+
128
171
  OperatorAction = Union[
129
172
  ClickAction,
130
173
  DoubleClickAction,
@@ -146,4 +189,9 @@ OperatorAction = Union[
146
189
  BackAction,
147
190
  RequestUserAction,
148
191
  NoopAction,
192
+ TextEditorViewAction,
193
+ TextEditorCreateAction,
194
+ TextEditorStrReplaceAction,
195
+ TextEditorInsertAction,
196
+ TerminalAction,
149
197
  ]
@@ -3,18 +3,21 @@ import json
3
3
  import logging
4
4
  from copy import deepcopy
5
5
  from datetime import datetime
6
- from typing import List, Optional, cast
6
+ from textwrap import dedent
7
+ from typing import List, Literal, Optional, cast
7
8
 
8
- from anthropic.types.beta import BetaContentBlock
9
+ from anthropic.types.beta import BetaContentBlock, BetaTextBlock, BetaToolUseBlock
9
10
 
11
+ from khoj.database.models import ChatModel
10
12
  from khoj.processor.conversation.anthropic.utils import is_reasoning_model
13
+ from khoj.processor.conversation.utils import AgentMessage
11
14
  from khoj.processor.operator.operator_actions import *
12
- from khoj.processor.operator.operator_agent_base import (
13
- AgentActResult,
14
- AgentMessage,
15
- OperatorAgent,
15
+ from khoj.processor.operator.operator_agent_base import AgentActResult, OperatorAgent
16
+ from khoj.processor.operator.operator_environment_base import (
17
+ EnvironmentType,
18
+ EnvState,
19
+ EnvStepResult,
16
20
  )
17
- from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
18
21
  from khoj.utils.helpers import get_anthropic_async_client, is_none_or_empty
19
22
 
20
23
  logger = logging.getLogger(__name__)
@@ -23,81 +26,34 @@ logger = logging.getLogger(__name__)
23
26
  # --- Anthropic Operator Agent ---
24
27
  class AnthropicOperatorAgent(OperatorAgent):
25
28
  async def act(self, current_state: EnvState) -> AgentActResult:
26
- client = get_anthropic_async_client(
27
- self.vision_model.ai_model_api.api_key, self.vision_model.ai_model_api.api_base_url
28
- )
29
- betas = self.model_default_headers()
30
- temperature = 1.0
31
29
  actions: List[OperatorAction] = []
32
30
  action_results: List[dict] = []
33
31
  self._commit_trace() # Commit trace before next action
34
32
 
35
- system_prompt = f"""<SYSTEM_CAPABILITY>
36
- * You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
37
- * You operate a Chromium browser using Playwright via the 'computer' tool.
38
- * You cannot access the OS or filesystem.
39
- * You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more.
40
- * You can use the additional back() and goto() helper functions to ease navigating the browser. If you see nothing, try goto duckduckgo.com
41
- * When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
42
- * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
43
- * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
44
- * The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
45
- * The current URL is {current_state.url}.
46
- </SYSTEM_CAPABILITY>
47
-
48
- <IMPORTANT>
49
- * You are allowed upto {self.max_iterations} iterations to complete the task.
50
- * Do not loop on wait, screenshot for too many turns without taking any action.
51
- * After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
52
- </IMPORTANT>
53
- """
33
+ system_prompt = self.get_instructions(self.environment_type, current_state)
34
+ tools = self.get_tools(self.environment_type, current_state)
35
+
54
36
  if is_none_or_empty(self.messages):
55
37
  self.messages = [AgentMessage(role="user", content=self.query)]
56
38
 
57
- tools = [
58
- {
59
- "type": self.model_default_tool("computer"),
60
- "name": "computer",
61
- "display_width_px": 1024,
62
- "display_height_px": 768,
63
- }, # TODO: Get from env
64
- {
65
- "name": "back",
66
- "description": "Go back to the previous page.",
67
- "input_schema": {"type": "object", "properties": {}},
68
- },
69
- {
70
- "name": "goto",
71
- "description": "Go to a specific URL.",
72
- "input_schema": {
73
- "type": "object",
74
- "properties": {"url": {"type": "string", "description": "Fully qualified URL to navigate to."}},
75
- "required": ["url"],
76
- },
77
- },
78
- ]
39
+ # Trigger trajectory compression if exceed size limit
40
+ if len(self.messages) > self.message_limit:
41
+ logger.debug("Compacting operator trajectory.")
42
+ await self._compress()
79
43
 
80
- thinking: dict[str, str | int] = {"type": "disabled"}
81
- if is_reasoning_model(self.vision_model.name):
82
- thinking = {"type": "enabled", "budget_tokens": 1024}
83
-
84
- messages_for_api = self._format_message_for_api(self.messages)
85
- response = await client.beta.messages.create(
86
- messages=messages_for_api,
87
- model=self.vision_model.name,
88
- system=system_prompt,
44
+ response_content = await self._call_model(
45
+ messages=self.messages,
46
+ model=self.vision_model,
47
+ system_prompt=system_prompt,
89
48
  tools=tools,
90
- betas=betas,
91
- thinking=thinking,
92
- max_tokens=4096, # TODO: Make configurable?
93
- temperature=temperature,
49
+ headers=self.model_default_headers(),
94
50
  )
95
51
 
96
- logger.debug(f"Anthropic response: {response.model_dump_json()}")
97
- self.messages.append(AgentMessage(role="assistant", content=response.content))
98
- rendered_response = self._render_response(response.content, current_state.screenshot)
52
+ self.messages.append(AgentMessage(role="assistant", content=response_content))
53
+ rendered_response = self._render_response(response_content, current_state.screenshot)
99
54
 
100
- for block in response.content:
55
+ # Parse actions from response
56
+ for block in response_content:
101
57
  if block.type == "tool_use":
102
58
  content = None
103
59
  is_error = False
@@ -179,6 +135,40 @@ class AnthropicOperatorAgent(OperatorAgent):
179
135
  logger.warning("Goto tool called without URL.")
180
136
  elif tool_name == "back":
181
137
  action_to_run = BackAction()
138
+ elif tool_name == self.model_default_tool("terminal")["name"]:
139
+ command = tool_input.get("command")
140
+ restart = tool_input.get("restart", False)
141
+ if command:
142
+ action_to_run = TerminalAction(command=command, restart=restart)
143
+ elif tool_name == "str_replace_based_edit_tool":
144
+ # Handle text editor tool calls
145
+ command = tool_input.get("command")
146
+ if command == "view":
147
+ path = tool_input.get("path")
148
+ view_range = tool_input.get("view_range")
149
+ if path:
150
+ action_to_run = TextEditorViewAction(path=path, view_range=view_range)
151
+ elif command == "create":
152
+ path = tool_input.get("path")
153
+ file_text = tool_input.get("file_text", "")
154
+ if path:
155
+ action_to_run = TextEditorCreateAction(path=path, file_text=file_text)
156
+ elif command == "str_replace":
157
+ path = tool_input.get("path")
158
+ old_str = tool_input.get("old_str")
159
+ new_str = tool_input.get("new_str")
160
+ if path and old_str is not None and new_str is not None:
161
+ action_to_run = TextEditorStrReplaceAction(path=path, old_str=old_str, new_str=new_str)
162
+ elif command == "insert":
163
+ path = tool_input.get("path")
164
+ insert_line = tool_input.get("insert_line")
165
+ new_str = tool_input.get("new_str")
166
+ if path and insert_line is not None and new_str is not None:
167
+ action_to_run = TextEditorInsertAction(
168
+ path=path, insert_line=insert_line, new_str=new_str
169
+ )
170
+ else:
171
+ logger.warning(f"Unsupported text editor command: {command}")
182
172
  else:
183
173
  logger.warning(f"Unsupported Anthropic computer action type: {tool_name}")
184
174
 
@@ -200,14 +190,6 @@ class AnthropicOperatorAgent(OperatorAgent):
200
190
  }
201
191
  )
202
192
 
203
- self._update_usage(
204
- response.usage.input_tokens,
205
- response.usage.output_tokens,
206
- response.usage.cache_read_input_tokens,
207
- response.usage.cache_creation_input_tokens,
208
- )
209
- self.tracer["temperature"] = temperature
210
-
211
193
  return AgentActResult(
212
194
  actions=actions,
213
195
  action_results=action_results,
@@ -240,18 +222,19 @@ class AnthropicOperatorAgent(OperatorAgent):
240
222
  if env_step.error:
241
223
  action_result["is_error"] = True
242
224
 
243
- # Append tool results to the message history
244
- self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]
245
-
246
- # Mark the final tool result as a cache break point
247
- agent_action.action_results[-1]["cache_control"] = {"type": "ephemeral"}
248
225
  # Remove previous cache controls
249
226
  for msg in self.messages:
250
- if msg.role == "environment" and isinstance(msg.content, list):
227
+ if isinstance(msg.content, list):
251
228
  for block in msg.content:
252
229
  if isinstance(block, dict) and "cache_control" in block:
253
230
  del block["cache_control"]
254
231
 
232
+ # Mark the final tool result as a cache break point
233
+ agent_action.action_results[-1]["cache_control"] = {"type": "ephemeral"}
234
+
235
+ # Append tool results to the message history
236
+ self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]
237
+
255
238
  def _format_message_for_api(self, messages: list[AgentMessage]) -> list[dict]:
256
239
  """Format Anthropic response into a single string."""
257
240
  formatted_messages = []
@@ -270,7 +253,7 @@ class AnthropicOperatorAgent(OperatorAgent):
270
253
  )
271
254
  return formatted_messages
272
255
 
273
- def compile_response(self, response_content: list[BetaContentBlock | dict] | str) -> str:
256
+ def _compile_response(self, response_content: list[BetaContentBlock | dict] | str) -> str:
274
257
  """Compile Anthropic response into a single string."""
275
258
  if isinstance(response_content, str):
276
259
  return response_content
@@ -288,7 +271,11 @@ class AnthropicOperatorAgent(OperatorAgent):
288
271
  compiled_response.append(block.text)
289
272
  elif block.type == "tool_use":
290
273
  block_input = {"action": block.name}
291
- if block.name == "computer":
274
+ if block.name in (
275
+ self.model_default_tool("computer")["name"],
276
+ self.model_default_tool("editor")["name"],
277
+ self.model_default_tool("terminal")["name"],
278
+ ):
292
279
  block_input = block.input # Computer action details are in input dict
293
280
  elif block.name == "goto":
294
281
  block_input["url"] = block.input.get("url", "[Missing URL]")
@@ -345,7 +332,34 @@ class AnthropicOperatorAgent(OperatorAgent):
345
332
  else:
346
333
  # Handle other actions
347
334
  render_texts += [f"{action.capitalize()}"]
348
-
335
+ elif block.name == self.model_default_tool("editor")["name"]:
336
+ # Handle text editor actions
337
+ command = block.input.get("command")
338
+ if command == "view":
339
+ path = block.input.get("path")
340
+ view_range = block.input.get("view_range")
341
+ if path:
342
+ render_texts += [f"View file: {path} (lines {view_range})"]
343
+ elif command == "create":
344
+ path = block.input.get("path")
345
+ file_text = block.input.get("file_text", "")
346
+ if path:
347
+ render_texts += [f"Create file: {path} with content:\n{file_text}"]
348
+ elif command == "str_replace":
349
+ path = block.input.get("path")
350
+ old_str = block.input.get("old_str")
351
+ new_str = block.input.get("new_str")
352
+ if path and old_str is not None and new_str is not None:
353
+ render_texts += [f"File: {path}\n**Find**\n{old_str}\n**Replace**\n{new_str}'"]
354
+ elif command == "insert":
355
+ path = block.input.get("path")
356
+ insert_line = block.input.get("insert_line")
357
+ new_str = block.input.get("new_str")
358
+ if path and insert_line is not None and new_str is not None:
359
+ render_texts += [f"In file: {path} at line {insert_line} insert\n{new_str}"]
360
+ render_texts += [f"Edit file: {block.input['path']}"]
361
+ elif block.name == self.model_default_tool("terminal")["name"]:
362
+ render_texts += [f"Run command:\n{block.input['command']}"]
349
363
  # If screenshot is not available when screenshot action was requested
350
364
  if isinstance(block.input, dict) and block.input.get("action") == "screenshot" and not screenshot:
351
365
  render_texts += ["Failed to get screenshot"]
@@ -365,6 +379,107 @@ class AnthropicOperatorAgent(OperatorAgent):
365
379
 
366
380
  return render_payload
367
381
 
382
async def _call_model(
    self,
    messages: list[AgentMessage],
    model: ChatModel,
    system_prompt: str,
    tools: Optional[list[dict]] = None,
    headers: Optional[list[str]] = None,
    temperature: float = 1.0,
    max_tokens: int = 4096,
) -> list[BetaContentBlock]:
    """Send the message history to the Anthropic model and return its content blocks.

    Args:
        messages: Agent message history, formatted for the API before sending.
        model: Chat model whose name and API credentials are used for the call.
        system_prompt: System instructions, sent as a cacheable text block.
        tools: Optional tool definitions to expose to the model.
        headers: Optional beta feature flags, passed as `betas`.
        temperature: Sampling temperature for the call.
        max_tokens: Maximum tokens the model may generate.

    Returns:
        The response content blocks. If the API call fails, a single text block
        describing the communication error is returned instead of raising.
    """
    client = get_anthropic_async_client(model.ai_model_api.api_key, model.ai_model_api.api_base_url)
    system = [{"type": "text", "text": system_prompt, "cache_control": {"type": "ephemeral"}}]

    thinking: dict[str, str | int] = {"type": "disabled"}
    if is_reasoning_model(model.name):
        thinking = {"type": "enabled", "budget_tokens": 1024}

    kwargs: dict = {}
    if headers:
        kwargs["betas"] = headers
    if tools:
        # Mark last tool as cache break point on a copy, so the caller's tool definitions stay untouched
        kwargs["tools"] = [*tools[:-1], {**tools[-1], "cache_control": {"type": "ephemeral"}}]

    messages_for_api = self._format_message_for_api(messages)
    response = None
    try:
        response = await client.beta.messages.create(
            messages=messages_for_api,
            model=model.name,
            system=system,
            thinking=thinking,
            max_tokens=max_tokens,
            temperature=temperature,
            **kwargs,
        )
        response_content = response.content
    except Exception as e:
        # Surface the failure to the agent loop as a text block instead of raising
        logger.error(f"Error during Anthropic API call: {e}")
        error_str = getattr(e, "message", str(e))
        response_content = [BetaTextBlock(text=f"Communication Error: {error_str}", type="text")]

    if response:
        logger.debug(f"Anthropic response: {response.model_dump_json()}")
        self._update_usage(
            response.usage.input_tokens,
            response.usage.output_tokens,
            response.usage.cache_read_input_tokens,
            response.usage.cache_creation_input_tokens,
        )
    self.tracer["temperature"] = temperature
    return response_content
433
+
434
async def _compress(self):
    """Condense the oldest part of the message history into a model-written summary.

    The first `self.compress_length` messages are sent to the model together with
    a summarization request. The history is then rebuilt as: the original first
    message, the summarization request, the model's summary and the remaining
    recent messages.
    """

    def has_block_of_type(content, block_type: str) -> bool:
        """Check whether message content holds a block of the given type.

        Blocks can be plain dicts or SDK block objects. The SDK objects are read
        via their `type` attribute, so look the type up by key or by attribute.
        """
        if not isinstance(content, list):
            return False
        for block in content:
            kind = block.get("type") if isinstance(block, dict) else getattr(block, "type", None)
            if kind == block_type:
                return True
        return False

    if not self.messages:
        return

    # 1. Prepare messages for compression
    original_messages = list(self.messages)
    messages_to_summarize = self.messages[: self.compress_length]
    # Ensure last message isn't a tool call request
    if (
        messages_to_summarize
        and messages_to_summarize[-1].role == "assistant"
        and has_block_of_type(messages_to_summarize[-1].content, "tool_use")
    ):
        messages_to_summarize.pop()

    summarize_prompt = f"Summarize your research and computer use till now to help answer my query:\n{self.query}"
    summarize_message = AgentMessage(role="user", content=summarize_prompt)
    system_prompt = dedent(
        """
        You are a computer operator with meticulous communication skills. You can condense your partial computer use traces and research into an appropriately detailed summary.
        When requested summarize your key actions, results and findings until now to achieve the user specified task.
        Your summary should help you remember the key information required to both complete the task and later generate a final report.
        """
    )

    # 2. Get summary of operation trajectory
    try:
        response_content = await self._call_model(
            messages=messages_to_summarize + [summarize_message],
            model=self.vision_model,
            system_prompt=system_prompt,
            max_tokens=8192,
        )
    except Exception as e:
        # Create a response block with the error message
        logger.error(f"Error during Anthropic API call: {e}")
        error_str = getattr(e, "message", str(e))
        response_content = [BetaTextBlock(text=f"Communication Error: {error_str}", type="text")]

    summary_message = AgentMessage(role="assistant", content=response_content)

    # 3. Rebuild message history with condensed trajectory
    primary_task = [original_messages.pop(0)]
    condensed_trajectory = [summarize_message, summary_message]
    recent_trajectory = original_messages[self.compress_length - 1 :]  # -1 since we popped the first message
    # Ensure first message isn't a tool result
    if (
        recent_trajectory
        and recent_trajectory[0].role == "environment"
        and has_block_of_type(recent_trajectory[0].content, "tool_result")
    ):
        recent_trajectory.pop(0)

    self.messages = primary_task + condensed_trajectory + recent_trajectory
482
+
368
483
  def get_coordinates(self, tool_input: dict, key: str = "coordinate") -> Optional[list | tuple]:
369
484
  """Get coordinates from tool input."""
370
485
  raw_coord = tool_input.get(key)
@@ -382,14 +497,22 @@ class AnthropicOperatorAgent(OperatorAgent):
382
497
 
383
498
  return coord
384
499
 
385
def model_default_tool(self, tool_type: Literal["computer", "editor", "terminal"]) -> dict[str, str]:
    """Get the default tool of specified type for the given model.

    Args:
        tool_type: Kind of tool to look up.

    Returns:
        A dict with the tool's `name` and its versioned `type`.

    Raises:
        ValueError: If the model, or the tool type for that model, is not supported.
    """
    model_name = self.vision_model.name
    if model_name.startswith("claude-3-7-sonnet"):
        default_tools = {
            "computer": {"name": "computer", "type": "computer_20250124"},
            "editor": {"name": "str_replace_editor", "type": "text_editor_20250124"},
            # The tool is named "bash"; the dated identifier is its type
            "terminal": {"name": "bash", "type": "bash_20250124"},
        }
    elif model_name.startswith(("claude-sonnet-4", "claude-opus-4")):
        default_tools = {
            "computer": {"name": "computer", "type": "computer_20250124"},
            "editor": {"name": "str_replace_based_edit_tool", "type": "text_editor_20250429"},
            "terminal": {"name": "bash", "type": "bash_20250124"},
        }
    else:
        default_tools = {}

    if tool_type in default_tools:
        return default_tools[tool_type]
    raise ValueError(f"Unsupported tool type for model '{self.vision_model.name}': {tool_type}")
394
517
 
395
518
  def model_default_headers(self) -> list[str]:
@@ -400,3 +523,88 @@ class AnthropicOperatorAgent(OperatorAgent):
400
523
  return ["computer-use-2025-01-24"]
401
524
  else:
402
525
  return []
526
+
527
def get_instructions(self, environment_type: EnvironmentType, current_state: EnvState) -> str:
    """Return system instructions for the Anthropic operator.

    Args:
        environment_type: Environment the agent operates; browser and computer are supported.
        current_state: Current environment state. Its URL is included in the browser instructions.

    Raises:
        ValueError: If the environment type is not supported.
    """
    # Build the date without the platform-specific '%-d' directive, which is unavailable on Windows
    today = datetime.today()
    current_date = f"{today:%A, %B} {today.day}, {today:%Y}"

    if environment_type == EnvironmentType.BROWSER:
        return dedent(
            f"""
            <SYSTEM_CAPABILITY>
            * You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
            * You operate a Chromium browser using Playwright via the 'computer' tool.
            * You cannot access the OS or filesystem.
            * You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more.
            * You can use the additional back() and goto() helper functions to ease navigating the browser. If you see nothing, try goto duckduckgo.com
            * When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
            * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
            * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
            * The current date is {current_date}.
            * The current URL is {current_state.url}.
            </SYSTEM_CAPABILITY>

            <IMPORTANT>
            * You are allowed upto {self.max_iterations} iterations to complete the task.
            * Do not loop on wait, screenshot for too many turns without taking any action.
            * After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
            </IMPORTANT>
            """
        ).lstrip()
    elif environment_type == EnvironmentType.COMPUTER:
        return dedent(
            f"""
            <SYSTEM_CAPABILITY>
            * You are Khoj, a smart computer operating assistant. You help the users accomplish tasks using a computer.
            * You can interact with the computer to perform tasks like clicking, typing, scrolling, and more.
            * When viewing a document or webpage it can be helpful to zoom out or scroll down to ensure you see everything before deciding something isn't available.
            * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
            * Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
            * Do not loop on wait, screenshot for too many turns without taking any action.
            * You are allowed upto {self.max_iterations} iterations to complete the task.
            </SYSTEM_CAPABILITY>

            <CONTEXT>
            * The current date is {current_date}.
            </CONTEXT>
            """
        ).lstrip()
    else:
        raise ValueError(f"Unsupported environment type for Anthropic operator: {environment_type}")
572
+
573
def get_tools(self, environment: "EnvironmentType", current_state: "EnvState") -> list[dict]:
    """Return the tools available for the Anthropic operator.

    Args:
        environment: Environment being operated, as an enum member or its raw string value.
        current_state: Current environment state; its width and height size the computer tool's display.

    Returns:
        Tool definitions for the computer, text editor and terminal tools. The
        `back` and `goto` navigation helpers are added for the browser environment.
    """
    # Look up each default tool once instead of once per field
    computer_tool = self.model_default_tool("computer")
    editor_tool = self.model_default_tool("editor")
    terminal_tool = self.model_default_tool("terminal")

    tools: list[dict] = [
        {
            "type": computer_tool["type"],
            "name": computer_tool["name"],
            "display_width_px": current_state.width,
            "display_height_px": current_state.height,
        },
        {
            "type": editor_tool["type"],
            "name": editor_tool["name"],
        },
        {
            "type": terminal_tool["type"],
            "name": terminal_tool["name"],
        },
    ]

    # Compare on the underlying value so both enum members and raw strings match.
    # A plain Enum member never compares equal to a str, which would silently drop the browser tools.
    environment_value = getattr(environment, "value", environment)
    if environment_value == "browser":
        tools += [
            {
                "name": "back",
                "description": "Go back to the previous page.",
                "input_schema": {"type": "object", "properties": {}},
            },
            {
                "name": "goto",
                "description": "Go to a specific URL.",
                "input_schema": {
                    "type": "object",
                    "properties": {"url": {"type": "string", "description": "Fully qualified URL to navigate to."}},
                    "required": ["url"],
                },
            },
        ]

    return tools
@@ -5,9 +5,17 @@ from typing import List, Literal, Optional, Union
5
5
  from pydantic import BaseModel
6
6
 
7
7
  from khoj.database.models import ChatModel
8
- from khoj.processor.conversation.utils import commit_conversation_trace
8
+ from khoj.processor.conversation.utils import (
9
+ AgentMessage,
10
+ OperatorRun,
11
+ commit_conversation_trace,
12
+ )
9
13
  from khoj.processor.operator.operator_actions import OperatorAction
10
- from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
14
+ from khoj.processor.operator.operator_environment_base import (
15
+ EnvironmentType,
16
+ EnvState,
17
+ EnvStepResult,
18
+ )
11
19
  from khoj.utils.helpers import get_chat_usage_metrics, is_promptrace_enabled
12
20
 
13
21
  logger = logging.getLogger(__name__)
@@ -19,18 +27,41 @@ class AgentActResult(BaseModel):
19
27
  rendered_response: Optional[dict] = None
20
28
 
21
29
 
22
- class AgentMessage(BaseModel):
23
- role: Literal["user", "assistant", "system", "environment"]
24
- content: Union[str, List]
25
-
26
-
27
30
  class OperatorAgent(ABC):
28
def __init__(
    self,
    query: str,
    vision_model: ChatModel,
    environment_type: EnvironmentType,
    max_iterations: int,
    max_context: int,
    chat_history: Optional[List[AgentMessage]] = None,
    previous_trajectory: Optional[OperatorRun] = None,
    tracer: Optional[dict] = None,
):
    """Set up the operator agent's task, message history and compression limits.

    Args:
        query: The task the agent should accomplish.
        vision_model: Chat model used to drive the agent.
        environment_type: Type of environment the agent operates.
        max_iterations: Maximum iterations the agent is allowed.
        max_context: Model context size, used to scale when history compression triggers.
        chat_history: Prior messages to seed the history with. Copied, not aliased.
        previous_trajectory: Earlier operator run to continue from, if any.
        tracer: Dict for usage and trace metadata. Shared with the caller when provided.
    """
    self.query = query
    self.vision_model = vision_model
    self.environment_type = environment_type
    self.max_iterations = max_iterations
    # Keep the caller's tracer object when given so updates stay visible to it
    self.tracer = tracer if tracer is not None else {}
    self.summarize_prompt = f"Use the results of our research to provide a comprehensive, self-contained answer for the target query:\n{query}."

    # Copy the history so appends never leak into the caller's list or a shared default
    self.messages: List[AgentMessage] = list(chat_history) if chat_history else []
    if previous_trajectory:
        trajectory = previous_trajectory.trajectory or []
        # Drop trailing tool call from previous trajectory as tool call w/o result not supported.
        # Slice instead of pop to leave the caller's trajectory intact.
        if trajectory and trajectory[-1].role == "assistant":
            trajectory = trajectory[:-1]
        self.messages += trajectory
        self.messages += [AgentMessage(role="user", content=query)]

    # Context compression parameters
    self.context_compress_trigger = 2e3  # heuristic to determine compression trigger
    # turns after which compression triggered. scales with model max context size. Minimum 5 turns.
    self.message_limit = 2 * max(5, int(max_context / self.context_compress_trigger))
    # compression ratio determines how many messages to compress down to one
    # e.g. if 5 messages, a compress ratio of 4/5 means compress 5 messages into 1 + keep 1 uncompressed
    self.message_compress_ratio = 4 / 5
    self.compress_length = int(self.message_limit * self.message_compress_ratio)
34
65
 
35
66
  @abstractmethod
36
67
  async def act(self, current_state: EnvState) -> AgentActResult:
@@ -41,16 +72,17 @@ class OperatorAgent(ABC):
41
72
  """Track results of agent actions on the environment."""
42
73
  pass
43
74
 
44
async def summarize(self, current_state: EnvState, summarize_prompt: Optional[str] = None) -> str:
    """Summarize the agent's actions and results.

    Args:
        current_state: Current environment state, passed to `act` for the summary turn.
        summarize_prompt: Prompt requesting the summary. Falls back to the
            agent's default `self.summarize_prompt` when not given or empty.

    Returns:
        The compiled text of the last message in the history after the summary turn.
    """
    summarize_prompt = summarize_prompt or self.summarize_prompt
    self.messages.append(AgentMessage(role="user", content=summarize_prompt))
    await self.act(current_state)
    if not self.messages:
        return "No actions to summarize."
    return self._compile_response(self.messages[-1].content)
83
 
52
84
  @abstractmethod
53
- def compile_response(self, response: List | str) -> str:
85
+ def _compile_response(self, response: List | str) -> str:
54
86
  pass
55
87
 
56
88
  @abstractmethod
@@ -65,13 +97,12 @@ class OperatorAgent(ABC):
65
97
  self.tracer["usage"] = get_chat_usage_metrics(
66
98
  self.vision_model.name, input_tokens, output_tokens, cache_read, cache_write, usage=self.tracer.get("usage")
67
99
  )
68
- logger.debug(f"Operator usage by {self.vision_model.model_type}: {self.tracer['usage']}")
69
100
 
70
101
  def _commit_trace(self):
71
102
  self.tracer["chat_model"] = self.vision_model.name
72
103
  if is_promptrace_enabled() and len(self.messages) > 1:
73
104
  compiled_messages = [
74
- AgentMessage(role=msg.role, content=self.compile_response(msg.content)) for msg in self.messages
105
+ AgentMessage(role=msg.role, content=self._compile_response(msg.content)) for msg in self.messages
75
106
  ]
76
107
  commit_conversation_trace(compiled_messages[:-1], compiled_messages[-1].content, self.tracer)
77
108