khoj 1.41.1.dev107__py3-none-any.whl → 1.41.1.dev144__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, each of which has been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (65)
  1. khoj/database/adapters/__init__.py +20 -0
  2. khoj/database/models/__init__.py +1 -1
  3. khoj/interface/compiled/404/index.html +2 -2
  4. khoj/interface/compiled/_next/static/chunks/{8515-f305779d95dd5780.js → 5138-2cce449fd2454abf.js} +9 -9
  5. khoj/interface/compiled/_next/static/chunks/7127-d3199617463d45f0.js +1 -0
  6. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/agents/{page-c9ceb9b94e24b94a.js → page-e18e67cff45758c8.js} +1 -1
  8. khoj/interface/compiled/_next/static/chunks/app/automations/{page-3dc59a0df3827dc7.js → page-768a0903c4b5b06d.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/app/chat/{page-2b27c7118d8d5a16.js → page-1153981cb9c4907f.js} +1 -1
  10. khoj/interface/compiled/_next/static/chunks/app/{page-38f1f125d7aeb4c7.js → page-a4b97dd0c2a70cfb.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/search/{page-26d4492fb1200e0e.js → page-44072d929427ee56.js} +1 -1
  12. khoj/interface/compiled/_next/static/chunks/app/settings/{page-bf1a4e488b29fceb.js → page-4e8fdd30a3238357.js} +1 -1
  13. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-b3f7ae1ef8871d30.js +1 -0
  14. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-a1f10c96366c3a4f.js → page-6a4a9050c8bddae9.js} +1 -1
  15. khoj/interface/compiled/_next/static/chunks/{webpack-c6bde5961098facd.js → webpack-34ac812e4e4e9a50.js} +1 -1
  16. khoj/interface/compiled/_next/static/css/1e9b757ee2a2b34b.css +1 -0
  17. khoj/interface/compiled/agents/index.html +2 -2
  18. khoj/interface/compiled/agents/index.txt +2 -2
  19. khoj/interface/compiled/automations/index.html +2 -2
  20. khoj/interface/compiled/automations/index.txt +2 -2
  21. khoj/interface/compiled/chat/index.html +2 -2
  22. khoj/interface/compiled/chat/index.txt +2 -2
  23. khoj/interface/compiled/index.html +2 -2
  24. khoj/interface/compiled/index.txt +2 -2
  25. khoj/interface/compiled/search/index.html +2 -2
  26. khoj/interface/compiled/search/index.txt +2 -2
  27. khoj/interface/compiled/settings/index.html +2 -2
  28. khoj/interface/compiled/settings/index.txt +2 -2
  29. khoj/interface/compiled/share/chat/index.html +2 -2
  30. khoj/interface/compiled/share/chat/index.txt +2 -2
  31. khoj/processor/conversation/anthropic/anthropic_chat.py +8 -9
  32. khoj/processor/conversation/anthropic/utils.py +30 -7
  33. khoj/processor/conversation/google/gemini_chat.py +10 -10
  34. khoj/processor/conversation/google/utils.py +20 -12
  35. khoj/processor/conversation/offline/chat_model.py +2 -7
  36. khoj/processor/conversation/openai/gpt.py +8 -9
  37. khoj/processor/conversation/utils.py +132 -21
  38. khoj/processor/operator/README.md +59 -0
  39. khoj/processor/operator/{operate_browser.py → __init__.py} +98 -34
  40. khoj/processor/operator/grounding_agent.py +229 -175
  41. khoj/processor/operator/grounding_agent_uitars.py +59 -48
  42. khoj/processor/operator/operator_actions.py +48 -0
  43. khoj/processor/operator/operator_agent_anthropic.py +298 -90
  44. khoj/processor/operator/operator_agent_base.py +45 -14
  45. khoj/processor/operator/operator_agent_binary.py +125 -57
  46. khoj/processor/operator/operator_agent_openai.py +183 -75
  47. khoj/processor/operator/operator_environment_base.py +11 -1
  48. khoj/processor/operator/operator_environment_browser.py +5 -3
  49. khoj/processor/operator/operator_environment_computer.py +658 -0
  50. khoj/routers/api_chat.py +36 -25
  51. khoj/routers/helpers.py +8 -17
  52. khoj/routers/research.py +43 -20
  53. khoj/utils/constants.py +4 -4
  54. khoj/utils/helpers.py +12 -15
  55. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/METADATA +3 -1
  56. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/RECORD +61 -59
  57. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +0 -1
  58. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e49165209d2e406c.js +0 -1
  59. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-64a53f8ec4afa6b3.js +0 -1
  60. khoj/interface/compiled/_next/static/css/bb7ea98028b368f3.css +0 -1
  61. /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → aJZTO0gnTwX0Dca_dPw4r}/_buildManifest.js +0 -0
  62. /khoj/interface/compiled/_next/static/{y_k1yn7bI1CgM5ZfW7jUq → aJZTO0gnTwX0Dca_dPw4r}/_ssgManifest.js +0 -0
  63. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/WHEEL +0 -0
  64. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/entry_points.txt +0 -0
  65. {khoj-1.41.1.dev107.dist-info → khoj-1.41.1.dev144.dist-info}/licenses/LICENSE +0 -0
@@ -6,13 +6,23 @@ from typing import Callable, List, Optional
6
6
 
7
7
  from khoj.database.adapters import AgentAdapters, ConversationAdapters
8
8
  from khoj.database.models import Agent, ChatModel, KhojUser
9
+ from khoj.processor.conversation.utils import (
10
+ OperatorRun,
11
+ construct_chat_history,
12
+ construct_chat_history_for_operator,
13
+ )
9
14
  from khoj.processor.operator.operator_actions import *
10
15
  from khoj.processor.operator.operator_agent_anthropic import AnthropicOperatorAgent
11
16
  from khoj.processor.operator.operator_agent_base import OperatorAgent
12
17
  from khoj.processor.operator.operator_agent_binary import BinaryOperatorAgent
13
18
  from khoj.processor.operator.operator_agent_openai import OpenAIOperatorAgent
14
- from khoj.processor.operator.operator_environment_base import EnvStepResult
19
+ from khoj.processor.operator.operator_environment_base import (
20
+ Environment,
21
+ EnvironmentType,
22
+ EnvStepResult,
23
+ )
15
24
  from khoj.processor.operator.operator_environment_browser import BrowserEnvironment
25
+ from khoj.processor.operator.operator_environment_computer import ComputerEnvironment
16
26
  from khoj.routers.helpers import ChatEvent
17
27
  from khoj.utils.helpers import timer
18
28
  from khoj.utils.rawconfig import LocationData
@@ -20,12 +30,14 @@ from khoj.utils.rawconfig import LocationData
20
30
  logger = logging.getLogger(__name__)
21
31
 
22
32
 
23
- # --- Browser Operator Function ---
24
- async def operate_browser(
33
+ # --- Main Operator Entrypoint ---
34
+ async def operate_environment(
25
35
  query: str,
26
36
  user: KhojUser,
27
37
  conversation_log: dict,
28
38
  location_data: LocationData,
39
+ previous_trajectory: Optional[OperatorRun] = None,
40
+ environment_type: EnvironmentType = EnvironmentType.COMPUTER,
29
41
  send_status_func: Optional[Callable] = None,
30
42
  query_images: Optional[List[str]] = None, # TODO: Handle query images
31
43
  agent: Agent = None,
@@ -33,8 +45,11 @@ async def operate_browser(
33
45
  cancellation_event: Optional[asyncio.Event] = None,
34
46
  tracer: dict = {},
35
47
  ):
36
- response, summary_message, user_input_message = None, None, None
37
- environment: Optional[BrowserEnvironment] = None
48
+ response, user_input_message = None, None
49
+
50
+ # Only use partial previous trajectories to continue existing task
51
+ if previous_trajectory and previous_trajectory.response:
52
+ previous_trajectory = None
38
53
 
39
54
  # Get the agent chat model
40
55
  agent_chat_model = await AgentAdapters.aget_agent_chat_model(agent, user) if agent else None
@@ -42,16 +57,40 @@ async def operate_browser(
42
57
  if not reasoning_model or not reasoning_model.vision_enabled:
43
58
  reasoning_model = await ConversationAdapters.aget_vision_enabled_config()
44
59
  if not reasoning_model:
45
- raise ValueError(f"No vision enabled chat model found. Configure a vision chat model to operate browser.")
60
+ raise ValueError(f"No vision enabled chat model found. Configure a vision chat model to operate environment.")
61
+
62
+ # Create conversation history from conversation log
63
+ chat_history = construct_chat_history_for_operator(conversation_log)
46
64
 
47
65
  # Initialize Agent
48
- max_iterations = int(os.getenv("KHOJ_OPERATOR_ITERATIONS", 40))
66
+ max_context = await ConversationAdapters.aget_max_context_size(reasoning_model, user) or 20000
67
+ max_iterations = int(os.getenv("KHOJ_OPERATOR_ITERATIONS", 100))
49
68
  operator_agent: OperatorAgent
50
- if is_operator_model(reasoning_model.name) == ChatModel.ModelType.OPENAI:
51
- operator_agent = OpenAIOperatorAgent(query, reasoning_model, max_iterations, tracer)
52
- elif is_operator_model(reasoning_model.name) == ChatModel.ModelType.ANTHROPIC:
53
- operator_agent = AnthropicOperatorAgent(query, reasoning_model, max_iterations, tracer)
54
- else:
69
+ if is_operator_model(reasoning_model.name) == ChatModel.ModelType.ANTHROPIC:
70
+ operator_agent = AnthropicOperatorAgent(
71
+ query,
72
+ reasoning_model,
73
+ environment_type,
74
+ max_iterations,
75
+ max_context,
76
+ chat_history,
77
+ previous_trajectory,
78
+ tracer,
79
+ )
80
+ # TODO: Remove once OpenAI Operator Agent is useful
81
+ elif is_operator_model(reasoning_model.name) == ChatModel.ModelType.OPENAI and False:
82
+ operator_agent = OpenAIOperatorAgent(
83
+ query,
84
+ reasoning_model,
85
+ environment_type,
86
+ max_iterations,
87
+ max_context,
88
+ chat_history,
89
+ previous_trajectory,
90
+ tracer,
91
+ )
92
+ # TODO: Remove once Binary Operator Agent is useful
93
+ elif False:
55
94
  grounding_model_name = "ui-tars-1.5"
56
95
  grounding_model = await ConversationAdapters.aget_chat_model_by_name(grounding_model_name)
57
96
  if (
@@ -59,41 +98,62 @@ async def operate_browser(
59
98
  or not grounding_model.vision_enabled
60
99
  or not grounding_model.model_type == ChatModel.ModelType.OPENAI
61
100
  ):
62
- raise ValueError("No supported visual grounding model for binary operator agent found.")
63
- operator_agent = BinaryOperatorAgent(query, reasoning_model, grounding_model, max_iterations, tracer)
101
+ raise ValueError("Binary operator agent needs ui-tars-1.5 served over an OpenAI compatible API.")
102
+ operator_agent = BinaryOperatorAgent(
103
+ query,
104
+ reasoning_model,
105
+ grounding_model,
106
+ environment_type,
107
+ max_iterations,
108
+ max_context,
109
+ chat_history,
110
+ previous_trajectory,
111
+ tracer,
112
+ )
113
+ else:
114
+ raise ValueError(
115
+ f"Unsupported operator model: {reasoning_model.name}. "
116
+ "Please use a supported operator model. Only Anthropic models are currently supported."
117
+ )
64
118
 
65
119
  # Initialize Environment
66
120
  if send_status_func:
67
- async for event in send_status_func(f"**Launching Browser**"):
121
+ async for event in send_status_func(f"**Launching {environment_type.value}**"):
68
122
  yield {ChatEvent.STATUS: event}
69
- environment = BrowserEnvironment()
123
+ if environment_type == EnvironmentType.BROWSER:
124
+ environment: Environment = BrowserEnvironment()
125
+ else:
126
+ environment = ComputerEnvironment(provider="docker")
70
127
  await environment.start(width=1024, height=768)
71
128
 
72
129
  # Start Operator Loop
73
130
  try:
74
- summarize_prompt = f"Use the results of our research to provide a comprehensive, self-contained answer for the target query:\n{query}."
75
131
  task_completed = False
76
132
  iterations = 0
133
+ operator_run = OperatorRun(query=query, trajectory=operator_agent.messages, response=response)
134
+ yield operator_run
77
135
 
78
- with timer(f"Operating browser with {reasoning_model.model_type} {reasoning_model.name}", logger):
136
+ with timer(
137
+ f"Operating {environment_type.value} with {reasoning_model.model_type} {reasoning_model.name}", logger
138
+ ):
79
139
  while iterations < max_iterations and not task_completed:
80
140
  if cancellation_event and cancellation_event.is_set():
81
- logger.debug(f"Browser operator cancelled by client disconnect")
141
+ logger.debug(f"{environment_type.value} operator cancelled by client disconnect")
82
142
  break
83
143
 
84
144
  iterations += 1
85
145
 
86
146
  # 1. Get current environment state
87
- browser_state = await environment.get_state()
147
+ env_state = await environment.get_state()
88
148
 
89
149
  # 2. Agent decides action(s)
90
- agent_result = await operator_agent.act(browser_state)
150
+ agent_result = await operator_agent.act(env_state)
91
151
 
92
152
  # 3. Execute actions in the environment
93
153
  env_steps: List[EnvStepResult] = []
94
154
  for action in agent_result.actions:
95
155
  if cancellation_event and cancellation_event.is_set():
96
- logger.debug(f"Browser operator cancelled by client disconnect")
156
+ logger.debug(f"{environment_type.value} operator cancelled by client disconnect")
97
157
  break
98
158
  # Handle request for user action and break the loop
99
159
  if isinstance(action, RequestUserAction):
@@ -106,12 +166,14 @@ async def operate_browser(
106
166
  env_steps.append(env_step)
107
167
 
108
168
  # Render status update
109
- latest_screenshot = f"data:image/webp;base64,{env_steps[-1].screenshot_base64 if env_steps else browser_state.screenshot}"
169
+ latest_screenshot = (
170
+ f"data:image/webp;base64,{env_steps[-1].screenshot_base64 if env_steps else env_state.screenshot}"
171
+ )
110
172
  render_payload = agent_result.rendered_response
111
173
  render_payload["image"] = latest_screenshot
112
174
  render_content = f"**Action**: {json.dumps(render_payload)}"
113
175
  if send_status_func:
114
- async for event in send_status_func(f"**Operating Browser**:\n{render_content}"):
176
+ async for event in send_status_func(f"**Operating {environment_type.value}**:\n{render_content}"):
115
177
  yield {ChatEvent.STATUS: event}
116
178
 
117
179
  # Check if termination conditions are met
@@ -123,31 +185,33 @@ async def operate_browser(
123
185
  if task_completed or trigger_iteration_limit:
124
186
  # Summarize results of operator run on last iteration
125
187
  operator_agent.add_action_results(env_steps, agent_result)
126
- summary_message = await operator_agent.summarize(summarize_prompt, browser_state)
188
+ summary_message = await operator_agent.summarize(env_state)
127
189
  logger.info(f"Task completed: {task_completed}, Iteration limit: {trigger_iteration_limit}")
128
190
  break
129
191
 
130
192
  # 4. Update agent on the results of its action on the environment
131
193
  operator_agent.add_action_results(env_steps, agent_result)
194
+ operator_run.trajectory = operator_agent.messages
132
195
 
133
196
  # Determine final response message
134
197
  if user_input_message:
135
- response = user_input_message
198
+ operator_run.response = user_input_message
136
199
  elif task_completed:
137
- response = summary_message
200
+ operator_run.response = summary_message
201
+ elif cancellation_event and cancellation_event.is_set():
202
+ operator_run.response = None
138
203
  else: # Hit iteration limit
139
- response = f"Operator hit iteration limit ({max_iterations}). If the results seem incomplete try again, assign a smaller task or try a different approach.\nThese were the results till now:\n{summary_message}"
204
+ operator_run.response = f"Operator hit iteration limit ({max_iterations}). If the results seem incomplete try again, assign a smaller task or try a different approach.\nThese were the results till now:\n{summary_message}"
140
205
  finally:
141
- if environment and not user_input_message: # Don't close browser if user input required
206
+ if environment and not user_input_message: # Don't close environment if user input required
142
207
  await environment.close()
143
208
  if operator_agent:
144
209
  operator_agent.reset()
145
210
 
146
- yield {
147
- "query": query,
148
- "result": user_input_message or response,
149
- "webpages": [{"link": url, "snippet": ""} for url in environment.visited_urls],
150
- }
211
+ if environment_type == EnvironmentType.BROWSER and hasattr(environment, "visited_urls"):
212
+ operator_run.webpages = [{"link": url, "snippet": ""} for url in environment.visited_urls]
213
+
214
+ yield operator_run
151
215
 
152
216
 
153
217
  def is_operator_model(model: str) -> ChatModel.ModelType | None: