khoj 1.41.1.dev40__py3-none-any.whl → 1.41.1.dev90__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. khoj/database/adapters/__init__.py +1 -1
  2. khoj/interface/compiled/404/index.html +1 -1
  3. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
  4. khoj/interface/compiled/agents/index.html +1 -1
  5. khoj/interface/compiled/agents/index.txt +1 -1
  6. khoj/interface/compiled/automations/index.html +1 -1
  7. khoj/interface/compiled/automations/index.txt +1 -1
  8. khoj/interface/compiled/chat/index.html +2 -2
  9. khoj/interface/compiled/chat/index.txt +2 -2
  10. khoj/interface/compiled/index.html +1 -1
  11. khoj/interface/compiled/index.txt +1 -1
  12. khoj/interface/compiled/search/index.html +1 -1
  13. khoj/interface/compiled/search/index.txt +1 -1
  14. khoj/interface/compiled/settings/index.html +1 -1
  15. khoj/interface/compiled/settings/index.txt +1 -1
  16. khoj/interface/compiled/share/chat/index.html +2 -2
  17. khoj/interface/compiled/share/chat/index.txt +2 -2
  18. khoj/processor/conversation/anthropic/anthropic_chat.py +5 -0
  19. khoj/processor/conversation/google/gemini_chat.py +5 -0
  20. khoj/processor/conversation/google/utils.py +4 -0
  21. khoj/processor/conversation/openai/gpt.py +5 -0
  22. khoj/processor/conversation/prompts.py +12 -1
  23. khoj/processor/conversation/utils.py +13 -1
  24. khoj/processor/operator/grounding_agent.py +345 -0
  25. khoj/processor/operator/grounding_agent_uitars.py +973 -0
  26. khoj/processor/operator/operate_browser.py +152 -0
  27. khoj/processor/operator/operator_actions.py +149 -0
  28. khoj/processor/operator/operator_agent_anthropic.py +383 -0
  29. khoj/processor/operator/operator_agent_base.py +80 -0
  30. khoj/processor/operator/operator_agent_binary.py +336 -0
  31. khoj/processor/operator/operator_agent_openai.py +349 -0
  32. khoj/processor/operator/operator_environment_base.py +37 -0
  33. khoj/processor/operator/operator_environment_browser.py +395 -0
  34. khoj/routers/api_chat.py +42 -3
  35. khoj/routers/helpers.py +14 -3
  36. khoj/routers/research.py +48 -1
  37. khoj/utils/helpers.py +17 -0
  38. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/METADATA +5 -3
  39. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/RECORD +44 -34
  40. khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
  41. /khoj/interface/compiled/_next/static/{ifuY0XkcvaIiCG3xJl8zw → WLmcH2J-wz36GlS6O8HSL}/_buildManifest.js +0 -0
  42. /khoj/interface/compiled/_next/static/{ifuY0XkcvaIiCG3xJl8zw → WLmcH2J-wz36GlS6O8HSL}/_ssgManifest.js +0 -0
  43. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/WHEEL +0 -0
  44. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/entry_points.txt +0 -0
  45. {khoj-1.41.1.dev40.dist-info → khoj-1.41.1.dev90.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,345 @@
1
+ import json
2
+ import logging
3
+
4
+ from openai import AzureOpenAI, OpenAI
5
+ from openai.types.chat import ChatCompletion, ChatCompletionMessage
6
+
7
+ from khoj.database.models import ChatModel
8
+ from khoj.processor.conversation.utils import construct_structured_message
9
+ from khoj.processor.operator.operator_actions import *
10
+ from khoj.processor.operator.operator_agent_base import AgentActResult
11
+ from khoj.processor.operator.operator_environment_base import EnvState
12
+ from khoj.utils.helpers import get_chat_usage_metrics
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class GroundingAgent:
    """Translate a natural language instruction into concrete browser actions.

    The agent sends the instruction together with the current screenshot to an
    OpenAI-compatible chat model, forces it to answer with tool calls and maps
    every tool call onto an operator action object.
    """

    def __init__(
        self,
        model: ChatModel,
        client: OpenAI | AzureOpenAI,
        max_iterations: int,
        tracer: dict | None = None,
    ):
        """Store the model configuration and build the tool schema for the grounding LLM.

        Args:
            model: Chat model whose `name` is sent as the model id on every request.
            client: Client exposing `chat.completions.create`.
                NOTE(review): annotated as a synchronous client, while `act` was written
                to await the call - presumably an async client is passed in practice;
                confirm against the caller.
            max_iterations: Stored on the instance; not otherwise used inside this class.
            tracer: Dict that receives usage metrics under the "usage" key. When omitted,
                a private dict is used so usage tracking never fails on `None`.
        """
        self.model = model
        self.client = client
        self.max_iterations = max_iterations
        # `act` reads and writes tracer["usage"]; never leave the tracer as None.
        self.tracer = tracer if tracer is not None else {}

        # Define tools for the grounding LLM (OpenAI format)
        tool = self._function_tool
        self.action_tools = [
            tool(
                name="click",
                description="Click on a specific coordinate.",
                properties=self._point_properties(),
                required=["x", "y"],
            ),
            tool(
                name="left_double",
                description="Double click on a specific coordinate.",
                properties=self._point_properties(),
                required=["x", "y"],
            ),
            tool(
                name="right_single",
                description="Right click on a specific coordinate.",
                properties=self._point_properties(),
                required=["x", "y"],
            ),
            tool(
                name="drag",
                description="Perform a drag-and-drop operation along a path.",
                properties={
                    "path": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "x": {"type": "integer"},
                                "y": {"type": "integer"},
                            },
                            "required": ["x", "y"],
                        },
                        "description": "List of points (x, y coordinates) defining the drag path.",
                    }
                },
                required=["path"],
            ),
            tool(
                name="hotkey",
                description="Press a key or key combination.",
                properties={
                    "keys": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of keys to press (e.g., ['Control', 'a'], ['Enter'])",
                    }
                },
                required=["keys"],
            ),
            tool(
                name="type",
                description="Type text, usually into a focused input field.",
                properties={"content": {"type": "string", "description": "Text to type"}},
                required=["content"],
            ),
            tool(
                name="scroll",
                description="Scroll the page.",
                properties={
                    "x": {"type": "integer", "description": "X coordinate to scroll from"},
                    "y": {"type": "integer", "description": "Y coordinate to scroll from"},
                    "direction": {
                        "type": "string",
                        "enum": ["up", "down", "left", "right"],
                        "default": "down",
                    },
                },
                required=[],  # None is strictly required
            ),
            tool(
                name="wait",
                description="Pause execution for a specified duration.",
                properties={"duration": {"type": "number", "description": "Duration in seconds", "default": 1.0}},
                required=[],
            ),
            tool(
                name="goto",
                description="Navigate to a specific URL.",
                properties={"url": {"type": "string", "description": "Fully qualified URL"}},
                required=["url"],
            ),
            # `back` takes no arguments, so its schema carries no "required" key at all.
            tool(
                name="back",
                description="navigate back to the previous page.",
                properties={},
            ),
        ]

    @staticmethod
    def _point_properties() -> dict:
        """Return a fresh x/y property schema shared by the pointer tools."""
        return {
            "x": {"type": "integer", "description": "X coordinate"},
            "y": {"type": "integer", "description": "Y coordinate"},
        }

    @staticmethod
    def _function_tool(name: str, description: str, properties: dict, required: list | None = None) -> dict:
        """Build one OpenAI-format function tool definition.

        The "required" key is only emitted when `required` is not None, so a tool
        can be declared without it.
        """
        parameters: dict = {"type": "object", "properties": properties}
        if required is not None:
            parameters["required"] = required
        return {
            "type": "function",
            "function": {
                "name": name,
                "description": description,
                "parameters": parameters,
            },
        }

    async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
        """Call the grounding LLM to get the next action based on the current state and instruction.

        Args:
            instruction: Natural language description of the step to perform.
            current_state: Environment state; its `screenshot` is sent to the model.

        Returns:
            A tuple of the rendered (markdown) response and the list of actions to run.
            On any failure the rendered response describes the error and the list is empty.
        """
        # Local import keeps this block self-contained without touching the module imports.
        import inspect

        # Format the message for the API call
        messages_for_api = self._format_message_for_api(instruction, current_state)
        try:
            grounding_response = self.client.chat.completions.create(
                messages=messages_for_api,
                model=self.model.name,
                tools=self.action_tools,
                tool_choice="required",
                temperature=0.0,  # Grounding should be precise
                max_completion_tokens=1000,  # Allow for thoughts + actions
            )
            # Async clients hand back an awaitable; sync clients return the completion
            # directly. Awaiting unconditionally raised TypeError for the latter.
            if inspect.isawaitable(grounding_response):
                grounding_response = await grounding_response
            if not isinstance(grounding_response, ChatCompletion):
                raise ValueError("Grounding LLM response is not of type ChatCompletion.")
            logger.debug(f"Grounding LLM response: {grounding_response.model_dump_json()}")

            # Parse tool calls
            grounding_message = grounding_response.choices[0].message
            rendered_response, actions = self._parse_action(grounding_message, instruction, current_state)

            # Update usage by grounding model.
            # Skip when the response carries no usage block, so that missing
            # accounting data does not discard the actions parsed above.
            response_usage = grounding_response.usage
            if response_usage is not None:
                self.tracer["usage"] = get_chat_usage_metrics(
                    self.model.name,
                    input_tokens=response_usage.prompt_tokens,
                    output_tokens=response_usage.completion_tokens,
                    usage=self.tracer.get("usage"),
                )
        except Exception as e:
            logger.error(f"Error calling Grounding LLM: {e}")
            rendered_response = f"**Error**: Error contacting Grounding LLM: {e}"
            actions = []

        return rendered_response, actions

    def _format_message_for_api(self, instruction: str, current_state: EnvState) -> List:
        """Format the message for the API call.

        Builds a single user message made of the grounding prompt (with the
        instruction interpolated) and the current screenshot as a webp data URL.
        """
        # NOTE(review): the action space below uses `start_box`-style signatures while
        # the registered tools take x/y arguments - confirm this mismatch is intended.
        grounding_user_prompt = f"""
You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to decide the next action to complete the task.
You control a single tab in a Chromium browser. You cannot access the OS, filesystem or the application window.
Always use the `goto` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed.

## Output Format
```
Thought: ...
Action: ...
```

## Action Space

click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
goto(url='xxx') # Always use this to navigate to a specific URL. Use escape characters \\', \\", and \\n in url part to ensure we can parse the url in normal python string format.
back() # Use this to go back to the previous page.

## Note
- Use English in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
{instruction}
""".lstrip()

        # Construct grounding LLM input (using only the latest user prompt + image)
        # We don't pass the full history here, as grounding depends on the *current* state + NL action
        screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
        grounding_messages_content = construct_structured_message(
            grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
        )
        return [{"role": "user", "content": grounding_messages_content}]

    @staticmethod
    def _build_action(function_name: str, arguments: dict) -> tuple[Optional[OperatorAction], str]:
        """Map one tool call onto an action object.

        Returns:
            The action to run (or None when nothing should be executed) and a
            suffix to append to the rendered line for that tool call.
        """
        if function_name == "click":
            return ClickAction(**arguments), ""
        if function_name == "left_double":
            return DoubleClickAction(**arguments), ""
        if function_name == "right_single":
            return ClickAction(button="right", **arguments), ""
        if function_name == "type":
            return TypeAction(text=arguments.get("content")), ""
        if function_name == "scroll":
            direction = arguments.get("direction", "down")
            # `direction` is forwarded as `scroll_direction`; drop the raw key so the
            # action is not also handed a stray `direction` keyword.
            remaining = {key: value for key, value in arguments.items() if key != "direction"}
            return ScrollAction(scroll_direction=direction, scroll_amount=3, **remaining), ""
        if function_name == "hotkey":
            return KeypressAction(**arguments), ""
        if function_name == "goto":
            return GotoAction(**arguments), ""
        if function_name == "back":
            return BackAction(**arguments), ""
        if function_name == "wait":
            return WaitAction(**arguments), ""
        if function_name == "screenshot":
            return ScreenshotAction(**arguments), ""
        if function_name == "drag":
            # Need to convert list of dicts to list of Point objects
            path_points = [Point(**point) for point in arguments.get("path", [])]
            if path_points:
                return DragAction(path=path_points), ""
            logger.warning(f"Drag action called with empty path: {arguments}")
            return None, " [Skipped - empty path]"
        if function_name == "finished":
            # Nothing to execute; the call is still rendered.
            return None, ""

        logger.warning(f"Grounding LLM called unhandled tool: {function_name}")
        return None, " [Unhandled]"

    def _parse_action(
        self, grounding_message: ChatCompletionMessage, instruction: str, current_state: EnvState
    ) -> tuple[str, list[OperatorAction]]:
        """Parse the tool calls from the grounding LLM response and convert them to action objects.

        Args:
            grounding_message: Assistant message returned by the grounding LLM.
            instruction: Unused here; kept for interface compatibility.
            current_state: Unused here; kept for interface compatibility.

        Returns:
            The rendered response and the list of actions to execute. A tool call whose
            arguments cannot be parsed is rendered as an error and yields no action.
        """
        actions: List[OperatorAction] = []

        if not grounding_message.tool_calls:
            # Grounding LLM responded but didn't call a tool
            logger.warning("Grounding LLM did not produce a tool call.")
            rendered_response = f"{grounding_message.content or 'No action required.'}"
            return rendered_response, actions

        rendered_parts = []
        for tool_call in grounding_message.tool_calls:
            function_name = tool_call.function.name
            try:
                arguments = json.loads(tool_call.function.arguments)
                if not isinstance(arguments, dict):
                    # Valid JSON that is not an object cannot be used as keyword arguments;
                    # report it like any other malformed argument payload.
                    raise ValueError("tool arguments must be a JSON object")
                action_to_run, note = self._build_action(function_name, arguments)
                if action_to_run is not None:
                    actions.append(action_to_run)
                rendered_parts.append(f"**Action ({function_name})**: {tool_call.function.arguments}{note}")
            except (json.JSONDecodeError, TypeError, ValueError) as arg_err:
                logger.error(
                    f"Error parsing arguments for tool {function_name}: {arg_err} - Args: {tool_call.function.arguments}"
                )
                rendered_parts.append(f"**Error**: Failed to parse arguments for {function_name}")

        # Render the response
        rendered_response = "\n- ".join(rendered_parts)
        return rendered_response, actions

    def reset(self):
        """Reset the agent state."""
        pass