khoj-1.41.1.dev97-py3-none-any.whl → khoj-1.41.1.dev142-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. khoj/database/adapters/__init__.py +20 -0
  2. khoj/database/models/__init__.py +3 -0
  3. khoj/interface/compiled/404/index.html +2 -2
  4. khoj/interface/compiled/_next/static/chunks/{2327-aa22697ed9c8d54a.js → 2327-f03b2a77f67b8f8c.js} +1 -1
  5. khoj/interface/compiled/_next/static/chunks/{8515-f305779d95dd5780.js → 5138-81457f7f59956b56.js} +9 -9
  6. khoj/interface/compiled/_next/static/chunks/5477-b91e9926cfc3095c.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/7127-d3199617463d45f0.js +1 -0
  8. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
  9. khoj/interface/compiled/_next/static/chunks/app/agents/{page-ceeb9a91edea74ce.js → page-774c78ff0f55a228.js} +1 -1
  10. khoj/interface/compiled/_next/static/chunks/app/automations/{page-e3cb78747ab98cc7.js → page-4454891c5007b870.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
  12. khoj/interface/compiled/_next/static/chunks/app/chat/{page-7e780dc11eb5e5d3.js → page-5a2559825b4d5def.js} +1 -1
  13. khoj/interface/compiled/_next/static/chunks/app/{page-a4053e1bb578b2ce.js → page-f7a0286dfc31ad6b.js} +1 -1
  14. khoj/interface/compiled/_next/static/chunks/app/search/layout-f5881c7ae3ba0795.js +1 -0
  15. khoj/interface/compiled/_next/static/chunks/app/search/{page-8973da2f4c076fe1.js → page-f1a7f278c89e09b6.js} +1 -1
  16. khoj/interface/compiled/_next/static/chunks/app/settings/{page-375136dbb400525b.js → page-5d9134d4a97f8834.js} +1 -1
  17. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-abb6c5f4239ad7be.js +1 -0
  18. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-384b54fc953b18f2.js → page-32cd0ceb9ffbd777.js} +1 -1
  19. khoj/interface/compiled/_next/static/chunks/{webpack-21f76f7f59582bc7.js → webpack-952bc0d41769db77.js} +1 -1
  20. khoj/interface/compiled/_next/static/css/{fca983d49c3dd1a3.css → 0db53bacf81896f5.css} +1 -1
  21. khoj/interface/compiled/_next/static/css/93eeacc43e261162.css +1 -0
  22. khoj/interface/compiled/agents/index.html +2 -2
  23. khoj/interface/compiled/agents/index.txt +2 -2
  24. khoj/interface/compiled/automations/index.html +2 -2
  25. khoj/interface/compiled/automations/index.txt +3 -3
  26. khoj/interface/compiled/chat/index.html +2 -2
  27. khoj/interface/compiled/chat/index.txt +2 -2
  28. khoj/interface/compiled/index.html +2 -2
  29. khoj/interface/compiled/index.txt +2 -2
  30. khoj/interface/compiled/search/index.html +2 -2
  31. khoj/interface/compiled/search/index.txt +2 -2
  32. khoj/interface/compiled/settings/index.html +2 -2
  33. khoj/interface/compiled/settings/index.txt +4 -4
  34. khoj/interface/compiled/share/chat/index.html +2 -2
  35. khoj/interface/compiled/share/chat/index.txt +2 -2
  36. khoj/processor/conversation/anthropic/anthropic_chat.py +9 -10
  37. khoj/processor/conversation/anthropic/utils.py +30 -7
  38. khoj/processor/conversation/google/gemini_chat.py +10 -10
  39. khoj/processor/conversation/google/utils.py +20 -12
  40. khoj/processor/conversation/offline/chat_model.py +2 -7
  41. khoj/processor/conversation/openai/gpt.py +9 -10
  42. khoj/processor/conversation/utils.py +177 -53
  43. khoj/processor/operator/README.md +59 -0
  44. khoj/processor/operator/{operate_browser.py → __init__.py} +98 -34
  45. khoj/processor/operator/grounding_agent.py +229 -175
  46. khoj/processor/operator/grounding_agent_uitars.py +61 -50
  47. khoj/processor/operator/operator_actions.py +48 -0
  48. khoj/processor/operator/operator_agent_anthropic.py +298 -90
  49. khoj/processor/operator/operator_agent_base.py +45 -14
  50. khoj/processor/operator/operator_agent_binary.py +125 -57
  51. khoj/processor/operator/operator_agent_openai.py +183 -75
  52. khoj/processor/operator/operator_environment_base.py +11 -1
  53. khoj/processor/operator/operator_environment_browser.py +5 -3
  54. khoj/processor/operator/operator_environment_computer.py +658 -0
  55. khoj/routers/api_chat.py +125 -43
  56. khoj/routers/api_model.py +3 -3
  57. khoj/routers/helpers.py +13 -18
  58. khoj/routers/research.py +57 -23
  59. khoj/utils/constants.py +4 -4
  60. khoj/utils/helpers.py +12 -15
  61. khoj/utils/rawconfig.py +1 -0
  62. {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/METADATA +3 -1
  63. {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/RECORD +74 -72
  64. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +0 -1
  65. khoj/interface/compiled/_next/static/chunks/5477-77ce5c6f468d6c25.js +0 -1
  66. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +0 -1
  67. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +0 -1
  68. khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +0 -1
  69. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +0 -1
  70. khoj/interface/compiled/_next/static/css/f29752d6e1be7624.css +0 -1
  71. /khoj/interface/compiled/_next/static/{o6zlo73DbD2lS92jWHS8o → 4CIEX6Ko-Qehhb7L-ymZw}/_buildManifest.js +0 -0
  72. /khoj/interface/compiled/_next/static/{o6zlo73DbD2lS92jWHS8o → 4CIEX6Ko-Qehhb7L-ymZw}/_ssgManifest.js +0 -0
  73. /khoj/interface/compiled/_next/static/chunks/{1915-ab4353eaca76f690.js → 1915-1943ee8a628b893c.js} +0 -0
  74. /khoj/interface/compiled/_next/static/chunks/{2117-1c18aa2098982bf9.js → 2117-5a41630a2bd2eae8.js} +0 -0
  75. /khoj/interface/compiled/_next/static/chunks/{4363-4efaf12abe696251.js → 4363-e6ac2203564d1a3b.js} +0 -0
  76. /khoj/interface/compiled/_next/static/chunks/{4447-5d44807c40355b1a.js → 4447-e038b251d626c340.js} +0 -0
  77. /khoj/interface/compiled/_next/static/chunks/{8667-adbe6017a66cef10.js → 8667-8136f74e9a086fca.js} +0 -0
  78. /khoj/interface/compiled/_next/static/chunks/{9259-d8bcd9da9e80c81e.js → 9259-640fdd77408475df.js} +0 -0
  79. {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/WHEEL +0 -0
  80. {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/entry_points.txt +0 -0
  81. {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/licenses/LICENSE +0 -0
khoj/processor/operator/grounding_agent.py
@@ -1,5 +1,6 @@
 import json
 import logging
+from textwrap import dedent

 from openai import AzureOpenAI, OpenAI
 from openai.types.chat import ChatCompletion, ChatCompletionMessage
@@ -8,7 +9,7 @@ from khoj.database.models import ChatModel
 from khoj.processor.conversation.utils import construct_structured_message
 from khoj.processor.operator.operator_actions import *
 from khoj.processor.operator.operator_agent_base import AgentActResult
-from khoj.processor.operator.operator_environment_base import EnvState
+from khoj.processor.operator.operator_environment_base import EnvironmentType, EnvState
 from khoj.utils.helpers import get_chat_usage_metrics

 logger = logging.getLogger(__name__)
@@ -18,6 +19,7 @@ class GroundingAgent:
     def __init__(
         self,
         model: ChatModel,
+        environment_type: EnvironmentType,
         client: OpenAI | AzureOpenAI,
         max_iterations: int,
         tracer: dict = None,
@@ -26,9 +28,211 @@ class GroundingAgent:
         self.client = client
         self.max_iterations = max_iterations
         self.tracer = tracer
+        self.environment_type = environment_type
+        self.action_tools = self.get_tools(self.environment_type)

-        # Define tools for the grounding LLM (OpenAI format)
-        self.action_tools = [
+    async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
+        """Call the grounding LLM to get the next action based on the current state and instruction."""
+        # Format the message for the API call
+        messages_for_api = self._format_message_for_api(instruction, current_state)
+        try:
+            grounding_response: ChatCompletion = await self.client.chat.completions.create(
+                messages=messages_for_api,
+                model=self.model.name,
+                tools=self.action_tools,
+                tool_choice="required",
+                temperature=0.0,  # Grounding should be precise
+                max_completion_tokens=1000,  # Allow for thoughts + actions
+            )
+            if not isinstance(grounding_response, ChatCompletion):
+                raise ValueError("Grounding LLM response is not of type ChatCompletion.")
+            logger.debug(f"Grounding LLM response: {grounding_response.model_dump_json()}")
+
+            # Parse tool calls
+            grounding_message = grounding_response.choices[0].message
+            rendered_response, actions = self._parse_action(grounding_message, instruction, current_state)
+
+            # Update usage by grounding model
+            self.tracer["usage"] = get_chat_usage_metrics(
+                self.model.name,
+                input_tokens=grounding_response.usage.prompt_tokens,
+                output_tokens=grounding_response.usage.completion_tokens,
+                usage=self.tracer.get("usage"),
+            )
+        except Exception as e:
+            logger.error(f"Error calling Grounding LLM: {e}")
+            rendered_response = f"**Error**: Error contacting Grounding LLM: {e}"
+            actions = []
+
+        return rendered_response, actions
+
+    def _format_message_for_api(self, instruction: str, current_state: EnvState) -> List:
+        """Format the message for the API call."""
+        # Construct grounding LLM input (using only the latest user prompt + image)
+        # We don't pass the full history here, as grounding depends on the *current* state + NL action
+        grounding_user_prompt = self.get_instruction(instruction, self.environment_type)
+        screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
+        grounding_messages_content = construct_structured_message(
+            grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
+        )
+        return [{"role": "user", "content": grounding_messages_content}]
+
+    def _parse_action(
+        self, grounding_message: ChatCompletionMessage, instruction: str, current_state: EnvState
+    ) -> tuple[str, list[OperatorAction]]:
+        """Parse the tool calls from the grounding LLM response and convert them to action objects."""
+        actions: List[OperatorAction] = []
+        action_results: List[dict] = []
+
+        if grounding_message.tool_calls:
+            rendered_parts = []
+            for tool_call in grounding_message.tool_calls:
+                function_name = tool_call.function.name
+                try:
+                    arguments = json.loads(tool_call.function.arguments)
+                    action_to_run: Optional[OperatorAction] = None
+                    action_render_str = f"**Action ({function_name})**: {tool_call.function.arguments}"
+
+                    if function_name == "click":
+                        action_to_run = ClickAction(**arguments)
+                    elif function_name == "left_double":
+                        action_to_run = DoubleClickAction(**arguments)
+                    elif function_name == "right_single":
+                        action_to_run = ClickAction(button="right", **arguments)
+                    elif function_name == "type":
+                        content = arguments.get("content")
+                        action_to_run = TypeAction(text=content)
+                    elif function_name == "scroll":
+                        direction = arguments.get("direction", "down")
+                        amount = 3
+                        action_to_run = ScrollAction(scroll_direction=direction, scroll_amount=amount, **arguments)
+                    elif function_name == "hotkey":
+                        action_to_run = KeypressAction(**arguments)
+                    elif function_name == "goto":
+                        action_to_run = GotoAction(**arguments)
+                    elif function_name == "back":
+                        action_to_run = BackAction(**arguments)
+                    elif function_name == "wait":
+                        action_to_run = WaitAction(**arguments)
+                    elif function_name == "screenshot":
+                        action_to_run = ScreenshotAction(**arguments)
+                    elif function_name == "drag":
+                        # Need to convert list of dicts to list of Point objects
+                        path_dicts = arguments.get("path", [])
+                        path_points = [Point(**p) for p in path_dicts]
+                        if path_points:
+                            action_to_run = DragAction(path=path_points)
+                        else:
+                            logger.warning(f"Drag action called with empty path: {arguments}")
+                            action_render_str += " [Skipped - empty path]"
+                    elif function_name == "finished":
+                        action_to_run = None
+                    else:
+                        logger.warning(f"Grounding LLM called unhandled tool: {function_name}")
+                        action_render_str += " [Unhandled]"
+
+                    if action_to_run:
+                        actions.append(action_to_run)
+                        action_results.append(
+                            {
+                                "type": "tool_result",
+                                "tool_call_id": tool_call.id,
+                                "content": None,  # Updated after environment step
+                            }
+                        )
+                    rendered_parts.append(action_render_str)
+                except (json.JSONDecodeError, TypeError, ValueError) as arg_err:
+                    logger.error(
+                        f"Error parsing arguments for tool {function_name}: {arg_err} - Args: {tool_call.function.arguments}"
+                    )
+                    rendered_parts.append(f"**Error**: Failed to parse arguments for {function_name}")
+            rendered_response = "\n- ".join(rendered_parts)
+        else:
+            # Grounding LLM responded but didn't call a tool
+            logger.warning("Grounding LLM did not produce a tool call.")
+            rendered_response = f"{grounding_message.content or 'No action required.'}"
+
+        # Render the response
+        return rendered_response, actions
+
+    def get_instruction(self, instruction: str, environment_type: EnvironmentType) -> str:
+        """
+        Get the instruction for the agent based on the environment type.
+        """
+        UITARS_COMPUTER_PREFIX_PROMPT = """
+        You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+        """
+        UITARS_BROWSER_PREFIX_PROMPT = """
+        You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to decide the next action to complete the task.
+        You control a single tab in a Chromium browser. You cannot access the OS, filesystem or the application window.
+        Always use the `goto` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed.
+        """
+
+        UITARS_USR_COMPUTER_PROMPT_THOUGHT = f"""
+        Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
+
+        ## Output Format
+        ```
+        Thought: ...
+        Action: ...
+        ```
+
+        ## Action Space
+        click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+        hotkey(key='')
+        type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
+        scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+        wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
+
+        ## Note
+        - Use English in `Thought` part.
+        - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+
+        ## User Instruction
+        {instruction}
+        """
+        UITARS_USR_BROWSER_PROMPT_THOUGHT = f"""
+        Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
+
+        ## Output Format
+        ```
+        Thought: ...
+        Action: ...
+        ```
+
+        ## Action Space
+        click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+        drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+        hotkey(key='')
+        type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
+        scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+        wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
+        goto(url='xxx') # Always use this to navigate to a specific URL. Use escape characters \\', \\", and \\n in url part to ensure we can parse the url in normal python string format.
+        back() # Use this to go back to the previous page.
+
+        ## Note
+        - Use English in `Thought` part.
+        - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+
+        ## User Instruction
+        {instruction}
+        """
+
+        if environment_type == EnvironmentType.BROWSER:
+            return dedent(UITARS_BROWSER_PREFIX_PROMPT + UITARS_USR_BROWSER_PROMPT_THOUGHT).lstrip()
+        elif environment_type == EnvironmentType.COMPUTER:
+            return dedent(UITARS_COMPUTER_PREFIX_PROMPT + UITARS_USR_COMPUTER_PROMPT_THOUGHT).lstrip()
+        else:
+            raise ValueError(f"Expected environment type: Computer or Browser. Got {environment_type}.")
+
+    def get_tools(self, environment_type: EnvironmentType) -> list[dict]:
+        """Get tools for the grounding LLM, in OpenAI API tool format"""
+        tools = [
             {
                 "type": "function",
                 "function": {
@@ -163,182 +367,32 @@ class GroundingAgent:
                     },
                 },
             },
-            {
-                "type": "function",
-                "function": {
-                    "name": "goto",
-                    "description": "Navigate to a specific URL.",
-                    "parameters": {
-                        "type": "object",
-                        "properties": {"url": {"type": "string", "description": "Fully qualified URL"}},
-                        "required": ["url"],
-                    },
-                },
-            },
-            {
-                "type": "function",
-                "function": {
-                    "name": "back",
-                    "description": "navigate back to the previous page.",
-                    "parameters": {"type": "object", "properties": {}},
-                },
-            },
-        ]
-
-    async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
-        """Call the grounding LLM to get the next action based on the current state and instruction."""
-        # Format the message for the API call
-        messages_for_api = self._format_message_for_api(instruction, current_state)
-        try:
-            grounding_response: ChatCompletion = await self.client.chat.completions.create(
-                messages=messages_for_api,
-                model=self.model.name,
-                tools=self.action_tools,
-                tool_choice="required",
-                temperature=0.0,  # Grounding should be precise
-                max_completion_tokens=1000,  # Allow for thoughts + actions
-            )
-            if not isinstance(grounding_response, ChatCompletion):
-                raise ValueError("Grounding LLM response is not of type ChatCompletion.")
-            logger.debug(f"Grounding LLM response: {grounding_response.model_dump_json()}")
-
-            # Parse tool calls
-            grounding_message = grounding_response.choices[0].message
-            rendered_response, actions = self._parse_action(grounding_message, instruction, current_state)
-
-            # Update usage by grounding model
-            self.tracer["usage"] = get_chat_usage_metrics(
-                self.model.name,
-                input_tokens=grounding_response.usage.prompt_tokens,
-                output_tokens=grounding_response.usage.completion_tokens,
-                usage=self.tracer.get("usage"),
-            )
-        except Exception as e:
-            logger.error(f"Error calling Grounding LLM: {e}")
-            rendered_response = f"**Error**: Error contacting Grounding LLM: {e}"
-            actions = []
-
-        return rendered_response, actions
-
-    def _format_message_for_api(self, instruction: str, current_state: EnvState) -> List:
-        """Format the message for the API call."""
-        grounding_user_prompt = f"""
-You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to decide the next action to complete the task.
-You control a single tab in a Chromium browser. You cannot access the OS, filesystem or the application window.
-Always use the `goto` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed.
-
-## Output Format
-```
-Thought: ...
-Action: ...
-```
-
-## Action Space
-
-click(start_box='<|box_start|>(x1,y1)<|box_end|>')
-left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
-right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
-drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
-hotkey(key='')
-type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
-scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
-wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
-goto(url='xxx') # Always use this to navigate to a specific URL. Use escape characters \\', \\", and \\n in url part to ensure we can parse the url in normal python string format.
-back() # Use this to go back to the previous page.
-
-## Note
-- Use English in `Thought` part.
-- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
-
-## User Instruction
-{instruction}
-""".lstrip()
-
-        # Construct grounding LLM input (using only the latest user prompt + image)
-        # We don't pass the full history here, as grounding depends on the *current* state + NL action
-        screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
-        grounding_messages_content = construct_structured_message(
-            grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
-        )
-        return [{"role": "user", "content": grounding_messages_content}]
-
-    def _parse_action(
-        self, grounding_message: ChatCompletionMessage, instruction: str, current_state: EnvState
-    ) -> tuple[str, list[OperatorAction]]:
-        """Parse the tool calls from the grounding LLM response and convert them to action objects."""
-        actions: List[OperatorAction] = []
-        action_results: List[dict] = []
-
-        if grounding_message.tool_calls:
-            rendered_parts = []
-            for tool_call in grounding_message.tool_calls:
-                function_name = tool_call.function.name
-                try:
-                    arguments = json.loads(tool_call.function.arguments)
-                    action_to_run: Optional[OperatorAction] = None
-                    action_render_str = f"**Action ({function_name})**: {tool_call.function.arguments}"
-
-                    if function_name == "click":
-                        action_to_run = ClickAction(**arguments)
-                    elif function_name == "left_double":
-                        action_to_run = DoubleClickAction(**arguments)
-                    elif function_name == "right_single":
-                        action_to_run = ClickAction(button="right", **arguments)
-                    elif function_name == "type":
-                        content = arguments.get("content")
-                        action_to_run = TypeAction(text=content)
-                    elif function_name == "scroll":
-                        direction = arguments.get("direction", "down")
-                        amount = 3
-                        action_to_run = ScrollAction(scroll_direction=direction, scroll_amount=amount, **arguments)
-                    elif function_name == "hotkey":
-                        action_to_run = KeypressAction(**arguments)
-                    elif function_name == "goto":
-                        action_to_run = GotoAction(**arguments)
-                    elif function_name == "back":
-                        action_to_run = BackAction(**arguments)
-                    elif function_name == "wait":
-                        action_to_run = WaitAction(**arguments)
-                    elif function_name == "screenshot":
-                        action_to_run = ScreenshotAction(**arguments)
-                    elif function_name == "drag":
-                        # Need to convert list of dicts to list of Point objects
-                        path_dicts = arguments.get("path", [])
-                        path_points = [Point(**p) for p in path_dicts]
-                        if path_points:
-                            action_to_run = DragAction(path=path_points)
-                        else:
-                            logger.warning(f"Drag action called with empty path: {arguments}")
-                            action_render_str += " [Skipped - empty path]"
-                    elif function_name == "finished":
-                        action_to_run = None
-                    else:
-                        logger.warning(f"Grounding LLM called unhandled tool: {function_name}")
-                        action_render_str += " [Unhandled]"
-
-                    if action_to_run:
-                        actions.append(action_to_run)
-                        action_results.append(
-                            {
-                                "type": "tool_result",
-                                "tool_call_id": tool_call.id,
-                                "content": None,  # Updated after environment step
-                            }
-                        )
-                    rendered_parts.append(action_render_str)
-                except (json.JSONDecodeError, TypeError, ValueError) as arg_err:
-                    logger.error(
-                        f"Error parsing arguments for tool {function_name}: {arg_err} - Args: {tool_call.function.arguments}"
-                    )
-                    rendered_parts.append(f"**Error**: Failed to parse arguments for {function_name}")
-            rendered_response = "\n- ".join(rendered_parts)
-        else:
-            # Grounding LLM responded but didn't call a tool
-            logger.warning("Grounding LLM did not produce a tool call.")
-            rendered_response = f"{grounding_message.content or 'No action required.'}"
-
-        # Render the response
-        return rendered_response, actions
+        ]
+        if environment_type == EnvironmentType.BROWSER:
+            tools += [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "goto",
+                        "description": "Navigate to a specific URL.",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"url": {"type": "string", "description": "Fully qualified URL"}},
+                            "required": ["url"],
+                        },
+                    },
+                },
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "back",
+                        "description": "navigate back to the previous page.",
+                        "parameters": {"type": "object", "properties": {}},
+                    },
+                },
+            ]
+
+        return tools

     def reset(self):
         """Reset the agent state."""
khoj/processor/operator/grounding_agent_uitars.py
@@ -10,15 +10,16 @@ import logging
 import math
 import re
 from io import BytesIO
+from textwrap import dedent
 from typing import Any, List

 import numpy as np
-from openai import AzureOpenAI, OpenAI
+from openai import AsyncAzureOpenAI, AsyncOpenAI
 from openai.types.chat import ChatCompletion
 from PIL import Image

 from khoj.processor.operator.operator_actions import *
-from khoj.processor.operator.operator_environment_base import EnvState
+from khoj.processor.operator.operator_environment_base import EnvironmentType, EnvState
 from khoj.utils.helpers import get_chat_usage_metrics

 logger = logging.getLogger(__name__)
@@ -35,29 +36,8 @@ class GroundingAgentUitars:
     MAX_PIXELS = 16384 * 28 * 28
     MAX_RATIO = 200

-    UITARS_USR_PROMPT_THOUGHT = """
-    You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to perform the next action to complete the task.
-    You control a single tab in a Chromium browser. You cannot access the OS, filesystem, the application window or the addressbar.
-    Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
-
-    ## Output Format
-    ```
-    Thought: ...
-    Action: ...
-    ```
-
-    ## Action Space
-    {action_space}
-
-    ## Note
-    - Use {language} in `Thought` part.
-    - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
-
-    ## User Instruction
-    {instruction}
-    """
-
-    UITARS_NORMAL_ACTION_SPACE = """
+    UITARS_NORMAL_ACTION_SPACE = dedent(
+        """
         click(start_box='<|box_start|>(x1,y1)<|box_end|>')
         left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
         right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
@@ -67,14 +47,15 @@ class GroundingAgentUitars:
         scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
         wait() #Sleep for 5s and take a screenshot to check for any changes.
         finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
-    """.lstrip()
+        """
+    ).lstrip()

     def __init__(
         self,
         model_name: str,
-        client: OpenAI | AzureOpenAI,
+        environment_type: EnvironmentType,
+        client: AsyncOpenAI | AsyncAzureOpenAI,
         max_iterations=50,
-        environment_type: Literal["computer", "web"] = "computer",
         runtime_conf: dict = {
             "infer_mode": "qwen25vl_normal",
             "prompt_style": "qwen25vl_normal",
@@ -94,7 +75,7 @@ class GroundingAgentUitars:
         self.model_name = model_name
         self.client = client
         self.tracer = tracer
-        self.environment_type = environment_type
+        self.environment = environment_type

         self.max_iterations = max_iterations
         self.runtime_conf = runtime_conf
@@ -116,7 +97,7 @@ class GroundingAgentUitars:
         self.history_images: list[bytes] = []
         self.history_responses: list[str] = []

-        self.prompt_template = self.UITARS_USR_PROMPT_THOUGHT
+        self.prompt_template = self.get_instruction(self.environment)
         self.prompt_action_space = self.UITARS_NORMAL_ACTION_SPACE

         if "history_n" in self.runtime_conf:
@@ -126,11 +107,11 @@ class GroundingAgentUitars:

         self.cur_callusr_count = 0

-    async def act(self, instruction: str, env_state: EnvState) -> tuple[str, list[OperatorAction]]:
+    async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
         """
         Suggest the next action(s) based on the instruction and current environment.
         """
-        messages = self._format_messages_for_api(instruction, env_state)
+        messages = self._format_messages_for_api(instruction, current_state)

         recent_screenshot = Image.open(BytesIO(self.history_images[-1]))
         origin_resized_height = recent_screenshot.height
@@ -145,9 +126,11 @@ class GroundingAgentUitars:
         try_times = 3
         while not parsed_responses:
             if try_times <= 0:
-                print(f"Reach max retry times to fetch response from client, as error flag.")
+                logger.warning(f"Reach max retry times to fetch response from client, as error flag.")
                 return "client error\nFAIL", []
             try:
+                message_content = "\n".join([msg["content"][0].get("text") or "[image]" for msg in messages])
+                logger.debug(f"User message content: {message_content}")
                 response: ChatCompletion = await self.client.chat.completions.create(
                     model="ui-tars",
                     messages=messages,
@@ -228,20 +211,9 @@ class GroundingAgentUitars:
             self.actions.append(actions)
             return f"{prediction}\nFAIL", []

-        if self.environment_type == "web":
-            actions.extend(
-                self.parsing_response_to_action(parsed_response, obs_image_height, obs_image_width, self.input_swap)
-            )
-        else:
-            pass
-            # TODO: Add PyautoguiAction when enable computer environment
-            # actions.append(
-            #     PyautoguiAction(code=
-            #         self.parsing_response_to_pyautogui_code(
-            #             parsed_response, obs_image_height, obs_image_width, self.input_swap
-            #         )
-            #     )
-            # )
+        actions.extend(
+            self.parsing_response_to_action(parsed_response, obs_image_height, obs_image_width, self.input_swap)
+        )

         self.actions.append(actions)
@@ -252,13 +224,52 @@ class GroundingAgentUitars:

         return prediction or "", actions

-    def _format_messages_for_api(self, instruction: str, env_state: EnvState):
+    def get_instruction(self, environment_type: EnvironmentType) -> str:
+        """
+        Get the instruction for the agent based on the environment type.
+        """
+        UITARS_COMPUTER_PREFIX_PROMPT = """
+        You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+        """
+        UITARS_BROWSER_PREFIX_PROMPT = """
+        You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to perform the next action to complete the task.
+        You control a single tab in a Chromium browser. You cannot access the OS, filesystem, the application window or the addressbar.
+        """
+
+        UITARS_USR_PROMPT_THOUGHT = """
+        Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
+
+        ## Output Format
+        ```
+        Thought: ...
+        Action: ...
+        ```
+
+        ## Action Space
+        {action_space}
+
+        ## Note
+        - Use {language} in `Thought` part.
+        - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+
+        ## User Instruction
+        {instruction}
+        """
+
+        if environment_type == EnvironmentType.BROWSER:
+            return dedent(UITARS_BROWSER_PREFIX_PROMPT + UITARS_USR_PROMPT_THOUGHT).lstrip()
+        elif environment_type == EnvironmentType.COMPUTER:
+            return dedent(UITARS_COMPUTER_PREFIX_PROMPT + UITARS_USR_PROMPT_THOUGHT).lstrip()
+        else:
+            raise ValueError(f"Unsupported environment type: {environment_type}")
+
+    def _format_messages_for_api(self, instruction: str, current_state: EnvState):
         assert len(self.observations) == len(self.actions) and len(self.actions) == len(
             self.thoughts
         ), "The number of observations and actions should be the same."

-        self.history_images.append(base64.b64decode(env_state.screenshot))
-        self.observations.append({"screenshot": env_state.screenshot, "accessibility_tree": None})
+        self.history_images.append(base64.b64decode(current_state.screenshot))
+        self.observations.append({"screenshot": current_state.screenshot, "accessibility_tree": None})

         user_prompt = self.prompt_template.format(
             instruction=instruction, action_space=self.prompt_action_space, language=self.language
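
The hunk above ends where the environment-specific template is consumed: _format_messages_for_api() formats prompt_template with the action space, language, and user instruction. A standalone sketch of that substitution, with stand-in values for all three placeholders (the template text here is abridged for illustration, not verbatim from the diff):

    # Illustrative sketch: mirrors the prompt_template.format(...) call shown
    # at the end of the hunk above, outside the GroundingAgentUitars class.
    from textwrap import dedent

    prompt_template = dedent(
        """
        You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate.

        ## Action Space
        {action_space}

        ## Note
        - Use {language} in `Thought` part.

        ## User Instruction
        {instruction}
        """
    ).lstrip()

    user_prompt = prompt_template.format(
        instruction="Open the pricing page and report the monthly plan cost",
        action_space="click(start_box='<|box_start|>(x1,y1)<|box_end|>')",
        language="English",
    )
    print(user_prompt)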