khoj 1.41.1.dev97__py3-none-any.whl → 1.41.1.dev142__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/database/adapters/__init__.py +20 -0
- khoj/database/models/__init__.py +3 -0
- khoj/interface/compiled/404/index.html +2 -2
- khoj/interface/compiled/_next/static/chunks/{2327-aa22697ed9c8d54a.js → 2327-f03b2a77f67b8f8c.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{8515-f305779d95dd5780.js → 5138-81457f7f59956b56.js} +9 -9
- khoj/interface/compiled/_next/static/chunks/5477-b91e9926cfc3095c.js +1 -0
- khoj/interface/compiled/_next/static/chunks/7127-d3199617463d45f0.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/{page-ceeb9a91edea74ce.js → page-774c78ff0f55a228.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-e3cb78747ab98cc7.js → page-4454891c5007b870.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-7e780dc11eb5e5d3.js → page-5a2559825b4d5def.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-a4053e1bb578b2ce.js → page-f7a0286dfc31ad6b.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-f5881c7ae3ba0795.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/search/{page-8973da2f4c076fe1.js → page-f1a7f278c89e09b6.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{page-375136dbb400525b.js → page-5d9134d4a97f8834.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-abb6c5f4239ad7be.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-384b54fc953b18f2.js → page-32cd0ceb9ffbd777.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-21f76f7f59582bc7.js → webpack-952bc0d41769db77.js} +1 -1
- khoj/interface/compiled/_next/static/css/{fca983d49c3dd1a3.css → 0db53bacf81896f5.css} +1 -1
- khoj/interface/compiled/_next/static/css/93eeacc43e261162.css +1 -0
- khoj/interface/compiled/agents/index.html +2 -2
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +2 -2
- khoj/interface/compiled/automations/index.txt +3 -3
- khoj/interface/compiled/chat/index.html +2 -2
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +2 -2
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +2 -2
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +2 -2
- khoj/interface/compiled/settings/index.txt +4 -4
- khoj/interface/compiled/share/chat/index.html +2 -2
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/processor/conversation/anthropic/anthropic_chat.py +9 -10
- khoj/processor/conversation/anthropic/utils.py +30 -7
- khoj/processor/conversation/google/gemini_chat.py +10 -10
- khoj/processor/conversation/google/utils.py +20 -12
- khoj/processor/conversation/offline/chat_model.py +2 -7
- khoj/processor/conversation/openai/gpt.py +9 -10
- khoj/processor/conversation/utils.py +177 -53
- khoj/processor/operator/README.md +59 -0
- khoj/processor/operator/{operate_browser.py → __init__.py} +98 -34
- khoj/processor/operator/grounding_agent.py +229 -175
- khoj/processor/operator/grounding_agent_uitars.py +61 -50
- khoj/processor/operator/operator_actions.py +48 -0
- khoj/processor/operator/operator_agent_anthropic.py +298 -90
- khoj/processor/operator/operator_agent_base.py +45 -14
- khoj/processor/operator/operator_agent_binary.py +125 -57
- khoj/processor/operator/operator_agent_openai.py +183 -75
- khoj/processor/operator/operator_environment_base.py +11 -1
- khoj/processor/operator/operator_environment_browser.py +5 -3
- khoj/processor/operator/operator_environment_computer.py +658 -0
- khoj/routers/api_chat.py +125 -43
- khoj/routers/api_model.py +3 -3
- khoj/routers/helpers.py +13 -18
- khoj/routers/research.py +57 -23
- khoj/utils/constants.py +4 -4
- khoj/utils/helpers.py +12 -15
- khoj/utils/rawconfig.py +1 -0
- {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/METADATA +3 -1
- {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/RECORD +74 -72
- khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +0 -1
- khoj/interface/compiled/_next/static/chunks/5477-77ce5c6f468d6c25.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +0 -1
- khoj/interface/compiled/_next/static/css/f29752d6e1be7624.css +0 -1
- /khoj/interface/compiled/_next/static/{o6zlo73DbD2lS92jWHS8o → 4CIEX6Ko-Qehhb7L-ymZw}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{o6zlo73DbD2lS92jWHS8o → 4CIEX6Ko-Qehhb7L-ymZw}/_ssgManifest.js +0 -0
- /khoj/interface/compiled/_next/static/chunks/{1915-ab4353eaca76f690.js → 1915-1943ee8a628b893c.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{2117-1c18aa2098982bf9.js → 2117-5a41630a2bd2eae8.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{4363-4efaf12abe696251.js → 4363-e6ac2203564d1a3b.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{4447-5d44807c40355b1a.js → 4447-e038b251d626c340.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{8667-adbe6017a66cef10.js → 8667-8136f74e9a086fca.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{9259-d8bcd9da9e80c81e.js → 9259-640fdd77408475df.js} +0 -0
- {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/WHEEL +0 -0
- {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/entry_points.txt +0 -0
- {khoj-1.41.1.dev97.dist-info → khoj-1.41.1.dev142.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,6 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
|
+
from textwrap import dedent
|
3
4
|
|
4
5
|
from openai import AzureOpenAI, OpenAI
|
5
6
|
from openai.types.chat import ChatCompletion, ChatCompletionMessage
|
@@ -8,7 +9,7 @@ from khoj.database.models import ChatModel
|
|
8
9
|
from khoj.processor.conversation.utils import construct_structured_message
|
9
10
|
from khoj.processor.operator.operator_actions import *
|
10
11
|
from khoj.processor.operator.operator_agent_base import AgentActResult
|
11
|
-
from khoj.processor.operator.operator_environment_base import EnvState
|
12
|
+
from khoj.processor.operator.operator_environment_base import EnvironmentType, EnvState
|
12
13
|
from khoj.utils.helpers import get_chat_usage_metrics
|
13
14
|
|
14
15
|
logger = logging.getLogger(__name__)
|
@@ -18,6 +19,7 @@ class GroundingAgent:
|
|
18
19
|
def __init__(
|
19
20
|
self,
|
20
21
|
model: ChatModel,
|
22
|
+
environment_type: EnvironmentType,
|
21
23
|
client: OpenAI | AzureOpenAI,
|
22
24
|
max_iterations: int,
|
23
25
|
tracer: dict = None,
|
@@ -26,9 +28,211 @@ class GroundingAgent:
|
|
26
28
|
self.client = client
|
27
29
|
self.max_iterations = max_iterations
|
28
30
|
self.tracer = tracer
|
31
|
+
self.environment_type = environment_type
|
32
|
+
self.action_tools = self.get_tools(self.environment_type)
|
29
33
|
|
30
|
-
|
31
|
-
|
34
|
+
async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
|
35
|
+
"""Call the grounding LLM to get the next action based on the current state and instruction."""
|
36
|
+
# Format the message for the API call
|
37
|
+
messages_for_api = self._format_message_for_api(instruction, current_state)
|
38
|
+
try:
|
39
|
+
grounding_response: ChatCompletion = await self.client.chat.completions.create(
|
40
|
+
messages=messages_for_api,
|
41
|
+
model=self.model.name,
|
42
|
+
tools=self.action_tools,
|
43
|
+
tool_choice="required",
|
44
|
+
temperature=0.0, # Grounding should be precise
|
45
|
+
max_completion_tokens=1000, # Allow for thoughts + actions
|
46
|
+
)
|
47
|
+
if not isinstance(grounding_response, ChatCompletion):
|
48
|
+
raise ValueError("Grounding LLM response is not of type ChatCompletion.")
|
49
|
+
logger.debug(f"Grounding LLM response: {grounding_response.model_dump_json()}")
|
50
|
+
|
51
|
+
# Parse tool calls
|
52
|
+
grounding_message = grounding_response.choices[0].message
|
53
|
+
rendered_response, actions = self._parse_action(grounding_message, instruction, current_state)
|
54
|
+
|
55
|
+
# Update usage by grounding model
|
56
|
+
self.tracer["usage"] = get_chat_usage_metrics(
|
57
|
+
self.model.name,
|
58
|
+
input_tokens=grounding_response.usage.prompt_tokens,
|
59
|
+
output_tokens=grounding_response.usage.completion_tokens,
|
60
|
+
usage=self.tracer.get("usage"),
|
61
|
+
)
|
62
|
+
except Exception as e:
|
63
|
+
logger.error(f"Error calling Grounding LLM: {e}")
|
64
|
+
rendered_response = f"**Error**: Error contacting Grounding LLM: {e}"
|
65
|
+
actions = []
|
66
|
+
|
67
|
+
return rendered_response, actions
|
68
|
+
|
69
|
+
def _format_message_for_api(self, instruction: str, current_state: EnvState) -> List:
|
70
|
+
"""Format the message for the API call."""
|
71
|
+
# Construct grounding LLM input (using only the latest user prompt + image)
|
72
|
+
# We don't pass the full history here, as grounding depends on the *current* state + NL action
|
73
|
+
grounding_user_prompt = self.get_instruction(instruction, self.environment_type)
|
74
|
+
screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
|
75
|
+
grounding_messages_content = construct_structured_message(
|
76
|
+
grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
|
77
|
+
)
|
78
|
+
return [{"role": "user", "content": grounding_messages_content}]
|
79
|
+
|
80
|
+
def _parse_action(
|
81
|
+
self, grounding_message: ChatCompletionMessage, instruction: str, current_state: EnvState
|
82
|
+
) -> tuple[str, list[OperatorAction]]:
|
83
|
+
"""Parse the tool calls from the grounding LLM response and convert them to action objects."""
|
84
|
+
actions: List[OperatorAction] = []
|
85
|
+
action_results: List[dict] = []
|
86
|
+
|
87
|
+
if grounding_message.tool_calls:
|
88
|
+
rendered_parts = []
|
89
|
+
for tool_call in grounding_message.tool_calls:
|
90
|
+
function_name = tool_call.function.name
|
91
|
+
try:
|
92
|
+
arguments = json.loads(tool_call.function.arguments)
|
93
|
+
action_to_run: Optional[OperatorAction] = None
|
94
|
+
action_render_str = f"**Action ({function_name})**: {tool_call.function.arguments}"
|
95
|
+
|
96
|
+
if function_name == "click":
|
97
|
+
action_to_run = ClickAction(**arguments)
|
98
|
+
elif function_name == "left_double":
|
99
|
+
action_to_run = DoubleClickAction(**arguments)
|
100
|
+
elif function_name == "right_single":
|
101
|
+
action_to_run = ClickAction(button="right", **arguments)
|
102
|
+
elif function_name == "type":
|
103
|
+
content = arguments.get("content")
|
104
|
+
action_to_run = TypeAction(text=content)
|
105
|
+
elif function_name == "scroll":
|
106
|
+
direction = arguments.get("direction", "down")
|
107
|
+
amount = 3
|
108
|
+
action_to_run = ScrollAction(scroll_direction=direction, scroll_amount=amount, **arguments)
|
109
|
+
elif function_name == "hotkey":
|
110
|
+
action_to_run = KeypressAction(**arguments)
|
111
|
+
elif function_name == "goto":
|
112
|
+
action_to_run = GotoAction(**arguments)
|
113
|
+
elif function_name == "back":
|
114
|
+
action_to_run = BackAction(**arguments)
|
115
|
+
elif function_name == "wait":
|
116
|
+
action_to_run = WaitAction(**arguments)
|
117
|
+
elif function_name == "screenshot":
|
118
|
+
action_to_run = ScreenshotAction(**arguments)
|
119
|
+
elif function_name == "drag":
|
120
|
+
# Need to convert list of dicts to list of Point objects
|
121
|
+
path_dicts = arguments.get("path", [])
|
122
|
+
path_points = [Point(**p) for p in path_dicts]
|
123
|
+
if path_points:
|
124
|
+
action_to_run = DragAction(path=path_points)
|
125
|
+
else:
|
126
|
+
logger.warning(f"Drag action called with empty path: {arguments}")
|
127
|
+
action_render_str += " [Skipped - empty path]"
|
128
|
+
elif function_name == "finished":
|
129
|
+
action_to_run = None
|
130
|
+
else:
|
131
|
+
logger.warning(f"Grounding LLM called unhandled tool: {function_name}")
|
132
|
+
action_render_str += " [Unhandled]"
|
133
|
+
|
134
|
+
if action_to_run:
|
135
|
+
actions.append(action_to_run)
|
136
|
+
action_results.append(
|
137
|
+
{
|
138
|
+
"type": "tool_result",
|
139
|
+
"tool_call_id": tool_call.id,
|
140
|
+
"content": None, # Updated after environment step
|
141
|
+
}
|
142
|
+
)
|
143
|
+
rendered_parts.append(action_render_str)
|
144
|
+
except (json.JSONDecodeError, TypeError, ValueError) as arg_err:
|
145
|
+
logger.error(
|
146
|
+
f"Error parsing arguments for tool {function_name}: {arg_err} - Args: {tool_call.function.arguments}"
|
147
|
+
)
|
148
|
+
rendered_parts.append(f"**Error**: Failed to parse arguments for {function_name}")
|
149
|
+
rendered_response = "\n- ".join(rendered_parts)
|
150
|
+
else:
|
151
|
+
# Grounding LLM responded but didn't call a tool
|
152
|
+
logger.warning("Grounding LLM did not produce a tool call.")
|
153
|
+
rendered_response = f"{grounding_message.content or 'No action required.'}"
|
154
|
+
|
155
|
+
# Render the response
|
156
|
+
return rendered_response, actions
|
157
|
+
|
158
|
+
def get_instruction(self, instruction: str, environment_type: EnvironmentType) -> str:
|
159
|
+
"""
|
160
|
+
Get the instruction for the agent based on the environment type.
|
161
|
+
"""
|
162
|
+
UITARS_COMPUTER_PREFIX_PROMPT = """
|
163
|
+
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
164
|
+
"""
|
165
|
+
UITARS_BROWSER_PREFIX_PROMPT = """
|
166
|
+
You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to decide the next action to complete the task.
|
167
|
+
You control a single tab in a Chromium browser. You cannot access the OS, filesystem or the application window.
|
168
|
+
Always use the `goto` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed.
|
169
|
+
"""
|
170
|
+
|
171
|
+
UITARS_USR_COMPUTER_PROMPT_THOUGHT = f"""
|
172
|
+
Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
|
173
|
+
|
174
|
+
## Output Format
|
175
|
+
```
|
176
|
+
Thought: ...
|
177
|
+
Action: ...
|
178
|
+
```
|
179
|
+
|
180
|
+
## Action Space
|
181
|
+
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
182
|
+
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
183
|
+
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
184
|
+
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
|
185
|
+
hotkey(key='')
|
186
|
+
type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
|
187
|
+
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
|
188
|
+
wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
|
189
|
+
|
190
|
+
## Note
|
191
|
+
- Use English in `Thought` part.
|
192
|
+
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
|
193
|
+
|
194
|
+
## User Instruction
|
195
|
+
{instruction}
|
196
|
+
"""
|
197
|
+
UITARS_USR_BROWSER_PROMPT_THOUGHT = f"""
|
198
|
+
Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
|
199
|
+
|
200
|
+
## Output Format
|
201
|
+
```
|
202
|
+
Thought: ...
|
203
|
+
Action: ...
|
204
|
+
```
|
205
|
+
|
206
|
+
## Action Space
|
207
|
+
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
208
|
+
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
209
|
+
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
210
|
+
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
|
211
|
+
hotkey(key='')
|
212
|
+
type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
|
213
|
+
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
|
214
|
+
wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
|
215
|
+
goto(url='xxx') # Always use this to navigate to a specific URL. Use escape characters \\', \\", and \\n in url part to ensure we can parse the url in normal python string format.
|
216
|
+
back() # Use this to go back to the previous page.
|
217
|
+
|
218
|
+
## Note
|
219
|
+
- Use English in `Thought` part.
|
220
|
+
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
|
221
|
+
|
222
|
+
## User Instruction
|
223
|
+
{instruction}
|
224
|
+
"""
|
225
|
+
|
226
|
+
if environment_type == EnvironmentType.BROWSER:
|
227
|
+
return dedent(UITARS_BROWSER_PREFIX_PROMPT + UITARS_USR_BROWSER_PROMPT_THOUGHT).lstrip()
|
228
|
+
elif environment_type == EnvironmentType.COMPUTER:
|
229
|
+
return dedent(UITARS_COMPUTER_PREFIX_PROMPT + UITARS_USR_COMPUTER_PROMPT_THOUGHT).lstrip()
|
230
|
+
else:
|
231
|
+
raise ValueError(f"Expected environment type: Computer or Browser. Got {environment_type}.")
|
232
|
+
|
233
|
+
def get_tools(self, environment_type: EnvironmentType) -> list[dict]:
|
234
|
+
"""Get tools for the grounding LLM, in OpenAI API tool format"""
|
235
|
+
tools = [
|
32
236
|
{
|
33
237
|
"type": "function",
|
34
238
|
"function": {
|
@@ -163,182 +367,32 @@ class GroundingAgent:
|
|
163
367
|
},
|
164
368
|
},
|
165
369
|
},
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
"
|
171
|
-
"
|
172
|
-
"
|
173
|
-
"
|
174
|
-
"
|
370
|
+
]
|
371
|
+
if environment_type == EnvironmentType.BROWSER:
|
372
|
+
tools += [
|
373
|
+
{
|
374
|
+
"type": "function",
|
375
|
+
"function": {
|
376
|
+
"name": "goto",
|
377
|
+
"description": "Navigate to a specific URL.",
|
378
|
+
"parameters": {
|
379
|
+
"type": "object",
|
380
|
+
"properties": {"url": {"type": "string", "description": "Fully qualified URL"}},
|
381
|
+
"required": ["url"],
|
382
|
+
},
|
175
383
|
},
|
176
384
|
},
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
385
|
+
{
|
386
|
+
"type": "function",
|
387
|
+
"function": {
|
388
|
+
"name": "back",
|
389
|
+
"description": "navigate back to the previous page.",
|
390
|
+
"parameters": {"type": "object", "properties": {}},
|
391
|
+
},
|
184
392
|
},
|
185
|
-
|
186
|
-
]
|
187
|
-
|
188
|
-
async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
|
189
|
-
"""Call the grounding LLM to get the next action based on the current state and instruction."""
|
190
|
-
# Format the message for the API call
|
191
|
-
messages_for_api = self._format_message_for_api(instruction, current_state)
|
192
|
-
try:
|
193
|
-
grounding_response: ChatCompletion = await self.client.chat.completions.create(
|
194
|
-
messages=messages_for_api,
|
195
|
-
model=self.model.name,
|
196
|
-
tools=self.action_tools,
|
197
|
-
tool_choice="required",
|
198
|
-
temperature=0.0, # Grounding should be precise
|
199
|
-
max_completion_tokens=1000, # Allow for thoughts + actions
|
200
|
-
)
|
201
|
-
if not isinstance(grounding_response, ChatCompletion):
|
202
|
-
raise ValueError("Grounding LLM response is not of type ChatCompletion.")
|
203
|
-
logger.debug(f"Grounding LLM response: {grounding_response.model_dump_json()}")
|
204
|
-
|
205
|
-
# Parse tool calls
|
206
|
-
grounding_message = grounding_response.choices[0].message
|
207
|
-
rendered_response, actions = self._parse_action(grounding_message, instruction, current_state)
|
208
|
-
|
209
|
-
# Update usage by grounding model
|
210
|
-
self.tracer["usage"] = get_chat_usage_metrics(
|
211
|
-
self.model.name,
|
212
|
-
input_tokens=grounding_response.usage.prompt_tokens,
|
213
|
-
output_tokens=grounding_response.usage.completion_tokens,
|
214
|
-
usage=self.tracer.get("usage"),
|
215
|
-
)
|
216
|
-
except Exception as e:
|
217
|
-
logger.error(f"Error calling Grounding LLM: {e}")
|
218
|
-
rendered_response = f"**Error**: Error contacting Grounding LLM: {e}"
|
219
|
-
actions = []
|
220
|
-
|
221
|
-
return rendered_response, actions
|
222
|
-
|
223
|
-
def _format_message_for_api(self, instruction: str, current_state: EnvState) -> List:
|
224
|
-
"""Format the message for the API call."""
|
225
|
-
grounding_user_prompt = f"""
|
226
|
-
You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to decide the next action to complete the task.
|
227
|
-
You control a single tab in a Chromium browser. You cannot access the OS, filesystem or the application window.
|
228
|
-
Always use the `goto` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed.
|
229
|
-
|
230
|
-
## Output Format
|
231
|
-
```
|
232
|
-
Thought: ...
|
233
|
-
Action: ...
|
234
|
-
```
|
235
|
-
|
236
|
-
## Action Space
|
237
|
-
|
238
|
-
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
239
|
-
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
240
|
-
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
241
|
-
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
|
242
|
-
hotkey(key='')
|
243
|
-
type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
|
244
|
-
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
|
245
|
-
wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
|
246
|
-
goto(url='xxx') # Always use this to navigate to a specific URL. Use escape characters \\', \\", and \\n in url part to ensure we can parse the url in normal python string format.
|
247
|
-
back() # Use this to go back to the previous page.
|
248
|
-
|
249
|
-
## Note
|
250
|
-
- Use English in `Thought` part.
|
251
|
-
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
|
252
|
-
|
253
|
-
## User Instruction
|
254
|
-
{instruction}
|
255
|
-
""".lstrip()
|
256
|
-
|
257
|
-
# Construct grounding LLM input (using only the latest user prompt + image)
|
258
|
-
# We don't pass the full history here, as grounding depends on the *current* state + NL action
|
259
|
-
screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
|
260
|
-
grounding_messages_content = construct_structured_message(
|
261
|
-
grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
|
262
|
-
)
|
263
|
-
return [{"role": "user", "content": grounding_messages_content}]
|
264
|
-
|
265
|
-
def _parse_action(
|
266
|
-
self, grounding_message: ChatCompletionMessage, instruction: str, current_state: EnvState
|
267
|
-
) -> tuple[str, list[OperatorAction]]:
|
268
|
-
"""Parse the tool calls from the grounding LLM response and convert them to action objects."""
|
269
|
-
actions: List[OperatorAction] = []
|
270
|
-
action_results: List[dict] = []
|
271
|
-
|
272
|
-
if grounding_message.tool_calls:
|
273
|
-
rendered_parts = []
|
274
|
-
for tool_call in grounding_message.tool_calls:
|
275
|
-
function_name = tool_call.function.name
|
276
|
-
try:
|
277
|
-
arguments = json.loads(tool_call.function.arguments)
|
278
|
-
action_to_run: Optional[OperatorAction] = None
|
279
|
-
action_render_str = f"**Action ({function_name})**: {tool_call.function.arguments}"
|
393
|
+
]
|
280
394
|
|
281
|
-
|
282
|
-
action_to_run = ClickAction(**arguments)
|
283
|
-
elif function_name == "left_double":
|
284
|
-
action_to_run = DoubleClickAction(**arguments)
|
285
|
-
elif function_name == "right_single":
|
286
|
-
action_to_run = ClickAction(button="right", **arguments)
|
287
|
-
elif function_name == "type":
|
288
|
-
content = arguments.get("content")
|
289
|
-
action_to_run = TypeAction(text=content)
|
290
|
-
elif function_name == "scroll":
|
291
|
-
direction = arguments.get("direction", "down")
|
292
|
-
amount = 3
|
293
|
-
action_to_run = ScrollAction(scroll_direction=direction, scroll_amount=amount, **arguments)
|
294
|
-
elif function_name == "hotkey":
|
295
|
-
action_to_run = KeypressAction(**arguments)
|
296
|
-
elif function_name == "goto":
|
297
|
-
action_to_run = GotoAction(**arguments)
|
298
|
-
elif function_name == "back":
|
299
|
-
action_to_run = BackAction(**arguments)
|
300
|
-
elif function_name == "wait":
|
301
|
-
action_to_run = WaitAction(**arguments)
|
302
|
-
elif function_name == "screenshot":
|
303
|
-
action_to_run = ScreenshotAction(**arguments)
|
304
|
-
elif function_name == "drag":
|
305
|
-
# Need to convert list of dicts to list of Point objects
|
306
|
-
path_dicts = arguments.get("path", [])
|
307
|
-
path_points = [Point(**p) for p in path_dicts]
|
308
|
-
if path_points:
|
309
|
-
action_to_run = DragAction(path=path_points)
|
310
|
-
else:
|
311
|
-
logger.warning(f"Drag action called with empty path: {arguments}")
|
312
|
-
action_render_str += " [Skipped - empty path]"
|
313
|
-
elif function_name == "finished":
|
314
|
-
action_to_run = None
|
315
|
-
else:
|
316
|
-
logger.warning(f"Grounding LLM called unhandled tool: {function_name}")
|
317
|
-
action_render_str += " [Unhandled]"
|
318
|
-
|
319
|
-
if action_to_run:
|
320
|
-
actions.append(action_to_run)
|
321
|
-
action_results.append(
|
322
|
-
{
|
323
|
-
"type": "tool_result",
|
324
|
-
"tool_call_id": tool_call.id,
|
325
|
-
"content": None, # Updated after environment step
|
326
|
-
}
|
327
|
-
)
|
328
|
-
rendered_parts.append(action_render_str)
|
329
|
-
except (json.JSONDecodeError, TypeError, ValueError) as arg_err:
|
330
|
-
logger.error(
|
331
|
-
f"Error parsing arguments for tool {function_name}: {arg_err} - Args: {tool_call.function.arguments}"
|
332
|
-
)
|
333
|
-
rendered_parts.append(f"**Error**: Failed to parse arguments for {function_name}")
|
334
|
-
rendered_response = "\n- ".join(rendered_parts)
|
335
|
-
else:
|
336
|
-
# Grounding LLM responded but didn't call a tool
|
337
|
-
logger.warning("Grounding LLM did not produce a tool call.")
|
338
|
-
rendered_response = f"{grounding_message.content or 'No action required.'}"
|
339
|
-
|
340
|
-
# Render the response
|
341
|
-
return rendered_response, actions
|
395
|
+
return tools
|
342
396
|
|
343
397
|
def reset(self):
|
344
398
|
"""Reset the agent state."""
|
@@ -10,15 +10,16 @@ import logging
|
|
10
10
|
import math
|
11
11
|
import re
|
12
12
|
from io import BytesIO
|
13
|
+
from textwrap import dedent
|
13
14
|
from typing import Any, List
|
14
15
|
|
15
16
|
import numpy as np
|
16
|
-
from openai import
|
17
|
+
from openai import AsyncAzureOpenAI, AsyncOpenAI
|
17
18
|
from openai.types.chat import ChatCompletion
|
18
19
|
from PIL import Image
|
19
20
|
|
20
21
|
from khoj.processor.operator.operator_actions import *
|
21
|
-
from khoj.processor.operator.operator_environment_base import EnvState
|
22
|
+
from khoj.processor.operator.operator_environment_base import EnvironmentType, EnvState
|
22
23
|
from khoj.utils.helpers import get_chat_usage_metrics
|
23
24
|
|
24
25
|
logger = logging.getLogger(__name__)
|
@@ -35,29 +36,8 @@ class GroundingAgentUitars:
|
|
35
36
|
MAX_PIXELS = 16384 * 28 * 28
|
36
37
|
MAX_RATIO = 200
|
37
38
|
|
38
|
-
|
39
|
-
|
40
|
-
You control a single tab in a Chromium browser. You cannot access the OS, filesystem, the application window or the addressbar.
|
41
|
-
Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
|
42
|
-
|
43
|
-
## Output Format
|
44
|
-
```
|
45
|
-
Thought: ...
|
46
|
-
Action: ...
|
47
|
-
```
|
48
|
-
|
49
|
-
## Action Space
|
50
|
-
{action_space}
|
51
|
-
|
52
|
-
## Note
|
53
|
-
- Use {language} in `Thought` part.
|
54
|
-
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
|
55
|
-
|
56
|
-
## User Instruction
|
57
|
-
{instruction}
|
58
|
-
"""
|
59
|
-
|
60
|
-
UITARS_NORMAL_ACTION_SPACE = """
|
39
|
+
UITARS_NORMAL_ACTION_SPACE = dedent(
|
40
|
+
"""
|
61
41
|
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
62
42
|
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
63
43
|
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
|
@@ -67,14 +47,15 @@ class GroundingAgentUitars:
|
|
67
47
|
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
|
68
48
|
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
69
49
|
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
|
70
|
-
"""
|
50
|
+
"""
|
51
|
+
).lstrip()
|
71
52
|
|
72
53
|
def __init__(
|
73
54
|
self,
|
74
55
|
model_name: str,
|
75
|
-
|
56
|
+
environment_type: EnvironmentType,
|
57
|
+
client: AsyncOpenAI | AsyncAzureOpenAI,
|
76
58
|
max_iterations=50,
|
77
|
-
environment_type: Literal["computer", "web"] = "computer",
|
78
59
|
runtime_conf: dict = {
|
79
60
|
"infer_mode": "qwen25vl_normal",
|
80
61
|
"prompt_style": "qwen25vl_normal",
|
@@ -94,7 +75,7 @@ class GroundingAgentUitars:
|
|
94
75
|
self.model_name = model_name
|
95
76
|
self.client = client
|
96
77
|
self.tracer = tracer
|
97
|
-
self.
|
78
|
+
self.environment = environment_type
|
98
79
|
|
99
80
|
self.max_iterations = max_iterations
|
100
81
|
self.runtime_conf = runtime_conf
|
@@ -116,7 +97,7 @@ class GroundingAgentUitars:
|
|
116
97
|
self.history_images: list[bytes] = []
|
117
98
|
self.history_responses: list[str] = []
|
118
99
|
|
119
|
-
self.prompt_template = self.
|
100
|
+
self.prompt_template = self.get_instruction(self.environment)
|
120
101
|
self.prompt_action_space = self.UITARS_NORMAL_ACTION_SPACE
|
121
102
|
|
122
103
|
if "history_n" in self.runtime_conf:
|
@@ -126,11 +107,11 @@ class GroundingAgentUitars:
|
|
126
107
|
|
127
108
|
self.cur_callusr_count = 0
|
128
109
|
|
129
|
-
async def act(self, instruction: str,
|
110
|
+
async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
|
130
111
|
"""
|
131
112
|
Suggest the next action(s) based on the instruction and current environment.
|
132
113
|
"""
|
133
|
-
messages = self._format_messages_for_api(instruction,
|
114
|
+
messages = self._format_messages_for_api(instruction, current_state)
|
134
115
|
|
135
116
|
recent_screenshot = Image.open(BytesIO(self.history_images[-1]))
|
136
117
|
origin_resized_height = recent_screenshot.height
|
@@ -145,9 +126,11 @@ class GroundingAgentUitars:
|
|
145
126
|
try_times = 3
|
146
127
|
while not parsed_responses:
|
147
128
|
if try_times <= 0:
|
148
|
-
|
129
|
+
logger.warning(f"Reach max retry times to fetch response from client, as error flag.")
|
149
130
|
return "client error\nFAIL", []
|
150
131
|
try:
|
132
|
+
message_content = "\n".join([msg["content"][0].get("text") or "[image]" for msg in messages])
|
133
|
+
logger.debug(f"User message content: {message_content}")
|
151
134
|
response: ChatCompletion = await self.client.chat.completions.create(
|
152
135
|
model="ui-tars",
|
153
136
|
messages=messages,
|
@@ -228,20 +211,9 @@ class GroundingAgentUitars:
|
|
228
211
|
self.actions.append(actions)
|
229
212
|
return f"{prediction}\nFAIL", []
|
230
213
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
)
|
235
|
-
else:
|
236
|
-
pass
|
237
|
-
# TODO: Add PyautoguiAction when enable computer environment
|
238
|
-
# actions.append(
|
239
|
-
# PyautoguiAction(code=
|
240
|
-
# self.parsing_response_to_pyautogui_code(
|
241
|
-
# parsed_response, obs_image_height, obs_image_width, self.input_swap
|
242
|
-
# )
|
243
|
-
# )
|
244
|
-
# )
|
214
|
+
actions.extend(
|
215
|
+
self.parsing_response_to_action(parsed_response, obs_image_height, obs_image_width, self.input_swap)
|
216
|
+
)
|
245
217
|
|
246
218
|
self.actions.append(actions)
|
247
219
|
|
@@ -252,13 +224,52 @@ class GroundingAgentUitars:
|
|
252
224
|
|
253
225
|
return prediction or "", actions
|
254
226
|
|
255
|
-
def
|
227
|
+
def get_instruction(self, environment_type: EnvironmentType) -> str:
|
228
|
+
"""
|
229
|
+
Get the instruction for the agent based on the environment type.
|
230
|
+
"""
|
231
|
+
UITARS_COMPUTER_PREFIX_PROMPT = """
|
232
|
+
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
233
|
+
"""
|
234
|
+
UITARS_BROWSER_PREFIX_PROMPT = """
|
235
|
+
You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to perform the next action to complete the task.
|
236
|
+
You control a single tab in a Chromium browser. You cannot access the OS, filesystem, the application window or the addressbar.
|
237
|
+
"""
|
238
|
+
|
239
|
+
UITARS_USR_PROMPT_THOUGHT = """
|
240
|
+
Try fulfill the user instruction to the best of your ability, especially when the instruction is given multiple times. Do not ignore the instruction.
|
241
|
+
|
242
|
+
## Output Format
|
243
|
+
```
|
244
|
+
Thought: ...
|
245
|
+
Action: ...
|
246
|
+
```
|
247
|
+
|
248
|
+
## Action Space
|
249
|
+
{action_space}
|
250
|
+
|
251
|
+
## Note
|
252
|
+
- Use {language} in `Thought` part.
|
253
|
+
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
|
254
|
+
|
255
|
+
## User Instruction
|
256
|
+
{instruction}
|
257
|
+
"""
|
258
|
+
|
259
|
+
if environment_type == EnvironmentType.BROWSER:
|
260
|
+
return dedent(UITARS_BROWSER_PREFIX_PROMPT + UITARS_USR_PROMPT_THOUGHT).lstrip()
|
261
|
+
elif environment_type == EnvironmentType.COMPUTER:
|
262
|
+
return dedent(UITARS_COMPUTER_PREFIX_PROMPT + UITARS_USR_PROMPT_THOUGHT).lstrip()
|
263
|
+
else:
|
264
|
+
raise ValueError(f"Unsupported environment type: {environment_type}")
|
265
|
+
|
266
|
+
def _format_messages_for_api(self, instruction: str, current_state: EnvState):
|
256
267
|
assert len(self.observations) == len(self.actions) and len(self.actions) == len(
|
257
268
|
self.thoughts
|
258
269
|
), "The number of observations and actions should be the same."
|
259
270
|
|
260
|
-
self.history_images.append(base64.b64decode(
|
261
|
-
self.observations.append({"screenshot":
|
271
|
+
self.history_images.append(base64.b64decode(current_state.screenshot))
|
272
|
+
self.observations.append({"screenshot": current_state.screenshot, "accessibility_tree": None})
|
262
273
|
|
263
274
|
user_prompt = self.prompt_template.format(
|
264
275
|
instruction=instruction, action_space=self.prompt_action_space, language=self.language
|