khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev97__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khoj/database/adapters/__init__.py +17 -6
- khoj/interface/compiled/404/index.html +2 -2
- khoj/interface/compiled/_next/static/chunks/{2327-f03b2a77f67b8f8c.js → 2327-aa22697ed9c8d54a.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
- khoj/interface/compiled/_next/static/chunks/{8515-010dd769c584b672.js → 8515-f305779d95dd5780.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-14ac9d1ad5cb84c5.js → page-7e780dc11eb5e5d3.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-1169ca6e9e7e6247.js → webpack-21f76f7f59582bc7.js} +1 -1
- khoj/interface/compiled/agents/index.html +2 -2
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +2 -2
- khoj/interface/compiled/automations/index.txt +2 -2
- khoj/interface/compiled/chat/index.html +2 -2
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +2 -2
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +2 -2
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +2 -2
- khoj/interface/compiled/settings/index.txt +2 -2
- khoj/interface/compiled/share/chat/index.html +2 -2
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/processor/conversation/anthropic/anthropic_chat.py +7 -2
- khoj/processor/conversation/anthropic/utils.py +37 -19
- khoj/processor/conversation/google/gemini_chat.py +7 -2
- khoj/processor/conversation/offline/chat_model.py +2 -2
- khoj/processor/conversation/openai/gpt.py +7 -2
- khoj/processor/conversation/prompts.py +13 -2
- khoj/processor/conversation/utils.py +34 -6
- khoj/processor/operator/grounding_agent.py +345 -0
- khoj/processor/operator/grounding_agent_uitars.py +973 -0
- khoj/processor/operator/operate_browser.py +165 -0
- khoj/processor/operator/operator_actions.py +149 -0
- khoj/processor/operator/operator_agent_anthropic.py +402 -0
- khoj/processor/operator/operator_agent_base.py +80 -0
- khoj/processor/operator/operator_agent_binary.py +336 -0
- khoj/processor/operator/operator_agent_openai.py +349 -0
- khoj/processor/operator/operator_environment_base.py +37 -0
- khoj/processor/operator/operator_environment_browser.py +395 -0
- khoj/routers/api_chat.py +44 -6
- khoj/routers/helpers.py +18 -8
- khoj/routers/research.py +48 -1
- khoj/utils/constants.py +6 -0
- khoj/utils/helpers.py +17 -0
- {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/METADATA +4 -2
- {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/RECORD +52 -42
- khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +0 -1
- /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → o6zlo73DbD2lS92jWHS8o}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → o6zlo73DbD2lS92jWHS8o}/_ssgManifest.js +0 -0
- {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/WHEEL +0 -0
- {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/entry_points.txt +0 -0
- {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/licenses/LICENSE +0 -0
@@ -736,7 +736,7 @@ Create a multi-step plan and intelligently iterate on the plan based on the retr
|
|
736
736
|
- Ask highly diverse, detailed queries to the tool AIs, one tool AI at a time, to discover required information or run calculations. Their response will be shown to you in the next iteration.
|
737
737
|
- Break down your research process into independent, self-contained steps that can be executed sequentially using the available tool AIs to answer the user's query. Write your step-by-step plan in the scratchpad.
|
738
738
|
- Always ask a new query that was not asked to the tool AI in a previous iteration. Build on the results of the previous iterations.
|
739
|
-
- Ensure that all required context is passed to the tool AIs for successful execution. They only know the context provided in your query.
|
739
|
+
- Ensure that all required context is passed to the tool AIs for successful execution. Include any relevant stuff that has previously been attempted. They only know the context provided in your query.
|
740
740
|
- Think step by step to come up with creative strategies when the previous iteration did not yield useful results.
|
741
741
|
- You are allowed upto {max_iterations} iterations to use the help of the provided tool AIs to answer the user's question.
|
742
742
|
- Stop when you have the required information by returning a JSON object with the "tool" field set to "text" and "query" field empty. E.g., {{"scratchpad": "I have all I need", "tool": "text", "query": ""}}
|
@@ -766,7 +766,7 @@ You decide which of the tool AIs listed below would you use to answer the user's
|
|
766
766
|
|
767
767
|
{tools}
|
768
768
|
|
769
|
-
Your response should always be a valid JSON object. Do not say anything else.
|
769
|
+
Your response should always be a valid JSON object with keys: "scratchpad" (str), "tool" (str) and "query" (str). Do not say anything else.
|
770
770
|
Response format:
|
771
771
|
{{"scratchpad": "<your_scratchpad_to_reason_about_which_tool_to_use>", "tool": "<name_of_tool_ai>", "query": "<your_detailed_query_for_the_tool_ai>"}}
|
772
772
|
""".strip()
|
@@ -1119,6 +1119,16 @@ terrarium_sandbox_context = """
|
|
1119
1119
|
- The sandbox has access to only the standard library and the matplotlib, pandas, numpy, scipy, bs5 and sympy packages. The requests, torch, catboost, tensorflow, rdkit and tkinter packages are not available.
|
1120
1120
|
""".strip()
|
1121
1121
|
|
1122
|
+
operator_execution_context = PromptTemplate.from_template(
|
1123
|
+
"""
|
1124
|
+
Use the results of operating a web browser to inform your response.
|
1125
|
+
|
1126
|
+
Browser Operation Results:
|
1127
|
+
{operator_results}
|
1128
|
+
""".strip()
|
1129
|
+
)
|
1130
|
+
|
1131
|
+
|
1122
1132
|
# Automations
|
1123
1133
|
# --
|
1124
1134
|
crontime_prompt = PromptTemplate.from_template(
|
@@ -1371,6 +1381,7 @@ help_message = PromptTemplate.from_template(
|
|
1371
1381
|
- **/online**: Chat using the internet as a source of information.
|
1372
1382
|
- **/image**: Generate an image based on your message.
|
1373
1383
|
- **/research**: Go deeper in a topic for more accurate, in-depth responses.
|
1384
|
+
- **/operator**: Use a web browser to execute actions and search for information.
|
1374
1385
|
- **/help**: Show this help message.
|
1375
1386
|
|
1376
1387
|
You are using the **{model}** model on the **{device}**.
|
@@ -73,6 +73,10 @@ model_to_prompt_size = {
|
|
73
73
|
"claude-3-7-sonnet-20250219": 60000,
|
74
74
|
"claude-3-7-sonnet-latest": 60000,
|
75
75
|
"claude-3-5-haiku-20241022": 60000,
|
76
|
+
"claude-sonnet-4": 60000,
|
77
|
+
"claude-sonnet-4-20250514": 60000,
|
78
|
+
"claude-opus-4": 60000,
|
79
|
+
"claude-opus-4-20250514": 60000,
|
76
80
|
# Offline Models
|
77
81
|
"bartowski/Qwen2.5-14B-Instruct-GGUF": 20000,
|
78
82
|
"bartowski/Meta-Llama-3.1-8B-Instruct-GGUF": 20000,
|
@@ -91,6 +95,7 @@ class InformationCollectionIteration:
|
|
91
95
|
context: list = None,
|
92
96
|
onlineContext: dict = None,
|
93
97
|
codeContext: dict = None,
|
98
|
+
operatorContext: dict[str, str] = None,
|
94
99
|
summarizedResult: str = None,
|
95
100
|
warning: str = None,
|
96
101
|
):
|
@@ -99,6 +104,7 @@ class InformationCollectionIteration:
|
|
99
104
|
self.context = context
|
100
105
|
self.onlineContext = onlineContext
|
101
106
|
self.codeContext = codeContext
|
107
|
+
self.operatorContext = operatorContext
|
102
108
|
self.summarizedResult = summarizedResult
|
103
109
|
self.warning = warning
|
104
110
|
|
@@ -187,6 +193,9 @@ def construct_tool_chat_history(
|
|
187
193
|
ConversationCommand.Code: (
|
188
194
|
lambda iteration: list(iteration.codeContext.keys()) if iteration.codeContext else []
|
189
195
|
),
|
196
|
+
ConversationCommand.Operator: (
|
197
|
+
lambda iteration: list(iteration.operatorContext.keys()) if iteration.operatorContext else []
|
198
|
+
),
|
190
199
|
}
|
191
200
|
for iteration in previous_iterations:
|
192
201
|
# If a tool is provided use the inferred query extractor for that tool if available
|
@@ -265,6 +274,7 @@ async def save_to_conversation_log(
|
|
265
274
|
compiled_references: List[Dict[str, Any]] = [],
|
266
275
|
online_results: Dict[str, Any] = {},
|
267
276
|
code_results: Dict[str, Any] = {},
|
277
|
+
operator_results: Dict[str, str] = {},
|
268
278
|
inferred_queries: List[str] = [],
|
269
279
|
intent_type: str = "remember",
|
270
280
|
client_application: ClientApplication = None,
|
@@ -291,6 +301,7 @@ async def save_to_conversation_log(
|
|
291
301
|
"intent": {"inferred-queries": inferred_queries, "type": intent_type},
|
292
302
|
"onlineContext": online_results,
|
293
303
|
"codeContext": code_results,
|
304
|
+
"operatorContext": operator_results,
|
294
305
|
"automationId": automation_id,
|
295
306
|
"trainOfThought": train_of_thought,
|
296
307
|
"turnId": turn_id,
|
@@ -380,7 +391,7 @@ def gather_raw_query_files(
|
|
380
391
|
|
381
392
|
|
382
393
|
def generate_chatml_messages_with_context(
|
383
|
-
user_message,
|
394
|
+
user_message: str,
|
384
395
|
system_message: str = None,
|
385
396
|
conversation_log={},
|
386
397
|
model_name="gpt-4o-mini",
|
@@ -447,6 +458,11 @@ def generate_chatml_messages_with_context(
|
|
447
458
|
if not is_none_or_empty(chat.get("codeContext")):
|
448
459
|
message_context += f"{prompts.code_executed_context.format(code_results=chat.get('codeContext'))}"
|
449
460
|
|
461
|
+
if not is_none_or_empty(chat.get("operatorContext")):
|
462
|
+
message_context += (
|
463
|
+
f"{prompts.operator_execution_context.format(operator_results=chat.get('operatorContext'))}"
|
464
|
+
)
|
465
|
+
|
450
466
|
if not is_none_or_empty(message_context):
|
451
467
|
reconstructed_context_message = ChatMessage(content=message_context, role="user")
|
452
468
|
chatml_messages.insert(0, reconstructed_context_message)
|
@@ -685,8 +701,9 @@ def clean_code_python(code: str):
|
|
685
701
|
|
686
702
|
def load_complex_json(json_str):
|
687
703
|
"""
|
688
|
-
Preprocess a raw JSON string to
|
689
|
-
while preserving the JSON structure and already escaped quotes.
|
704
|
+
Preprocess a raw JSON string to
|
705
|
+
- escape unescaped double quotes within value strings while preserving the JSON structure and already escaped quotes.
|
706
|
+
- remove suffix after the first valid JSON object,
|
690
707
|
"""
|
691
708
|
|
692
709
|
def replace_unescaped_quotes(match):
|
@@ -714,9 +731,20 @@ def load_complex_json(json_str):
|
|
714
731
|
for loads in json_loaders_to_try:
|
715
732
|
try:
|
716
733
|
return loads(processed)
|
717
|
-
except (json.JSONDecodeError, pyjson5.Json5Exception) as
|
718
|
-
|
719
|
-
|
734
|
+
except (json.JSONDecodeError, pyjson5.Json5Exception) as e_load:
|
735
|
+
loader_name = loads.__name__
|
736
|
+
errors.append(f"{loader_name} (initial parse): {type(e_load).__name__}: {str(e_load)}")
|
737
|
+
|
738
|
+
# Handle plain text suffixes by slicing at error position
|
739
|
+
if hasattr(e_load, "pos") and 0 < e_load.pos < len(processed):
|
740
|
+
try:
|
741
|
+
sliced = processed[: e_load.pos].strip()
|
742
|
+
if sliced:
|
743
|
+
return loads(sliced)
|
744
|
+
except Exception as e_slice:
|
745
|
+
errors.append(
|
746
|
+
f"{loader_name} after slice at {e_load.pos}: {type(e_slice).__name__}: {str(e_slice)}"
|
747
|
+
)
|
720
748
|
# If all loaders fail, raise the aggregated error
|
721
749
|
raise ValueError(
|
722
750
|
f"Failed to load JSON with errors: {'; '.join(errors)}\n\n"
|
@@ -0,0 +1,345 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
|
4
|
+
from openai import AzureOpenAI, OpenAI
|
5
|
+
from openai.types.chat import ChatCompletion, ChatCompletionMessage
|
6
|
+
|
7
|
+
from khoj.database.models import ChatModel
|
8
|
+
from khoj.processor.conversation.utils import construct_structured_message
|
9
|
+
from khoj.processor.operator.operator_actions import *
|
10
|
+
from khoj.processor.operator.operator_agent_base import AgentActResult
|
11
|
+
from khoj.processor.operator.operator_environment_base import EnvState
|
12
|
+
from khoj.utils.helpers import get_chat_usage_metrics
|
13
|
+
|
14
|
+
logger = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
class GroundingAgent:
    """Turn a natural language instruction plus a screenshot into concrete browser actions.

    The agent sends the instruction and the current screenshot to a tool-calling
    chat model and converts the returned tool calls into operator action objects.

    Project and third-party types in signatures are written as string annotations
    so they are only resolved by type checkers, not at class-definition time.
    """

    def __init__(
        self,
        model: "ChatModel",
        client: "OpenAI | AzureOpenAI",
        max_iterations: int,
        tracer: dict = None,
    ):
        """Store the model, client and tracer, and build the tool schemas.

        Args:
            model: Chat model whose ``name`` is sent with every completion request.
            client: OpenAI-compatible client exposing ``chat.completions.create``.
            max_iterations: Iteration budget; stored here, not read by this class.
            tracer: Optional dict that accumulates usage metrics under ``"usage"``.
                A fresh dict is used when omitted.
        """
        self.model = model
        self.client = client
        self.max_iterations = max_iterations
        # Keep the caller's dict (even an empty one) so usage recorded in `act`
        # stays visible to the caller; fall back to a private dict only for None.
        self.tracer = tracer if tracer is not None else {}

        # Define tools for the grounding LLM (OpenAI format)
        self.action_tools = self._build_action_tools()

    @staticmethod
    def _function_tool(name: str, description: str, properties: dict, required: "list | None" = None) -> dict:
        """Build one OpenAI function-tool schema; ``required`` is omitted from the schema when None."""
        parameters = {"type": "object", "properties": properties}
        if required is not None:
            parameters["required"] = required
        return {
            "type": "function",
            "function": {"name": name, "description": description, "parameters": parameters},
        }

    @classmethod
    def _build_action_tools(cls) -> list:
        """Return the list of tool schemas the grounding LLM may call."""

        def point_properties() -> dict:
            # Fresh dict per tool so the schemas never share mutable state.
            return {
                "x": {"type": "integer", "description": "X coordinate"},
                "y": {"type": "integer", "description": "Y coordinate"},
            }

        tool = cls._function_tool
        return [
            tool("click", "Click on a specific coordinate.", point_properties(), ["x", "y"]),
            tool("left_double", "Double click on a specific coordinate.", point_properties(), ["x", "y"]),
            tool("right_single", "Right click on a specific coordinate.", point_properties(), ["x", "y"]),
            tool(
                "drag",
                "Perform a drag-and-drop operation along a path.",
                {
                    "path": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "x": {"type": "integer"},
                                "y": {"type": "integer"},
                            },
                            "required": ["x", "y"],
                        },
                        "description": "List of points (x, y coordinates) defining the drag path.",
                    }
                },
                ["path"],
            ),
            tool(
                "hotkey",
                "Press a key or key combination.",
                {
                    "keys": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of keys to press (e.g., ['Control', 'a'], ['Enter'])",
                    }
                },
                ["keys"],
            ),
            tool(
                "type",
                "Type text, usually into a focused input field.",
                {"content": {"type": "string", "description": "Text to type"}},
                ["content"],
            ),
            tool(
                "scroll",
                "Scroll the page.",
                {
                    "x": {"type": "integer", "description": "X coordinate to scroll from"},
                    "y": {"type": "integer", "description": "Y coordinate to scroll from"},
                    "direction": {
                        "type": "string",
                        "enum": ["up", "down", "left", "right"],
                        "default": "down",
                    },
                },
                [],  # None is strictly required
            ),
            tool(
                "wait",
                "Pause execution for a specified duration.",
                {"duration": {"type": "number", "description": "Duration in seconds", "default": 1.0}},
                [],
            ),
            tool(
                "goto",
                "Navigate to a specific URL.",
                {"url": {"type": "string", "description": "Fully qualified URL"}},
                ["url"],
            ),
            # `back` takes no arguments and declares no `required` list.
            tool("back", "navigate back to the previous page.", {}),
        ]

    async def act(self, instruction: str, current_state: "EnvState") -> "tuple[str, list[OperatorAction]]":
        """Call the grounding LLM to get the next action based on the current state and instruction.

        Returns:
            A ``(rendered_response, actions)`` tuple. On any failure while calling
            the model or parsing its reply, the rendered response is an error
            message and the action list is empty.
        """
        # Local import keeps this block self-contained; only needed in this method.
        import inspect

        # Format the message for the API call
        messages_for_api = self._format_message_for_api(instruction, current_state)
        try:
            grounding_response = self.client.chat.completions.create(
                messages=messages_for_api,
                model=self.model.name,
                tools=self.action_tools,
                tool_choice="required",
                temperature=0.0,  # Grounding should be precise
                max_completion_tokens=1000,  # Allow for thoughts + actions
            )
            # The client is annotated as a synchronous OpenAI client, whose `create`
            # returns the completion directly; async clients return an awaitable.
            # Await only when needed so both kinds work.
            if inspect.isawaitable(grounding_response):
                grounding_response = await grounding_response
            if not isinstance(grounding_response, ChatCompletion):
                raise ValueError("Grounding LLM response is not of type ChatCompletion.")
            logger.debug(f"Grounding LLM response: {grounding_response.model_dump_json()}")

            # Parse tool calls
            grounding_message = grounding_response.choices[0].message
            rendered_response, actions = self._parse_action(grounding_message, instruction, current_state)
        except Exception as e:
            logger.error(f"Error calling Grounding LLM: {e}")
            return f"**Error**: Error contacting Grounding LLM: {e}", []

        # Update usage by grounding model. This is bookkeeping only: a missing
        # `usage` field or a metrics failure must not discard the parsed actions.
        usage = grounding_response.usage
        if usage is not None:
            try:
                self.tracer["usage"] = get_chat_usage_metrics(
                    self.model.name,
                    input_tokens=usage.prompt_tokens,
                    output_tokens=usage.completion_tokens,
                    usage=self.tracer.get("usage"),
                )
            except Exception as e:
                logger.warning(f"Failed to record Grounding LLM usage: {e}")

        return rendered_response, actions

    def _format_message_for_api(self, instruction: str, current_state: "EnvState") -> list:
        """Format the message for the API call.

        Builds a single user message holding the grounding prompt (with the
        instruction embedded) and the current screenshot as a webp data URL.
        """
        grounding_user_prompt = f"""
You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to decide the next action to complete the task.
You control a single tab in a Chromium browser. You cannot access the OS, filesystem or the application window.
Always use the `goto` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed.

## Output Format
```
Thought: ...
Action: ...
```

## Action Space

click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
goto(url='xxx') # Always use this to navigate to a specific URL. Use escape characters \\', \\", and \\n in url part to ensure we can parse the url in normal python string format.
back() # Use this to go back to the previous page.

## Note
- Use English in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
{instruction}
""".lstrip()

        # Construct grounding LLM input (using only the latest user prompt + image)
        # We don't pass the full history here, as grounding depends on the *current* state + NL action
        screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
        grounding_messages_content = construct_structured_message(
            grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
        )
        return [{"role": "user", "content": grounding_messages_content}]

    def _parse_action(
        self, grounding_message: "ChatCompletionMessage", instruction: str, current_state: "EnvState"
    ) -> "tuple[str, list[OperatorAction]]":
        """Parse the tool calls from the grounding LLM response and convert them to action objects.

        ``instruction`` and ``current_state`` are accepted for interface
        compatibility; they are not read here.

        Returns:
            A ``(rendered_response, actions)`` tuple. Tool calls whose arguments
            cannot be parsed or validated are reported in the rendered response
            and produce no action.
        """
        actions: "list[OperatorAction]" = []

        if not grounding_message.tool_calls:
            # Grounding LLM responded but didn't call a tool
            logger.warning("Grounding LLM did not produce a tool call.")
            return f"{grounding_message.content or 'No action required.'}", actions

        rendered_parts = []
        for tool_call in grounding_message.tool_calls:
            function_name = tool_call.function.name
            raw_arguments = tool_call.function.arguments
            try:
                # Parameterless tools (e.g. `back`) may arrive with an empty
                # arguments string; treat that as "no arguments".
                arguments = json.loads(raw_arguments) if raw_arguments else {}
                if not isinstance(arguments, dict):
                    # Surface non-object payloads through the handler below instead
                    # of failing later with an uncaught AttributeError.
                    raise TypeError(f"expected a JSON object, got {type(arguments).__name__}")

                action_to_run = None
                action_render_str = f"**Action ({function_name})**: {tool_call.function.arguments}"

                if function_name == "click":
                    action_to_run = ClickAction(**arguments)
                elif function_name == "left_double":
                    action_to_run = DoubleClickAction(**arguments)
                elif function_name == "right_single":
                    action_to_run = ClickAction(button="right", **arguments)
                elif function_name == "type":
                    action_to_run = TypeAction(text=arguments.get("content"))
                elif function_name == "scroll":
                    direction = arguments.get("direction", "down")
                    amount = 3
                    # NOTE(review): `arguments` may still carry "direction"; this
                    # presumably relies on ScrollAction ignoring unknown fields - confirm.
                    action_to_run = ScrollAction(scroll_direction=direction, scroll_amount=amount, **arguments)
                elif function_name == "hotkey":
                    action_to_run = KeypressAction(**arguments)
                elif function_name == "goto":
                    action_to_run = GotoAction(**arguments)
                elif function_name == "back":
                    action_to_run = BackAction(**arguments)
                elif function_name == "wait":
                    action_to_run = WaitAction(**arguments)
                elif function_name == "screenshot":
                    action_to_run = ScreenshotAction(**arguments)
                elif function_name == "drag":
                    # Need to convert list of dicts to list of Point objects
                    path_points = [Point(**p) for p in arguments.get("path", [])]
                    if path_points:
                        action_to_run = DragAction(path=path_points)
                    else:
                        logger.warning(f"Drag action called with empty path: {arguments}")
                        action_render_str += " [Skipped - empty path]"
                elif function_name == "finished":
                    action_to_run = None
                else:
                    logger.warning(f"Grounding LLM called unhandled tool: {function_name}")
                    action_render_str += " [Unhandled]"

                if action_to_run is not None:
                    actions.append(action_to_run)
                rendered_parts.append(action_render_str)
            except (json.JSONDecodeError, TypeError, ValueError) as arg_err:
                logger.error(
                    f"Error parsing arguments for tool {function_name}: {arg_err} - Args: {tool_call.function.arguments}"
                )
                rendered_parts.append(f"**Error**: Failed to parse arguments for {function_name}")

        # Render the response
        return "\n- ".join(rendered_parts), actions

    def reset(self):
        """Reset the agent state. No-op: this agent keeps no per-run state to clear."""
        pass
|