khoj-1.41.1.dev43-py3-none-any.whl → khoj-1.41.1.dev97-py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between the package versions as they appear in their respective public registries.
Files changed (55)
  1. khoj/database/adapters/__init__.py +17 -6
  2. khoj/interface/compiled/404/index.html +2 -2
  3. khoj/interface/compiled/_next/static/chunks/{2327-f03b2a77f67b8f8c.js → 2327-aa22697ed9c8d54a.js} +1 -1
  4. khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
  5. khoj/interface/compiled/_next/static/chunks/{8515-010dd769c584b672.js → 8515-f305779d95dd5780.js} +1 -1
  6. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +1 -0
  8. khoj/interface/compiled/_next/static/chunks/app/chat/{page-14ac9d1ad5cb84c5.js → page-7e780dc11eb5e5d3.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/{webpack-1169ca6e9e7e6247.js → webpack-21f76f7f59582bc7.js} +1 -1
  10. khoj/interface/compiled/agents/index.html +2 -2
  11. khoj/interface/compiled/agents/index.txt +2 -2
  12. khoj/interface/compiled/automations/index.html +2 -2
  13. khoj/interface/compiled/automations/index.txt +2 -2
  14. khoj/interface/compiled/chat/index.html +2 -2
  15. khoj/interface/compiled/chat/index.txt +2 -2
  16. khoj/interface/compiled/index.html +2 -2
  17. khoj/interface/compiled/index.txt +2 -2
  18. khoj/interface/compiled/search/index.html +2 -2
  19. khoj/interface/compiled/search/index.txt +2 -2
  20. khoj/interface/compiled/settings/index.html +2 -2
  21. khoj/interface/compiled/settings/index.txt +2 -2
  22. khoj/interface/compiled/share/chat/index.html +2 -2
  23. khoj/interface/compiled/share/chat/index.txt +2 -2
  24. khoj/processor/conversation/anthropic/anthropic_chat.py +7 -2
  25. khoj/processor/conversation/anthropic/utils.py +37 -19
  26. khoj/processor/conversation/google/gemini_chat.py +7 -2
  27. khoj/processor/conversation/offline/chat_model.py +2 -2
  28. khoj/processor/conversation/openai/gpt.py +7 -2
  29. khoj/processor/conversation/prompts.py +13 -2
  30. khoj/processor/conversation/utils.py +34 -6
  31. khoj/processor/operator/grounding_agent.py +345 -0
  32. khoj/processor/operator/grounding_agent_uitars.py +973 -0
  33. khoj/processor/operator/operate_browser.py +165 -0
  34. khoj/processor/operator/operator_actions.py +149 -0
  35. khoj/processor/operator/operator_agent_anthropic.py +402 -0
  36. khoj/processor/operator/operator_agent_base.py +80 -0
  37. khoj/processor/operator/operator_agent_binary.py +336 -0
  38. khoj/processor/operator/operator_agent_openai.py +349 -0
  39. khoj/processor/operator/operator_environment_base.py +37 -0
  40. khoj/processor/operator/operator_environment_browser.py +395 -0
  41. khoj/routers/api_chat.py +44 -6
  42. khoj/routers/helpers.py +18 -8
  43. khoj/routers/research.py +48 -1
  44. khoj/utils/constants.py +6 -0
  45. khoj/utils/helpers.py +17 -0
  46. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/METADATA +4 -2
  47. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/RECORD +52 -42
  48. khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
  49. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +0 -1
  50. khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +0 -1
  51. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → o6zlo73DbD2lS92jWHS8o}/_buildManifest.js +0 -0
  52. /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → o6zlo73DbD2lS92jWHS8o}/_ssgManifest.js +0 -0
  53. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/WHEEL +0 -0
  54. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/entry_points.txt +0 -0
  55. {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev97.dist-info}/licenses/LICENSE +0 -0
@@ -736,7 +736,7 @@ Create a multi-step plan and intelligently iterate on the plan based on the retr
  - Ask highly diverse, detailed queries to the tool AIs, one tool AI at a time, to discover required information or run calculations. Their response will be shown to you in the next iteration.
  - Break down your research process into independent, self-contained steps that can be executed sequentially using the available tool AIs to answer the user's query. Write your step-by-step plan in the scratchpad.
  - Always ask a new query that was not asked to the tool AI in a previous iteration. Build on the results of the previous iterations.
- - Ensure that all required context is passed to the tool AIs for successful execution. They only know the context provided in your query.
+ - Ensure that all required context is passed to the tool AIs for successful execution. Include any relevant stuff that has previously been attempted. They only know the context provided in your query.
  - Think step by step to come up with creative strategies when the previous iteration did not yield useful results.
  - You are allowed upto {max_iterations} iterations to use the help of the provided tool AIs to answer the user's question.
  - Stop when you have the required information by returning a JSON object with the "tool" field set to "text" and "query" field empty. E.g., {{"scratchpad": "I have all I need", "tool": "text", "query": ""}}
@@ -766,7 +766,7 @@ You decide which of the tool AIs listed below would you use to answer the user's

  {tools}

- Your response should always be a valid JSON object. Do not say anything else.
+ Your response should always be a valid JSON object with keys: "scratchpad" (str), "tool" (str) and "query" (str). Do not say anything else.
  Response format:
  {{"scratchpad": "<your_scratchpad_to_reason_about_which_tool_to_use>", "tool": "<name_of_tool_ai>", "query": "<your_detailed_query_for_the_tool_ai>"}}
  """.strip()
@@ -1119,6 +1119,16 @@ terrarium_sandbox_context = """
  - The sandbox has access to only the standard library and the matplotlib, pandas, numpy, scipy, bs5 and sympy packages. The requests, torch, catboost, tensorflow, rdkit and tkinter packages are not available.
  """.strip()

+ operator_execution_context = PromptTemplate.from_template(
+ """
+ Use the results of operating a web browser to inform your response.
+
+ Browser Operation Results:
+ {operator_results}
+ """.strip()
+ )
+
+
  # Automations
  # --
  crontime_prompt = PromptTemplate.from_template(
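For context, a minimal sketch (not from the package) of how the new operator_execution_context template could be rendered; the sample operator_results value is hypothetical:
```
# Hypothetical usage of the new template; assumes the PromptTemplate class already imported in prompts.py.
operator_results = {"Open the Khoj docs": "Visited https://docs.khoj.dev and extracted the setup steps."}
print(operator_execution_context.format(operator_results=operator_results))
# Use the results of operating a web browser to inform your response.
#
# Browser Operation Results:
# {'Open the Khoj docs': 'Visited https://docs.khoj.dev and extracted the setup steps.'}
```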
@@ -1371,6 +1381,7 @@ help_message = PromptTemplate.from_template(
  - **/online**: Chat using the internet as a source of information.
  - **/image**: Generate an image based on your message.
  - **/research**: Go deeper in a topic for more accurate, in-depth responses.
+ - **/operator**: Use a web browser to execute actions and search for information.
  - **/help**: Show this help message.

  You are using the **{model}** model on the **{device}**.
@@ -73,6 +73,10 @@ model_to_prompt_size = {
  "claude-3-7-sonnet-20250219": 60000,
  "claude-3-7-sonnet-latest": 60000,
  "claude-3-5-haiku-20241022": 60000,
+ "claude-sonnet-4": 60000,
+ "claude-sonnet-4-20250514": 60000,
+ "claude-opus-4": 60000,
+ "claude-opus-4-20250514": 60000,
  # Offline Models
  "bartowski/Qwen2.5-14B-Instruct-GGUF": 20000,
  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF": 20000,
@@ -91,6 +95,7 @@ class InformationCollectionIteration:
  context: list = None,
  onlineContext: dict = None,
  codeContext: dict = None,
+ operatorContext: dict[str, str] = None,
  summarizedResult: str = None,
  warning: str = None,
  ):
@@ -99,6 +104,7 @@
  self.context = context
  self.onlineContext = onlineContext
  self.codeContext = codeContext
+ self.operatorContext = operatorContext
  self.summarizedResult = summarizedResult
  self.warning = warning

@@ -187,6 +193,9 @@ def construct_tool_chat_history(
  ConversationCommand.Code: (
  lambda iteration: list(iteration.codeContext.keys()) if iteration.codeContext else []
  ),
+ ConversationCommand.Operator: (
+ lambda iteration: list(iteration.operatorContext.keys()) if iteration.operatorContext else []
+ ),
  }
  for iteration in previous_iterations:
  # If a tool is provided use the inferred query extractor for that tool if available
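As a rough illustration (values are hypothetical), operatorContext appears to be keyed by the query sent to the operator tool, mirroring codeContext, so only its keys feed the reconstructed tool chat history:
```
# Hypothetical operatorContext payload; construct_tool_chat_history reads only its keys.
operator_context = {
    "Find the latest Khoj release notes": "Opened the releases page and summarized the highlights.",
}
inferred_queries = list(operator_context.keys())  # ["Find the latest Khoj release notes"]
```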
@@ -265,6 +274,7 @@ async def save_to_conversation_log(
  compiled_references: List[Dict[str, Any]] = [],
  online_results: Dict[str, Any] = {},
  code_results: Dict[str, Any] = {},
+ operator_results: Dict[str, str] = {},
  inferred_queries: List[str] = [],
  intent_type: str = "remember",
  client_application: ClientApplication = None,
@@ -291,6 +301,7 @@
  "intent": {"inferred-queries": inferred_queries, "type": intent_type},
  "onlineContext": online_results,
  "codeContext": code_results,
+ "operatorContext": operator_results,
  "automationId": automation_id,
  "trainOfThought": train_of_thought,
  "turnId": turn_id,
@@ -380,7 +391,7 @@ def gather_raw_query_files(


  def generate_chatml_messages_with_context(
- user_message,
+ user_message: str,
  system_message: str = None,
  conversation_log={},
  model_name="gpt-4o-mini",
@@ -447,6 +458,11 @@ generate_chatml_messages_with_context(
  if not is_none_or_empty(chat.get("codeContext")):
  message_context += f"{prompts.code_executed_context.format(code_results=chat.get('codeContext'))}"

+ if not is_none_or_empty(chat.get("operatorContext")):
+ message_context += (
+ f"{prompts.operator_execution_context.format(operator_results=chat.get('operatorContext'))}"
+ )
+
  if not is_none_or_empty(message_context):
  reconstructed_context_message = ChatMessage(content=message_context, role="user")
  chatml_messages.insert(0, reconstructed_context_message)
@@ -685,8 +701,9 @@ def clean_code_python(code: str):

  def load_complex_json(json_str):
  """
- Preprocess a raw JSON string to escape unescaped double quotes within value strings,
- while preserving the JSON structure and already escaped quotes.
+ Preprocess a raw JSON string to
+ - escape unescaped double quotes within value strings while preserving the JSON structure and already escaped quotes.
+ - remove suffix after the first valid JSON object,
  """

  def replace_unescaped_quotes(match):
@@ -714,9 +731,20 @@ def load_complex_json(json_str):
  for loads in json_loaders_to_try:
  try:
  return loads(processed)
- except (json.JSONDecodeError, pyjson5.Json5Exception) as e:
- errors.append(f"{type(e).__name__}: {str(e)}")
-
+ except (json.JSONDecodeError, pyjson5.Json5Exception) as e_load:
+ loader_name = loads.__name__
+ errors.append(f"{loader_name} (initial parse): {type(e_load).__name__}: {str(e_load)}")
+
+ # Handle plain text suffixes by slicing at error position
+ if hasattr(e_load, "pos") and 0 < e_load.pos < len(processed):
+ try:
+ sliced = processed[: e_load.pos].strip()
+ if sliced:
+ return loads(sliced)
+ except Exception as e_slice:
+ errors.append(
+ f"{loader_name} after slice at {e_load.pos}: {type(e_slice).__name__}: {str(e_slice)}"
+ )
  # If all loaders fail, raise the aggregated error
  raise ValueError(
  f"Failed to load JSON with errors: {'; '.join(errors)}\n\n"
@@ -0,0 +1,345 @@
+ import json
+ import logging
+
+ from openai import AzureOpenAI, OpenAI
+ from openai.types.chat import ChatCompletion, ChatCompletionMessage
+
+ from khoj.database.models import ChatModel
+ from khoj.processor.conversation.utils import construct_structured_message
+ from khoj.processor.operator.operator_actions import *
+ from khoj.processor.operator.operator_agent_base import AgentActResult
+ from khoj.processor.operator.operator_environment_base import EnvState
+ from khoj.utils.helpers import get_chat_usage_metrics
+
+ logger = logging.getLogger(__name__)
+
+
+ class GroundingAgent:
+     def __init__(
+         self,
+         model: ChatModel,
+         client: OpenAI | AzureOpenAI,
+         max_iterations: int,
+         tracer: dict = None,
+     ):
+         self.model = model
+         self.client = client
+         self.max_iterations = max_iterations
+         self.tracer = tracer
+
+         # Define tools for the grounding LLM (OpenAI format)
+         self.action_tools = [
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "click",
+                     "description": "Click on a specific coordinate.",
+                     "parameters": {
+                         "type": "object",
+                         "properties": {
+                             "x": {"type": "integer", "description": "X coordinate"},
+                             "y": {"type": "integer", "description": "Y coordinate"},
+                         },
+                         "required": ["x", "y"],
+                     },
+                 },
+             },
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "left_double",
+                     "description": "Double click on a specific coordinate.",
+                     "parameters": {
+                         "type": "object",
+                         "properties": {
+                             "x": {"type": "integer", "description": "X coordinate"},
+                             "y": {"type": "integer", "description": "Y coordinate"},
+                         },
+                         "required": ["x", "y"],
+                     },
+                 },
+             },
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "right_single",
+                     "description": "Right click on a specific coordinate.",
+                     "parameters": {
+                         "type": "object",
+                         "properties": {
+                             "x": {"type": "integer", "description": "X coordinate"},
+                             "y": {"type": "integer", "description": "Y coordinate"},
+                         },
+                         "required": ["x", "y"],
+                     },
+                 },
+             },
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "drag",
+                     "description": "Perform a drag-and-drop operation along a path.",
+                     "parameters": {
+                         "type": "object",
+                         "properties": {
+                             "path": {
+                                 "type": "array",
+                                 "items": {
+                                     "type": "object",
+                                     "properties": {
+                                         "x": {"type": "integer"},
+                                         "y": {"type": "integer"},
+                                     },
+                                     "required": ["x", "y"],
+                                 },
+                                 "description": "List of points (x, y coordinates) defining the drag path.",
+                             }
+                         },
+                         "required": ["path"],
+                     },
+                 },
+             },
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "hotkey",
+                     "description": "Press a key or key combination.",
+                     "parameters": {
+                         "type": "object",
+                         "properties": {
+                             "keys": {
+                                 "type": "array",
+                                 "items": {"type": "string"},
+                                 "description": "List of keys to press (e.g., ['Control', 'a'], ['Enter'])",
+                             }
+                         },
+                         "required": ["keys"],
+                     },
+                 },
+             },
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "type",
+                     "description": "Type text, usually into a focused input field.",
+                     "parameters": {
+                         "type": "object",
+                         "properties": {"content": {"type": "string", "description": "Text to type"}},
+                         "required": ["content"],
+                     },
+                 },
+             },
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "scroll",
+                     "description": "Scroll the page.",
+                     "parameters": {
+                         "type": "object",
+                         "properties": {
+                             "x": {"type": "integer", "description": "X coordinate to scroll from"},
+                             "y": {"type": "integer", "description": "Y coordinate to scroll from"},
+                             "direction": {
+                                 "type": "string",
+                                 "enum": ["up", "down", "left", "right"],
+                                 "default": "down",
+                             },
+                         },
+                         "required": [], # None is strictly required
+                     },
+                 },
+             },
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "wait",
+                     "description": "Pause execution for a specified duration.",
+                     "parameters": {
+                         "type": "object",
+                         "properties": {
+                             "duration": {"type": "number", "description": "Duration in seconds", "default": 1.0}
+                         },
+                         "required": [],
+                     },
+                 },
+             },
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "goto",
+                     "description": "Navigate to a specific URL.",
+                     "parameters": {
+                         "type": "object",
+                         "properties": {"url": {"type": "string", "description": "Fully qualified URL"}},
+                         "required": ["url"],
+                     },
+                 },
+             },
+             {
+                 "type": "function",
+                 "function": {
+                     "name": "back",
+                     "description": "navigate back to the previous page.",
+                     "parameters": {"type": "object", "properties": {}},
+                 },
+             },
+         ]
+
+     async def act(self, instruction: str, current_state: EnvState) -> tuple[str, list[OperatorAction]]:
+         """Call the grounding LLM to get the next action based on the current state and instruction."""
+         # Format the message for the API call
+         messages_for_api = self._format_message_for_api(instruction, current_state)
+         try:
+             grounding_response: ChatCompletion = await self.client.chat.completions.create(
+                 messages=messages_for_api,
+                 model=self.model.name,
+                 tools=self.action_tools,
+                 tool_choice="required",
+                 temperature=0.0, # Grounding should be precise
+                 max_completion_tokens=1000, # Allow for thoughts + actions
+             )
+             if not isinstance(grounding_response, ChatCompletion):
+                 raise ValueError("Grounding LLM response is not of type ChatCompletion.")
+             logger.debug(f"Grounding LLM response: {grounding_response.model_dump_json()}")
+
+             # Parse tool calls
+             grounding_message = grounding_response.choices[0].message
+             rendered_response, actions = self._parse_action(grounding_message, instruction, current_state)
+
+             # Update usage by grounding model
+             self.tracer["usage"] = get_chat_usage_metrics(
+                 self.model.name,
+                 input_tokens=grounding_response.usage.prompt_tokens,
+                 output_tokens=grounding_response.usage.completion_tokens,
+                 usage=self.tracer.get("usage"),
+             )
+         except Exception as e:
+             logger.error(f"Error calling Grounding LLM: {e}")
+             rendered_response = f"**Error**: Error contacting Grounding LLM: {e}"
+             actions = []
+
+         return rendered_response, actions
+
+     def _format_message_for_api(self, instruction: str, current_state: EnvState) -> List:
+         """Format the message for the API call."""
+         grounding_user_prompt = f"""
+ You are a GUI agent. You are given a task and a screenshot of the web browser tab you operate. You need to decide the next action to complete the task.
+ You control a single tab in a Chromium browser. You cannot access the OS, filesystem or the application window.
+ Always use the `goto` function to navigate to a specific URL. Ctrl+t, Ctrl+w, Ctrl+q, Ctrl+Shift+T, Ctrl+Shift+W are not allowed.
+
+ ## Output Format
+ ```
+ Thought: ...
+ Action: ...
+ ```
+
+ ## Action Space
+
+ click(start_box='<|box_start|>(x1,y1)<|box_end|>')
+ left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
+ right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
+ drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
+ hotkey(key='')
+ type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
+ scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
+ wait(duration='time') # Sleep for specified time. Default is 1s and take a screenshot to check for any changes.
+ goto(url='xxx') # Always use this to navigate to a specific URL. Use escape characters \\', \\", and \\n in url part to ensure we can parse the url in normal python string format.
+ back() # Use this to go back to the previous page.
+
+ ## Note
+ - Use English in `Thought` part.
+ - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+
+ ## User Instruction
+ {instruction}
+ """.lstrip()
+
+         # Construct grounding LLM input (using only the latest user prompt + image)
+         # We don't pass the full history here, as grounding depends on the *current* state + NL action
+         screenshots = [f"data:image/webp;base64,{current_state.screenshot}"]
+         grounding_messages_content = construct_structured_message(
+             grounding_user_prompt, screenshots, self.model.name, vision_enabled=True
+         )
+         return [{"role": "user", "content": grounding_messages_content}]
+
+     def _parse_action(
+         self, grounding_message: ChatCompletionMessage, instruction: str, current_state: EnvState
+     ) -> tuple[str, list[OperatorAction]]:
+         """Parse the tool calls from the grounding LLM response and convert them to action objects."""
+         actions: List[OperatorAction] = []
+         action_results: List[dict] = []
+
+         if grounding_message.tool_calls:
+             rendered_parts = []
+             for tool_call in grounding_message.tool_calls:
+                 function_name = tool_call.function.name
+                 try:
+                     arguments = json.loads(tool_call.function.arguments)
+                     action_to_run: Optional[OperatorAction] = None
+                     action_render_str = f"**Action ({function_name})**: {tool_call.function.arguments}"
+
+                     if function_name == "click":
+                         action_to_run = ClickAction(**arguments)
+                     elif function_name == "left_double":
+                         action_to_run = DoubleClickAction(**arguments)
+                     elif function_name == "right_single":
+                         action_to_run = ClickAction(button="right", **arguments)
+                     elif function_name == "type":
+                         content = arguments.get("content")
+                         action_to_run = TypeAction(text=content)
+                     elif function_name == "scroll":
+                         direction = arguments.get("direction", "down")
+                         amount = 3
+                         action_to_run = ScrollAction(scroll_direction=direction, scroll_amount=amount, **arguments)
+                     elif function_name == "hotkey":
+                         action_to_run = KeypressAction(**arguments)
+                     elif function_name == "goto":
+                         action_to_run = GotoAction(**arguments)
+                     elif function_name == "back":
+                         action_to_run = BackAction(**arguments)
+                     elif function_name == "wait":
+                         action_to_run = WaitAction(**arguments)
+                     elif function_name == "screenshot":
+                         action_to_run = ScreenshotAction(**arguments)
+                     elif function_name == "drag":
+                         # Need to convert list of dicts to list of Point objects
+                         path_dicts = arguments.get("path", [])
+                         path_points = [Point(**p) for p in path_dicts]
+                         if path_points:
+                             action_to_run = DragAction(path=path_points)
+                         else:
+                             logger.warning(f"Drag action called with empty path: {arguments}")
+                             action_render_str += " [Skipped - empty path]"
+                     elif function_name == "finished":
+                         action_to_run = None
+                     else:
+                         logger.warning(f"Grounding LLM called unhandled tool: {function_name}")
+                         action_render_str += " [Unhandled]"
+
+                     if action_to_run:
+                         actions.append(action_to_run)
+                         action_results.append(
+                             {
+                                 "type": "tool_result",
+                                 "tool_call_id": tool_call.id,
+                                 "content": None, # Updated after environment step
+                             }
+                         )
+                     rendered_parts.append(action_render_str)
+                 except (json.JSONDecodeError, TypeError, ValueError) as arg_err:
+                     logger.error(
+                         f"Error parsing arguments for tool {function_name}: {arg_err} - Args: {tool_call.function.arguments}"
+                     )
+                     rendered_parts.append(f"**Error**: Failed to parse arguments for {function_name}")
+             rendered_response = "\n- ".join(rendered_parts)
+         else:
+             # Grounding LLM responded but didn't call a tool
+             logger.warning("Grounding LLM did not produce a tool call.")
+             rendered_response = f"{grounding_message.content or 'No action required.'}"
+
+         # Render the response
+         return rendered_response, actions
+
+     def reset(self):
+         """Reset the agent state."""
+         pass
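The new GroundingAgent is presumably driven by the other operator modules added in this release (see operate_browser.py and the operator_agent_* files in the list above). A rough usage sketch, with hypothetical chat_model, openai_client, and env_state values, might look like:
```
# Hypothetical wiring; the ChatModel, client and EnvState arguments are illustrative only.
# Note: act() awaits client.chat.completions.create, so an async-capable OpenAI client is assumed here.
import asyncio

async def demo(chat_model, openai_client, env_state):
    agent = GroundingAgent(model=chat_model, client=openai_client, max_iterations=10, tracer={"usage": None})
    rendered, actions = await agent.act("Open https://docs.khoj.dev and find the self-hosting guide", env_state)
    print(rendered)                             # e.g. '**Action (goto)**: {"url": "https://docs.khoj.dev"}'
    print([type(a).__name__ for a in actions])  # e.g. ['GotoAction']

# asyncio.run(demo(chat_model, openai_client, env_state))
```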