cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of cua-agent was flagged as a potentially problematic release.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/openai.py CHANGED
@@ -6,12 +6,14 @@ import asyncio
 import base64
 import json
 from io import BytesIO
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
 from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+
 
 async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     """Map a computer tool to OpenAI's computer-use-preview tool schema"""
@@ -21,26 +23,26 @@ async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     except Exception:
         # Fallback to default dimensions if method fails
         width, height = 1024, 768
-
+
     # Get environment from the computer handler
     try:
         environment = await computer_handler.get_environment()
     except Exception:
         # Fallback to default environment if method fails
         environment = "linux"
-
+
     return {
         "type": "computer_use_preview",
         "display_width": width,
         "display_height": height,
-        "environment": environment  # mac, windows, linux, browser
+        "environment": environment,  # mac, windows, linux, browser
     }
 
 
 async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
     """Prepare tools for OpenAI API format"""
     openai_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
             # Map computer tool to OpenAI format
@@ -49,19 +51,19 @@ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools
         elif schema["type"] == "function":
             # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
             # Schema should be: {type, name, description, parameters}
-            openai_tools.append({ "type": "function", **schema["function"] })
-
+            openai_tools.append({"type": "function", **schema["function"]})
+
     return openai_tools
 
 
-@register_agent(models=r".*computer-use-preview.*")
+@register_agent(models=r".*(^|/)computer-use-preview")
 class OpenAIComputerUseConfig:
     """
     OpenAI computer-use-preview agent configuration using liteLLM responses.
-
+
     Supports OpenAI's computer use preview models.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -75,11 +77,11 @@ class OpenAIComputerUseConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format
             model: Model name to use
@@ -92,12 +94,12 @@ class OpenAIComputerUseConfig:
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         tools = tools or []
-
+
         # Prepare tools for OpenAI API
         openai_tools = await _prepare_tools_for_openai(tools)
 
@@ -110,16 +112,16 @@ class OpenAIComputerUseConfig:
             "reasoning": {"summary": "concise"},
             "truncation": "auto",
             "num_retries": max_retries,
-            **kwargs
+            **kwargs,
         }
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Use liteLLM responses
         response = await litellm.aresponses(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
@@ -136,24 +138,21 @@ class OpenAIComputerUseConfig:
         output_dict = response.model_dump()
         output_dict["usage"] = usage
         return output_dict
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Uses OpenAI computer-use-preview with manually constructed input items
         and a prompt that instructs the agent to only output clicks.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """
@@ -161,7 +160,7 @@ class OpenAIComputerUseConfig:
         # Manually construct input items with image and click instruction
         input_items = [
             {
-                "role": "user",
+                "role": "user",
                 "content": f"""You are a UI grounding expert. Follow these guidelines:
 
 1. NEVER ask for confirmation. Complete all tasks autonomously.
@@ -173,19 +172,16 @@
 7. Be decisive and action-oriented. Complete the requested task fully.
 
 Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
-Task: Click {instruction}. Output ONLY a click action on the target element."""
+Task: Click {instruction}. Output ONLY a click action on the target element.""",
             },
             {
                 "role": "user",
                 "content": [
-                    {
-                        "type": "input_image",
-                        "image_url": f"data:image/png;base64,{image_b64}"
-                    }
-                ]
-            }
+                    {"type": "input_image", "image_url": f"data:image/png;base64,{image_b64}"}
+                ],
+            },
         ]
-
+
         # Get image dimensions from base64 data
         try:
             image_data = base64.b64decode(image_b64)
@@ -194,15 +190,15 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
         except Exception:
             # Fallback to default dimensions if image parsing fails
             display_width, display_height = 1024, 768
-
+
         # Prepare computer tool for click actions
         computer_tool = {
             "type": "computer_use_preview",
             "display_width": display_width,
             "display_height": display_height,
-            "environment": "windows"
+            "environment": "windows",
         }
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
@@ -211,32 +207,35 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
             "stream": False,
             "reasoning": {"summary": "concise"},
             "truncation": "auto",
-            "max_tokens": 200  # Keep response short for click prediction
+            "max_tokens": 200,  # Keep response short for click prediction
+            **kwargs,
         }
-
+
         # Use liteLLM responses
        response = await litellm.aresponses(**api_kwargs)
-
+
         # Extract click coordinates from response output
         output_dict = response.model_dump()
-        output_items = output_dict.get("output", [])
-
+        output_items = output_dict.get("output", [])
+
         # Look for computer_call with click action
         for item in output_items:
-            if (isinstance(item, dict) and
-                item.get("type") == "computer_call" and
-                isinstance(item.get("action"), dict)):
-
+            if (
+                isinstance(item, dict)
+                and item.get("type") == "computer_call"
+                and isinstance(item.get("action"), dict)
+            ):
+
                 action = item["action"]
                 if action.get("x") is not None and action.get("y") is not None:
                     return (int(action.get("x")), int(action.get("y")))
-
+
         return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """
agent/loops/opencua.py ADDED
@@ -0,0 +1,134 @@
+"""
+OpenCUA agent loop implementation for click prediction using litellm.acompletion
+Based on OpenCUA model for GUI grounding tasks.
+"""
+
+import asyncio
+import base64
+import json
+import math
+import re
+import uuid
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
+import litellm
+from PIL import Image
+
+from ..decorators import register_agent
+from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+from .composed_grounded import ComposedGroundedConfig
+
+
+def extract_coordinates_from_click(text: str) -> Optional[Tuple[int, int]]:
+    """Extract coordinates from click(x=..., y=...) or pyautogui.click(x=..., y=...) format.
+
+    This function supports parsing both generic click() and legacy pyautogui.click() formats
+    for backwards compatibility with models that may still output pyautogui format.
+    """
+    try:
+        # Look for click(x=1443, y=343) or pyautogui.click(x=1443, y=343) pattern
+        pattern = r"(?:pyautogui\.)?click\(x=(\d+),\s*y=(\d+)\)"
+        match = re.search(pattern, text)
+        if match:
+            x, y = int(match.group(1)), int(match.group(2))
+            return (x, y)
+        return None
+    except Exception:
+        return None
+
+
+@register_agent(models=r"(?i).*OpenCUA.*")
+class OpenCUAConfig(ComposedGroundedConfig):
+    """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction."""
+
+    def __init__(self):
+        super().__init__()
+        self.current_model = None
+        self.last_screenshot_b64 = None
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Fallback to a self-composed model"""
+        return await super().predict_step(
+            messages=messages,
+            model=f"{model}+{model}",
+            tools=tools,
+            max_retries=max_retries,
+            stream=stream,
+            computer_handler=computer_handler,
+            _on_api_start=_on_api_start,
+            _on_api_end=_on_api_end,
+            _on_usage=_on_usage,
+            _on_screenshot=_on_screenshot,
+            **kwargs,
+        )
+
+    async def predict_click(
+        self, model: str, image_b64: str, instruction: str, **kwargs
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates using OpenCUA model via litellm.acompletion.
+
+        Args:
+            model: The OpenCUA model name
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            Tuple of (x, y) coordinates or None if prediction fails
+        """
+        # Prepare system message
+        system_prompt = (
+            "You are a GUI agent. You are given a task and a screenshot of the screen. "
+            "You need to perform a series of click actions to complete the task."
+        )
+
+        system_message = {"role": "system", "content": system_prompt}
+
+        # Prepare user message with image and instruction
+        user_message = {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
+                {"type": "text", "text": f"Click on {instruction}"},
+            ],
+        }
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": [system_message, user_message],
+            "max_new_tokens": 2056,
+            "temperature": 0,
+            **kwargs,
+        }
+
+        # Use liteLLM acompletion
+        response = await litellm.acompletion(**api_kwargs)
+
+        # Extract response text
+        output_text = response.choices[0].message.content
+        # print(output_text)
+
+        # Extract coordinates from click format (supports both click() and pyautogui.click() for backwards compatibility)
+        coordinates = extract_coordinates_from_click(output_text)
+
+        return coordinates
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """Return the capabilities supported by this agent."""
+        return ["click"]
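
The key parsing piece in this new loop is `extract_coordinates_from_click`, which accepts both the generic `click(x=..., y=...)` output and the legacy `pyautogui.click(...)` form. A minimal standalone check of that regex follows; the regex is copied from the file above, while the sample strings are made up for illustration.

```python
import re
from typing import Optional, Tuple

# Regex copied from extract_coordinates_from_click in opencua.py above;
# the sample model outputs below are hypothetical.
def extract_coordinates_from_click(text: str) -> Optional[Tuple[int, int]]:
    match = re.search(r"(?:pyautogui\.)?click\(x=(\d+),\s*y=(\d+)\)", text)
    return (int(match.group(1)), int(match.group(2))) if match else None

print(extract_coordinates_from_click("click(x=1443, y=343)"))         # (1443, 343)
print(extract_coordinates_from_click("pyautogui.click(x=12, y=34)"))  # (12, 34)
print(extract_coordinates_from_click("no click action emitted"))      # None
```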
agent/loops/uiins.py ADDED
@@ -0,0 +1,175 @@
+"""
+UI-Ins agent loop implementation for click prediction using litellm.acompletion
+Paper: https://arxiv.org/pdf/2510.202861
+Code: https://github.com/alibaba/UI-Ins
+"""
+
+import asyncio
+import base64
+import json
+import math
+import re
+import uuid
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
+import litellm
+from PIL import Image
+
+from ..decorators import register_agent
+from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+
+SYSTEM_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\nReturn a json object with a reasoning process in tags, a function name and arguments within XML tags:\n```\n\n...\n\n\n{"name": "grounding", "arguments": }\n\n```\n represents the following item of the action space:\n## Action Space{"action": "click", "coordinate": [x, y]}\nYour task is to accurately locate a UI element based on the instruction. You should first analyze instruction in tags and finally output the function in tags.\n"""
+
+
+def parse_coordinates(raw_string: str) -> tuple[int, int]:
+    matches = re.findall(r"\[(\d+),\s*(\d+)\]", raw_string)
+    if matches:
+        return tuple(map(int, matches[0]))
+    return -1, -1
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = 28,
+    min_pixels: int = 3136,
+    max_pixels: int = 8847360,
+) -> Tuple[int, int]:
+    """Smart resize function similar to qwen_vl_utils."""
+    # Calculate the total pixels
+    total_pixels = height * width
+
+    # If already within bounds, return original dimensions
+    if min_pixels <= total_pixels <= max_pixels:
+        # Round to nearest factor
+        new_height = (height // factor) * factor
+        new_width = (width // factor) * factor
+        return new_height, new_width
+
+    # Calculate scaling factor
+    if total_pixels > max_pixels:
+        scale = (max_pixels / total_pixels) ** 0.5
+    else:
+        scale = (min_pixels / total_pixels) ** 0.5
+
+    # Apply scaling
+    new_height = int(height * scale)
+    new_width = int(width * scale)
+
+    # Round to nearest factor
+    new_height = (new_height // factor) * factor
+    new_width = (new_width // factor) * factor
+
+    # Ensure minimum size
+    new_height = max(new_height, factor)
+    new_width = max(new_width, factor)
+
+    return new_height, new_width
+
+
+@register_agent(models=r".*UI-Ins.*")
+class UIInsConfig(AsyncAgentConfig):
+    """UI-Ins agent configuration implementing AsyncAgentConfig protocol for click prediction."""
+
+    def __init__(self):
+        self.current_model = None
+        self.last_screenshot_b64 = None
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        raise NotImplementedError()
+
+    async def predict_click(
+        self, model: str, image_b64: str, instruction: str, **kwargs
+    ) -> Optional[Tuple[float, float]]:
+        """
+        Predict click coordinates using UI-Ins model via litellm.acompletion.
+
+        Args:
+            model: The UI-Ins model name
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            Tuple of (x, y) coordinates or None if prediction fails
+        """
+        # Decode base64 image
+        image_data = base64.b64decode(image_b64)
+        image = Image.open(BytesIO(image_data))
+        width, height = image.width, image.height
+
+        # Smart resize the image (similar to qwen_vl_utils)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=28,  # Default factor for Qwen models
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+        )
+        resized_image = image.resize((resized_width, resized_height))
+        scale_x, scale_y = width / resized_width, height / resized_height
+
+        # Convert resized image back to base64
+        buffered = BytesIO()
+        resized_image.save(buffered, format="PNG")
+        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()
+
+        # Prepare system and user messages
+        system_message = {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are a helpful assistant."},
+                {"type": "text", "text": SYSTEM_PROMPT},
+            ],
+        }
+
+        user_message = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
+                },
+                {"type": "text", "text": instruction},
+            ],
+        }
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": [system_message, user_message],
+            "max_tokens": 2056,
+            "temperature": 0.0,
+            **kwargs,
+        }
+
+        # Use liteLLM acompletion
+        response = await litellm.acompletion(**api_kwargs)
+
+        # Extract response text
+        output_text = response.choices[0].message.content  # type: ignore
+
+        # Extract and rescale coordinates
+        pred_x, pred_y = parse_coordinates(output_text)  # type: ignore
+        pred_x *= scale_x
+        pred_y *= scale_y
+
+        return (math.floor(pred_x), math.floor(pred_y))
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """Return the capabilities supported by this agent."""
+        return ["click"]
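
`UIInsConfig.predict_click` resizes the screenshot with `smart_resize` before sending it to the model, then multiplies the predicted `[x, y]` by `scale_x`/`scale_y` to map it back into original screenshot pixels. A small numeric sketch of that round-trip follows; the screenshot size and model output are hypothetical, and only the in-bounds branch of `smart_resize` (factor 28) from the file above is reproduced.

```python
# Hypothetical walk-through of the coordinate round-trip in predict_click.
factor = 28
width, height = 2560, 1440                  # hypothetical original screenshot size
resized_h = (height // factor) * factor     # 1428 (total pixels are within min/max bounds)
resized_w = (width // factor) * factor      # 2548
scale_x, scale_y = width / resized_w, height / resized_h

pred_x, pred_y = 640, 360                   # hypothetical model answer in resized-image space
x, y = int(pred_x * scale_x), int(pred_y * scale_y)
print((resized_w, resized_h), (x, y))       # (2548, 1428) (643, 363)
```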