cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/openai.py CHANGED
@@ -6,12 +6,14 @@ import asyncio
 import base64
 import json
 from io import BytesIO
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
 from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+
 
 async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     """Map a computer tool to OpenAI's computer-use-preview tool schema"""
@@ -21,26 +23,26 @@ async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     except Exception:
         # Fallback to default dimensions if method fails
         width, height = 1024, 768
-
+
     # Get environment from the computer handler
     try:
         environment = await computer_handler.get_environment()
     except Exception:
         # Fallback to default environment if method fails
         environment = "linux"
-
+
     return {
         "type": "computer_use_preview",
         "display_width": width,
         "display_height": height,
-        "environment": environment  # mac, windows, linux, browser
+        "environment": environment,  # mac, windows, linux, browser
     }
 
 
 async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
     """Prepare tools for OpenAI API format"""
     openai_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
             # Map computer tool to OpenAI format
@@ -49,19 +51,19 @@ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools
         elif schema["type"] == "function":
             # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
             # Schema should be: {type, name, description, parameters}
-            openai_tools.append({ "type": "function", **schema["function"] })
-
+            openai_tools.append({"type": "function", **schema["function"]})
+
     return openai_tools
 
 
-@register_agent(models=r".*computer-use-preview.*")
+@register_agent(models=r".*(^|/)computer-use-preview")
 class OpenAIComputerUseConfig:
     """
     OpenAI computer-use-preview agent configuration using liteLLM responses.
-
+
     Supports OpenAI's computer use preview models.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -75,11 +77,11 @@ class OpenAIComputerUseConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format
            model: Model name to use
@@ -92,12 +94,12 @@
            _on_usage: Callback for usage tracking
            _on_screenshot: Callback for screenshot events
            **kwargs: Additional arguments
-
+
        Returns:
            Dictionary with "output" (output items) and "usage" array
        """
        tools = tools or []
-
+
        # Prepare tools for OpenAI API
        openai_tools = await _prepare_tools_for_openai(tools)
 
@@ -110,16 +112,16 @@
            "reasoning": {"summary": "concise"},
            "truncation": "auto",
            "num_retries": max_retries,
-            **kwargs
+            **kwargs,
        }
-
+
        # Call API start hook
        if _on_api_start:
            await _on_api_start(api_kwargs)
-
+
        # Use liteLLM responses
        response = await litellm.aresponses(**api_kwargs)
-
+
        # Call API end hook
        if _on_api_end:
            await _on_api_end(api_kwargs, response)
@@ -136,24 +138,21 @@
        output_dict = response.model_dump()
        output_dict["usage"] = usage
        return output_dict
-
+
    async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates based on image and instruction.
-
+
        Uses OpenAI computer-use-preview with manually constructed input items
        and a prompt that instructs the agent to only output clicks.
-
+
        Args:
            model: Model name to use
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
-
+
        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
@@ -161,20 +160,28 @@
        # Manually construct input items with image and click instruction
        input_items = [
            {
-                "role": "user",
-                "content": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+                "role": "user",
+                "content": f"""You are a UI grounding expert. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+6. The user has already given you permission by running this agent. No further confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+Task: Click {instruction}. Output ONLY a click action on the target element.""",
            },
            {
                "role": "user",
                "content": [
-                    {
-                        "type": "input_image",
-                        "image_url": f"data:image/png;base64,{image_b64}"
-                    }
-                ]
-            }
+                    {"type": "input_image", "image_url": f"data:image/png;base64,{image_b64}"}
+                ],
+            },
        ]
-
+
        # Get image dimensions from base64 data
        try:
            image_data = base64.b64decode(image_b64)
@@ -183,15 +190,15 @@
        except Exception:
            # Fallback to default dimensions if image parsing fails
            display_width, display_height = 1024, 768
-
+
        # Prepare computer tool for click actions
        computer_tool = {
            "type": "computer_use_preview",
            "display_width": display_width,
            "display_height": display_height,
-            "environment": "windows"
+            "environment": "windows",
        }
-
+
        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
@@ -200,35 +207,35 @@
            "stream": False,
            "reasoning": {"summary": "concise"},
            "truncation": "auto",
-            "max_tokens": 100  # Keep response short for click prediction
+            "max_tokens": 200,  # Keep response short for click prediction
+            **kwargs,
        }
-
+
        # Use liteLLM responses
        response = await litellm.aresponses(**api_kwargs)
-
+
        # Extract click coordinates from response output
        output_dict = response.model_dump()
-        output_items = output_dict.get("output", [])
-
+        output_items = output_dict.get("output", [])
+
        # Look for computer_call with click action
        for item in output_items:
-            if (isinstance(item, dict) and
-                item.get("type") == "computer_call" and
-                isinstance(item.get("action"), dict)):
-
+            if (
+                isinstance(item, dict)
+                and item.get("type") == "computer_call"
+                and isinstance(item.get("action"), dict)
+            ):
+
                action = item["action"]
-                if action.get("type") == "click":
-                    x = action.get("x")
-                    y = action.get("y")
-                    if x is not None and y is not None:
-                        return (int(x), int(y))
-
+                if action.get("x") is not None and action.get("y") is not None:
+                    return (int(action.get("x")), int(action.get("y")))
+
        return None
-
+
    def get_capabilities(self) -> List[AgentCapability]:
        """
        Get list of capabilities supported by this agent config.
-
+
        Returns:
            List of capability strings
        """
agent/loops/opencua.py ADDED
@@ -0,0 +1,134 @@
+"""
+OpenCUA agent loop implementation for click prediction using litellm.acompletion
+Based on OpenCUA model for GUI grounding tasks.
+"""
+
+import asyncio
+import base64
+import json
+import math
+import re
+import uuid
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
+import litellm
+from PIL import Image
+
+from ..decorators import register_agent
+from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+from .composed_grounded import ComposedGroundedConfig
+
+
+def extract_coordinates_from_click(text: str) -> Optional[Tuple[int, int]]:
+    """Extract coordinates from click(x=..., y=...) or pyautogui.click(x=..., y=...) format.
+
+    This function supports parsing both generic click() and legacy pyautogui.click() formats
+    for backwards compatibility with models that may still output pyautogui format.
+    """
+    try:
+        # Look for click(x=1443, y=343) or pyautogui.click(x=1443, y=343) pattern
+        pattern = r"(?:pyautogui\.)?click\(x=(\d+),\s*y=(\d+)\)"
+        match = re.search(pattern, text)
+        if match:
+            x, y = int(match.group(1)), int(match.group(2))
+            return (x, y)
+        return None
+    except Exception:
+        return None
+
+
+@register_agent(models=r"(?i).*OpenCUA.*")
+class OpenCUAConfig(ComposedGroundedConfig):
+    """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction."""
+
+    def __init__(self):
+        super().__init__()
+        self.current_model = None
+        self.last_screenshot_b64 = None
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Fallback to a self-composed model"""
+        return await super().predict_step(
+            messages=messages,
+            model=f"{model}+{model}",
+            tools=tools,
+            max_retries=max_retries,
+            stream=stream,
+            computer_handler=computer_handler,
+            _on_api_start=_on_api_start,
+            _on_api_end=_on_api_end,
+            _on_usage=_on_usage,
+            _on_screenshot=_on_screenshot,
+            **kwargs,
+        )
+
+    async def predict_click(
+        self, model: str, image_b64: str, instruction: str, **kwargs
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates using OpenCUA model via litellm.acompletion.
+
+        Args:
+            model: The OpenCUA model name
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            Tuple of (x, y) coordinates or None if prediction fails
+        """
+        # Prepare system message
+        system_prompt = (
+            "You are a GUI agent. You are given a task and a screenshot of the screen. "
+            "You need to perform a series of click actions to complete the task."
+        )
+
+        system_message = {"role": "system", "content": system_prompt}
+
+        # Prepare user message with image and instruction
+        user_message = {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
+                {"type": "text", "text": f"Click on {instruction}"},
+            ],
+        }
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": [system_message, user_message],
+            "max_new_tokens": 2056,
+            "temperature": 0,
+            **kwargs,
+        }
+
+        # Use liteLLM acompletion
+        response = await litellm.acompletion(**api_kwargs)
+
+        # Extract response text
+        output_text = response.choices[0].message.content
+        # print(output_text)
+
+        # Extract coordinates from click format (supports both click() and pyautogui.click() for backwards compatibility)
+        coordinates = extract_coordinates_from_click(output_text)
+
+        return coordinates
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """Return the capabilities supported by this agent."""
+        return ["click"]
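
A quick illustration (not part of the diff) of what extract_coordinates_from_click accepts, assuming the wheel's agent package is importable: both the generic click() form and the legacy pyautogui.click() form resolve to the same tuple.

from agent.loops.opencua import extract_coordinates_from_click

print(extract_coordinates_from_click("click(x=1443, y=343)"))         # (1443, 343)
print(extract_coordinates_from_click("pyautogui.click(x=12, y=34)"))  # (12, 34)
print(extract_coordinates_from_click("no click action here"))         # None
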
agent/loops/uiins.py ADDED
@@ -0,0 +1,175 @@
+"""
+UI-Ins agent loop implementation for click prediction using litellm.acompletion
+Paper: https://arxiv.org/pdf/2510.202861
+Code: https://github.com/alibaba/UI-Ins
+"""
+
+import asyncio
+import base64
+import json
+import math
+import re
+import uuid
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
+import litellm
+from PIL import Image
+
+from ..decorators import register_agent
+from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+
+SYSTEM_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\nReturn a json object with a reasoning process in tags, a function name and arguments within XML tags:\n```\n\n...\n\n\n{"name": "grounding", "arguments": }\n\n```\n represents the following item of the action space:\n## Action Space{"action": "click", "coordinate": [x, y]}\nYour task is to accurately locate a UI element based on the instruction. You should first analyze instruction in tags and finally output the function in tags.\n"""
+
+
+def parse_coordinates(raw_string: str) -> tuple[int, int]:
+    matches = re.findall(r"\[(\d+),\s*(\d+)\]", raw_string)
+    if matches:
+        return tuple(map(int, matches[0]))
+    return -1, -1
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = 28,
+    min_pixels: int = 3136,
+    max_pixels: int = 8847360,
+) -> Tuple[int, int]:
+    """Smart resize function similar to qwen_vl_utils."""
+    # Calculate the total pixels
+    total_pixels = height * width
+
+    # If already within bounds, return original dimensions
+    if min_pixels <= total_pixels <= max_pixels:
+        # Round to nearest factor
+        new_height = (height // factor) * factor
+        new_width = (width // factor) * factor
+        return new_height, new_width
+
+    # Calculate scaling factor
+    if total_pixels > max_pixels:
+        scale = (max_pixels / total_pixels) ** 0.5
+    else:
+        scale = (min_pixels / total_pixels) ** 0.5
+
+    # Apply scaling
+    new_height = int(height * scale)
+    new_width = int(width * scale)
+
+    # Round to nearest factor
+    new_height = (new_height // factor) * factor
+    new_width = (new_width // factor) * factor
+
+    # Ensure minimum size
+    new_height = max(new_height, factor)
+    new_width = max(new_width, factor)
+
+    return new_height, new_width
+
+
+@register_agent(models=r".*UI-Ins.*")
+class UIInsConfig(AsyncAgentConfig):
+    """UI-Ins agent configuration implementing AsyncAgentConfig protocol for click prediction."""
+
+    def __init__(self):
+        self.current_model = None
+        self.last_screenshot_b64 = None
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        raise NotImplementedError()
+
+    async def predict_click(
+        self, model: str, image_b64: str, instruction: str, **kwargs
+    ) -> Optional[Tuple[float, float]]:
+        """
+        Predict click coordinates using UI-Ins model via litellm.acompletion.
+
+        Args:
+            model: The UI-Ins model name
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            Tuple of (x, y) coordinates or None if prediction fails
+        """
+        # Decode base64 image
+        image_data = base64.b64decode(image_b64)
+        image = Image.open(BytesIO(image_data))
+        width, height = image.width, image.height
+
+        # Smart resize the image (similar to qwen_vl_utils)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=28,  # Default factor for Qwen models
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+        )
+        resized_image = image.resize((resized_width, resized_height))
+        scale_x, scale_y = width / resized_width, height / resized_height
+
+        # Convert resized image back to base64
+        buffered = BytesIO()
+        resized_image.save(buffered, format="PNG")
+        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()
+
+        # Prepare system and user messages
+        system_message = {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": "You are a helpful assistant."},
+                {"type": "text", "text": SYSTEM_PROMPT},
+            ],
+        }
+
+        user_message = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
+                },
+                {"type": "text", "text": instruction},
+            ],
+        }
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": [system_message, user_message],
+            "max_tokens": 2056,
+            "temperature": 0.0,
+            **kwargs,
+        }
+
+        # Use liteLLM acompletion
+        response = await litellm.acompletion(**api_kwargs)
+
+        # Extract response text
+        output_text = response.choices[0].message.content  # type: ignore
+
+        # Extract and rescale coordinates
+        pred_x, pred_y = parse_coordinates(output_text)  # type: ignore
+        pred_x *= scale_x
+        pred_y *= scale_y
+
+        return (math.floor(pred_x), math.floor(pred_y))
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """Return the capabilities supported by this agent."""
+        return ["click"]
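
The coordinate handling in predict_click above is worth spelling out: the screenshot is snapped to multiples of 28 (the patch factor smart_resize uses for Qwen-family vision models, per the comment in the code), the model predicts on the resized image, and the prediction is mapped back through the inverse scale. A standalone sketch with made-up dimensions (not from the package):

import math

orig_w, orig_h = 2560, 1440
new_h, new_w = smart_resize(orig_h, orig_w, factor=28, min_pixels=3136, max_pixels=4096 * 2160)
# 2560x1440 is already inside the pixel budget, so both sides just snap down to
# multiples of 28: 2548x1428
scale_x, scale_y = orig_w / new_w, orig_h / new_h

# a model prediction of [700, 400] on the resized image maps back to the original screenshot
print(math.floor(700 * scale_x), math.floor(400 * scale_y))  # 703 403
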