cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/omniparser.py CHANGED
@@ -5,100 +5,102 @@ Code: https://github.com/microsoft/OmniParser
5
5
  """
6
6
 
7
7
  import asyncio
8
+ import base64
9
+ import inspect
8
10
  import json
9
- from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
11
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
12
+
10
13
  import litellm
11
- import inspect
12
- import base64
13
14
 
14
15
  from ..decorators import register_agent
15
- from ..types import Messages, AgentResponse, Tools, AgentCapability
16
16
  from ..loops.base import AsyncAgentConfig
17
+ from ..types import AgentCapability, AgentResponse, Messages, Tools
17
18
 
18
19
  SOM_TOOL_SCHEMA = {
19
- "type": "function",
20
- "name": "computer",
21
- "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
22
- "parameters": {
23
- "type": "object",
24
- "properties": {
25
- "action": {
26
- "type": "string",
27
- "enum": [
28
- "screenshot",
29
- "click",
30
- "double_click",
31
- "drag",
32
- "type",
33
- "keypress",
34
- "scroll",
35
- "move",
36
- "wait",
37
- "get_current_url",
38
- "get_dimensions",
39
- "get_environment"
40
- ],
41
- "description": "The action to perform"
42
- },
43
- "element_id": {
44
- "type": "integer",
45
- "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
46
- },
47
- "start_element_id": {
48
- "type": "integer",
49
- "description": "The ID of the element to start dragging from (required for drag action)"
50
- },
51
- "end_element_id": {
52
- "type": "integer",
53
- "description": "The ID of the element to drag to (required for drag action)"
54
- },
55
- "text": {
56
- "type": "string",
57
- "description": "The text to type (required for type action)"
58
- },
59
- "keys": {
60
- "type": "string",
61
- "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
62
- },
63
- "button": {
64
- "type": "string",
65
- "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
66
- },
67
- "scroll_x": {
68
- "type": "integer",
69
- "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
70
- },
71
- "scroll_y": {
72
- "type": "integer",
73
- "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
74
- },
20
+ "type": "function",
21
+ "name": "computer",
22
+ "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
23
+ "parameters": {
24
+ "type": "object",
25
+ "properties": {
26
+ "action": {
27
+ "type": "string",
28
+ "enum": [
29
+ "screenshot",
30
+ "click",
31
+ "double_click",
32
+ "drag",
33
+ "type",
34
+ "keypress",
35
+ "scroll",
36
+ "move",
37
+ "wait",
38
+ "get_current_url",
39
+ "get_dimensions",
40
+ "get_environment",
41
+ ],
42
+ "description": "The action to perform",
43
+ },
44
+ "element_id": {
45
+ "type": "integer",
46
+ "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
47
+ },
48
+ "start_element_id": {
49
+ "type": "integer",
50
+ "description": "The ID of the element to start dragging from (required for drag action)",
51
+ },
52
+ "end_element_id": {
53
+ "type": "integer",
54
+ "description": "The ID of the element to drag to (required for drag action)",
55
+ },
56
+ "text": {
57
+ "type": "string",
58
+ "description": "The text to type (required for type action)",
59
+ },
60
+ "keys": {
61
+ "type": "string",
62
+ "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
63
+ },
64
+ "button": {
65
+ "type": "string",
66
+ "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
67
+ },
68
+ "scroll_x": {
69
+ "type": "integer",
70
+ "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
71
+ },
72
+ "scroll_y": {
73
+ "type": "integer",
74
+ "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
75
+ },
76
+ },
77
+ "required": ["action"],
75
78
  },
76
- "required": [
77
- "action"
78
- ]
79
- }
80
79
  }
81
80
 
82
81
  OMNIPARSER_AVAILABLE = False
83
82
  try:
84
83
  from som import OmniParser
84
+
85
85
  OMNIPARSER_AVAILABLE = True
86
86
  except ImportError:
87
87
  pass
88
88
  OMNIPARSER_SINGLETON = None
89
89
 
90
+
90
91
  def get_parser():
91
92
  global OMNIPARSER_SINGLETON
92
93
  if OMNIPARSER_SINGLETON is None:
93
94
  OMNIPARSER_SINGLETON = OmniParser()
94
95
  return OMNIPARSER_SINGLETON
95
-
96
+
97
+
96
98
  def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
97
99
  """Get the last computer_call_output message from a messages list.
98
-
100
+
99
101
  Args:
100
102
  messages: List of messages to search through
101
-
103
+
102
104
  Returns:
103
105
  The last computer_call_output message dict, or None if not found
104
106
  """
@@ -107,11 +109,12 @@ def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Di
107
109
  return message
108
110
  return None
109
111
 
112
+
110
113
  def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
111
114
  """Prepare tools for OpenAI API format"""
112
115
  omniparser_tools = []
113
116
  id2xy = dict()
114
-
117
+
115
118
  for schema in tool_schemas:
116
119
  if schema["type"] == "computer":
117
120
  omniparser_tools.append(SOM_TOOL_SCHEMA)
@@ -122,72 +125,80 @@ def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[T
122
125
  elif schema["type"] == "function":
123
126
  # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
124
127
  # Schema should be: {type, name, description, parameters}
125
- omniparser_tools.append({ "type": "function", **schema["function"] })
126
-
128
+ omniparser_tools.append({"type": "function", **schema["function"]})
129
+
127
130
  return omniparser_tools, id2xy
128
131
 
129
- async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
130
- item_type = item.get("type")
131
-
132
- def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
133
- if element_id is None:
134
- return (None, None)
135
- return id2xy.get(element_id, (None, None))
136
-
137
- if item_type == "function_call":
138
- fn_name = item.get("name")
139
- fn_args = json.loads(item.get("arguments", "{}"))
140
-
141
- item_id = item.get("id")
142
- call_id = item.get("call_id")
143
-
144
- if fn_name == "computer":
145
- action = fn_args.get("action")
146
- element_id = fn_args.get("element_id")
147
- start_element_id = fn_args.get("start_element_id")
148
- end_element_id = fn_args.get("end_element_id")
149
- text = fn_args.get("text")
150
- keys = fn_args.get("keys")
151
- button = fn_args.get("button")
152
- scroll_x = fn_args.get("scroll_x")
153
- scroll_y = fn_args.get("scroll_y")
154
-
155
- x, y = _get_xy(element_id)
156
- start_x, start_y = _get_xy(start_element_id)
157
- end_x, end_y = _get_xy(end_element_id)
158
-
159
- action_args = {
160
- "type": action,
161
- "x": x,
162
- "y": y,
163
- "start_x": start_x,
164
- "start_y": start_y,
165
- "end_x": end_x,
166
- "end_y": end_y,
167
- "text": text,
168
- "keys": keys,
169
- "button": button,
170
- "scroll_x": scroll_x,
171
- "scroll_y": scroll_y
172
- }
173
- # Remove None values to keep the JSON clean
174
- action_args = {k: v for k, v in action_args.items() if v is not None}
175
132
 
176
- return [{
177
- "type": "computer_call",
178
- "action": action_args,
179
- "id": item_id,
180
- "call_id": call_id,
181
- "status": "completed"
182
- }]
133
+ async def replace_function_with_computer_call(
134
+ item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]
135
+ ):
136
+ item_type = item.get("type")
137
+
138
+ def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
139
+ if element_id is None:
140
+ return (None, None)
141
+ return id2xy.get(element_id, (None, None))
142
+
143
+ if item_type == "function_call":
144
+ fn_name = item.get("name")
145
+ fn_args = json.loads(item.get("arguments", "{}"))
146
+
147
+ item_id = item.get("id")
148
+ call_id = item.get("call_id")
149
+
150
+ if fn_name == "computer":
151
+ action = fn_args.get("action")
152
+ element_id = fn_args.get("element_id")
153
+ start_element_id = fn_args.get("start_element_id")
154
+ end_element_id = fn_args.get("end_element_id")
155
+ text = fn_args.get("text")
156
+ keys = fn_args.get("keys")
157
+ button = fn_args.get("button")
158
+ scroll_x = fn_args.get("scroll_x")
159
+ scroll_y = fn_args.get("scroll_y")
183
160
 
184
- return [item]
161
+ x, y = _get_xy(element_id)
162
+ start_x, start_y = _get_xy(start_element_id)
163
+ end_x, end_y = _get_xy(end_element_id)
185
164
 
186
- async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]):
165
+ action_args = {
166
+ "type": action,
167
+ "x": x,
168
+ "y": y,
169
+ "start_x": start_x,
170
+ "start_y": start_y,
171
+ "end_x": end_x,
172
+ "end_y": end_y,
173
+ "text": text,
174
+ "keys": keys,
175
+ "button": button,
176
+ "scroll_x": scroll_x,
177
+ "scroll_y": scroll_y,
178
+ }
179
+ # Remove None values to keep the JSON clean
180
+ action_args = {k: v for k, v in action_args.items() if v is not None}
181
+
182
+ return [
183
+ {
184
+ "type": "computer_call",
185
+ "action": action_args,
186
+ "id": item_id,
187
+ "call_id": call_id,
188
+ "status": "completed",
189
+ }
190
+ ]
191
+
192
+ return [item]
193
+
194
+
195
+ async def replace_computer_call_with_function(
196
+ item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]
197
+ ):
187
198
  """
188
199
  Convert computer_call back to function_call format.
189
200
  Also handles computer_call_output -> function_call_output conversion.
190
-
201
+
191
202
  Args:
192
203
  item: The item to convert
193
204
  xy2id: Mapping from (x, y) coordinates to element IDs
@@ -202,12 +213,12 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
202
213
 
203
214
  if item_type == "computer_call":
204
215
  action_data = item.get("action", {})
205
-
216
+
206
217
  # Extract coordinates and convert back to element IDs
207
218
  element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
208
219
  start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
209
220
  end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
210
-
221
+
211
222
  # Build function arguments
212
223
  fn_args = {
213
224
  "action": action_data.get("type"),
@@ -218,33 +229,36 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
218
229
  "keys": action_data.get("keys"),
219
230
  "button": action_data.get("button"),
220
231
  "scroll_x": action_data.get("scroll_x"),
221
- "scroll_y": action_data.get("scroll_y")
232
+ "scroll_y": action_data.get("scroll_y"),
222
233
  }
223
-
234
+
224
235
  # Remove None values to keep the JSON clean
225
236
  fn_args = {k: v for k, v in fn_args.items() if v is not None}
226
-
227
- return [{
228
- "type": "function_call",
229
- "name": "computer",
230
- "arguments": json.dumps(fn_args),
231
- "id": item.get("id"),
232
- "call_id": item.get("call_id"),
233
- "status": "completed",
234
-
235
- # Fall back to string representation
236
- "content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})"
237
- }]
238
-
237
+
238
+ return [
239
+ {
240
+ "type": "function_call",
241
+ "name": "computer",
242
+ "arguments": json.dumps(fn_args),
243
+ "id": item.get("id"),
244
+ "call_id": item.get("call_id"),
245
+ "status": "completed",
246
+ # Fall back to string representation
247
+ "content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})",
248
+ }
249
+ ]
250
+
239
251
  elif item_type == "computer_call_output":
240
252
  # Simple conversion: computer_call_output -> function_call_output
241
- return [{
242
- "type": "function_call_output",
243
- "call_id": item.get("call_id"),
244
- "content": [item.get("output")],
245
- "id": item.get("id"),
246
- "status": "completed"
247
- }]
253
+ return [
254
+ {
255
+ "type": "function_call_output",
256
+ "call_id": item.get("call_id"),
257
+ "content": [item.get("output")],
258
+ "id": item.get("id"),
259
+ "status": "completed",
260
+ }
261
+ ]
248
262
 
249
263
  return [item]
250
264
 
@@ -252,7 +266,7 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
252
266
  @register_agent(models=r"omniparser\+.*|omni\+.*", priority=2)
253
267
  class OmniparserConfig(AsyncAgentConfig):
254
268
  """Omniparser agent configuration implementing AsyncAgentConfig protocol."""
255
-
269
+
256
270
  async def predict_step(
257
271
  self,
258
272
  messages: List[Dict[str, Any]],
@@ -266,25 +280,27 @@ class OmniparserConfig(AsyncAgentConfig):
266
280
  _on_api_end=None,
267
281
  _on_usage=None,
268
282
  _on_screenshot=None,
269
- **kwargs
283
+ **kwargs,
270
284
  ) -> Dict[str, Any]:
271
285
  """
272
286
  OpenAI computer-use-preview agent loop using liteLLM responses.
273
-
287
+
274
288
  Supports OpenAI's computer use preview models.
275
289
  """
276
290
  if not OMNIPARSER_AVAILABLE:
277
- raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
278
-
291
+ raise ValueError(
292
+ "omniparser loop requires som to be installed. Install it with `pip install cua-som`."
293
+ )
294
+
279
295
  tools = tools or []
280
-
281
- llm_model = model.split('+')[-1]
296
+
297
+ llm_model = model.split("+")[-1]
282
298
 
283
299
  # Prepare tools for OpenAI API
284
300
  openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
285
301
 
286
302
  # Find last computer_call_output
287
- last_computer_call_output = get_last_computer_call_output(messages) # type: ignore
303
+ last_computer_call_output = get_last_computer_call_output(messages) # type: ignore
288
304
  if last_computer_call_output:
289
305
  image_url = last_computer_call_output.get("output", {}).get("image_url", "")
290
306
  image_data = image_url.split(",")[-1]
@@ -294,14 +310,17 @@ class OmniparserConfig(AsyncAgentConfig):
294
310
  if _on_screenshot:
295
311
  await _on_screenshot(result.annotated_image_base64, "annotated_image")
296
312
  for element in result.elements:
297
- id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
298
-
313
+ id2xy[element.id] = (
314
+ (element.bbox.x1 + element.bbox.x2) / 2,
315
+ (element.bbox.y1 + element.bbox.y2) / 2,
316
+ )
317
+
299
318
  # handle computer calls -> function calls
300
319
  new_messages = []
301
320
  for message in messages:
302
321
  if not isinstance(message, dict):
303
322
  message = message.__dict__
304
- new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
323
+ new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
305
324
  messages = new_messages
306
325
 
307
326
  # Prepare API call kwargs
@@ -312,13 +331,13 @@ class OmniparserConfig(AsyncAgentConfig):
312
331
  "stream": stream,
313
332
  "truncation": "auto",
314
333
  "num_retries": max_retries,
315
- **kwargs
334
+ **kwargs,
316
335
  }
317
-
336
+
318
337
  # Call API start hook
319
338
  if _on_api_start:
320
339
  await _on_api_start(api_kwargs)
321
-
340
+
322
341
  print(str(api_kwargs)[:1000])
323
342
 
324
343
  # Use liteLLM responses
@@ -330,60 +349,50 @@ class OmniparserConfig(AsyncAgentConfig):
330
349
 
331
350
  # Extract usage information
332
351
  usage = {
333
- **response.usage.model_dump(), # type: ignore
334
- "response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
352
+ **response.usage.model_dump(), # type: ignore
353
+ "response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
335
354
  }
336
355
  if _on_usage:
337
356
  await _on_usage(usage)
338
357
 
339
358
  # handle som function calls -> xy computer calls
340
359
  new_output = []
341
- for i in range(len(response.output)): # type: ignore
342
- new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
343
-
344
- return {
345
- "output": new_output,
346
- "usage": usage
347
- }
348
-
360
+ for i in range(len(response.output)): # type: ignore
361
+ new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
362
+
363
+ return {"output": new_output, "usage": usage}
364
+
349
365
  async def predict_click(
350
- self,
351
- model: str,
352
- image_b64: str,
353
- instruction: str,
354
- **kwargs
366
+ self, model: str, image_b64: str, instruction: str, **kwargs
355
367
  ) -> Optional[Tuple[float, float]]:
356
368
  """
357
369
  Predict click coordinates using OmniParser and LLM.
358
-
370
+
359
371
  Uses OmniParser to annotate the image with element IDs, then uses LLM
360
372
  to identify the correct element ID based on the instruction.
361
373
  """
362
374
  if not OMNIPARSER_AVAILABLE:
363
375
  return None
364
-
376
+
365
377
  # Parse the image with OmniParser to get annotated image and elements
366
378
  parser = get_parser()
367
379
  result = parser.parse(image_b64)
368
-
380
+
369
381
  # Extract the LLM model from composed model string
370
- llm_model = model.split('+')[-1]
371
-
382
+ llm_model = model.split("+")[-1]
383
+
372
384
  # Create system prompt for element ID prediction
373
- SYSTEM_PROMPT = f'''
385
+ SYSTEM_PROMPT = """
374
386
  You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element.
375
387
 
376
388
  The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element.
377
389
 
378
390
  Output only the element ID as a single integer.
379
- '''.strip()
380
-
391
+ """.strip()
392
+
381
393
  # Prepare messages for LLM
382
394
  messages = [
383
- {
384
- "role": "system",
385
- "content": SYSTEM_PROMPT
386
- },
395
+ {"role": "system", "content": SYSTEM_PROMPT},
387
396
  {
388
397
  "role": "user",
389
398
  "content": [
@@ -391,31 +400,25 @@ Output only the element ID as a single integer.
391
400
  "type": "image_url",
392
401
  "image_url": {
393
402
  "url": f"data:image/png;base64,{result.annotated_image_base64}"
394
- }
403
+ },
395
404
  },
396
- {
397
- "type": "text",
398
- "text": f"Find the element: {instruction}"
399
- }
400
- ]
401
- }
405
+ {"type": "text", "text": f"Find the element: {instruction}"},
406
+ ],
407
+ },
402
408
  ]
403
-
409
+
404
410
  # Call LLM to predict element ID
405
411
  response = await litellm.acompletion(
406
- model=llm_model,
407
- messages=messages,
408
- max_tokens=10,
409
- temperature=0.1
412
+ model=llm_model, messages=messages, max_tokens=10, temperature=0.1
410
413
  )
411
-
414
+
412
415
  # Extract element ID from response
413
- response_text = response.choices[0].message.content.strip() # type: ignore
414
-
416
+ response_text = response.choices[0].message.content.strip() # type: ignore
417
+
415
418
  # Try to parse the element ID
416
419
  try:
417
420
  element_id = int(response_text)
418
-
421
+
419
422
  # Find the element with this ID and return its center coordinates
420
423
  for element in result.elements:
421
424
  if element.id == element_id:
@@ -425,9 +428,9 @@ Output only the element ID as a single integer.
425
428
  except ValueError:
426
429
  # If we can't parse the ID, return None
427
430
  pass
428
-
431
+
429
432
  return None
430
-
433
+
431
434
  def get_capabilities(self) -> List[AgentCapability]:
432
435
  """Return the capabilities supported by this agent."""
433
436
  return ["step"]