cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (61) hide show
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +32 -19
  24. agent/computers/cua.py +33 -25
  25. agent/computers/custom.py +78 -71
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +215 -210
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +510 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.36.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/loops/omniparser.py CHANGED
@@ -5,100 +5,102 @@ Code: https://github.com/microsoft/OmniParser
5
5
  """
6
6
 
7
7
  import asyncio
8
+ import base64
9
+ import inspect
8
10
  import json
9
- from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
11
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
12
+
10
13
  import litellm
11
- import inspect
12
- import base64
13
14
 
14
15
  from ..decorators import register_agent
15
- from ..types import Messages, AgentResponse, Tools, AgentCapability
16
16
  from ..loops.base import AsyncAgentConfig
17
+ from ..types import AgentCapability, AgentResponse, Messages, Tools
17
18
 
18
19
  SOM_TOOL_SCHEMA = {
19
- "type": "function",
20
- "name": "computer",
21
- "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
22
- "parameters": {
23
- "type": "object",
24
- "properties": {
25
- "action": {
26
- "type": "string",
27
- "enum": [
28
- "screenshot",
29
- "click",
30
- "double_click",
31
- "drag",
32
- "type",
33
- "keypress",
34
- "scroll",
35
- "move",
36
- "wait",
37
- "get_current_url",
38
- "get_dimensions",
39
- "get_environment"
40
- ],
41
- "description": "The action to perform"
42
- },
43
- "element_id": {
44
- "type": "integer",
45
- "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
46
- },
47
- "start_element_id": {
48
- "type": "integer",
49
- "description": "The ID of the element to start dragging from (required for drag action)"
50
- },
51
- "end_element_id": {
52
- "type": "integer",
53
- "description": "The ID of the element to drag to (required for drag action)"
54
- },
55
- "text": {
56
- "type": "string",
57
- "description": "The text to type (required for type action)"
58
- },
59
- "keys": {
60
- "type": "string",
61
- "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
62
- },
63
- "button": {
64
- "type": "string",
65
- "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
66
- },
67
- "scroll_x": {
68
- "type": "integer",
69
- "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
70
- },
71
- "scroll_y": {
72
- "type": "integer",
73
- "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
74
- },
20
+ "type": "function",
21
+ "name": "computer",
22
+ "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
23
+ "parameters": {
24
+ "type": "object",
25
+ "properties": {
26
+ "action": {
27
+ "type": "string",
28
+ "enum": [
29
+ "screenshot",
30
+ "click",
31
+ "double_click",
32
+ "drag",
33
+ "type",
34
+ "keypress",
35
+ "scroll",
36
+ "move",
37
+ "wait",
38
+ "get_current_url",
39
+ "get_dimensions",
40
+ "get_environment",
41
+ ],
42
+ "description": "The action to perform",
43
+ },
44
+ "element_id": {
45
+ "type": "integer",
46
+ "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
47
+ },
48
+ "start_element_id": {
49
+ "type": "integer",
50
+ "description": "The ID of the element to start dragging from (required for drag action)",
51
+ },
52
+ "end_element_id": {
53
+ "type": "integer",
54
+ "description": "The ID of the element to drag to (required for drag action)",
55
+ },
56
+ "text": {
57
+ "type": "string",
58
+ "description": "The text to type (required for type action)",
59
+ },
60
+ "keys": {
61
+ "type": "string",
62
+ "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
63
+ },
64
+ "button": {
65
+ "type": "string",
66
+ "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
67
+ },
68
+ "scroll_x": {
69
+ "type": "integer",
70
+ "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
71
+ },
72
+ "scroll_y": {
73
+ "type": "integer",
74
+ "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
75
+ },
76
+ },
77
+ "required": ["action"],
75
78
  },
76
- "required": [
77
- "action"
78
- ]
79
- }
80
79
  }
81
80
 
82
81
  OMNIPARSER_AVAILABLE = False
83
82
  try:
84
83
  from som import OmniParser
84
+
85
85
  OMNIPARSER_AVAILABLE = True
86
86
  except ImportError:
87
87
  pass
88
88
  OMNIPARSER_SINGLETON = None
89
89
 
90
+
90
91
  def get_parser():
91
92
  global OMNIPARSER_SINGLETON
92
93
  if OMNIPARSER_SINGLETON is None:
93
94
  OMNIPARSER_SINGLETON = OmniParser()
94
95
  return OMNIPARSER_SINGLETON
95
-
96
+
97
+
96
98
  def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
97
99
  """Get the last computer_call_output message from a messages list.
98
-
100
+
99
101
  Args:
100
102
  messages: List of messages to search through
101
-
103
+
102
104
  Returns:
103
105
  The last computer_call_output message dict, or None if not found
104
106
  """
@@ -107,11 +109,12 @@ def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Di
107
109
  return message
108
110
  return None
109
111
 
112
+
110
113
  def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
111
114
  """Prepare tools for OpenAI API format"""
112
115
  omniparser_tools = []
113
116
  id2xy = dict()
114
-
117
+
115
118
  for schema in tool_schemas:
116
119
  if schema["type"] == "computer":
117
120
  omniparser_tools.append(SOM_TOOL_SCHEMA)
@@ -122,72 +125,80 @@ def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[T
122
125
  elif schema["type"] == "function":
123
126
  # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
124
127
  # Schema should be: {type, name, description, parameters}
125
- omniparser_tools.append({ "type": "function", **schema["function"] })
126
-
128
+ omniparser_tools.append({"type": "function", **schema["function"]})
129
+
127
130
  return omniparser_tools, id2xy
128
131
 
129
- async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
130
- item_type = item.get("type")
131
-
132
- def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
133
- if element_id is None:
134
- return (None, None)
135
- return id2xy.get(element_id, (None, None))
136
-
137
- if item_type == "function_call":
138
- fn_name = item.get("name")
139
- fn_args = json.loads(item.get("arguments", "{}"))
140
-
141
- item_id = item.get("id")
142
- call_id = item.get("call_id")
143
-
144
- if fn_name == "computer":
145
- action = fn_args.get("action")
146
- element_id = fn_args.get("element_id")
147
- start_element_id = fn_args.get("start_element_id")
148
- end_element_id = fn_args.get("end_element_id")
149
- text = fn_args.get("text")
150
- keys = fn_args.get("keys")
151
- button = fn_args.get("button")
152
- scroll_x = fn_args.get("scroll_x")
153
- scroll_y = fn_args.get("scroll_y")
154
-
155
- x, y = _get_xy(element_id)
156
- start_x, start_y = _get_xy(start_element_id)
157
- end_x, end_y = _get_xy(end_element_id)
158
-
159
- action_args = {
160
- "type": action,
161
- "x": x,
162
- "y": y,
163
- "start_x": start_x,
164
- "start_y": start_y,
165
- "end_x": end_x,
166
- "end_y": end_y,
167
- "text": text,
168
- "keys": keys,
169
- "button": button,
170
- "scroll_x": scroll_x,
171
- "scroll_y": scroll_y
172
- }
173
- # Remove None values to keep the JSON clean
174
- action_args = {k: v for k, v in action_args.items() if v is not None}
175
132
 
176
- return [{
177
- "type": "computer_call",
178
- "action": action_args,
179
- "id": item_id,
180
- "call_id": call_id,
181
- "status": "completed"
182
- }]
133
+ async def replace_function_with_computer_call(
134
+ item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]
135
+ ):
136
+ item_type = item.get("type")
137
+
138
+ def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
139
+ if element_id is None:
140
+ return (None, None)
141
+ return id2xy.get(element_id, (None, None))
142
+
143
+ if item_type == "function_call":
144
+ fn_name = item.get("name")
145
+ fn_args = json.loads(item.get("arguments", "{}"))
146
+
147
+ item_id = item.get("id")
148
+ call_id = item.get("call_id")
183
149
 
184
- return [item]
150
+ if fn_name == "computer":
151
+ action = fn_args.get("action")
152
+ element_id = fn_args.get("element_id")
153
+ start_element_id = fn_args.get("start_element_id")
154
+ end_element_id = fn_args.get("end_element_id")
155
+ text = fn_args.get("text")
156
+ keys = fn_args.get("keys")
157
+ button = fn_args.get("button")
158
+ scroll_x = fn_args.get("scroll_x")
159
+ scroll_y = fn_args.get("scroll_y")
185
160
 
186
- async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]):
161
+ x, y = _get_xy(element_id)
162
+ start_x, start_y = _get_xy(start_element_id)
163
+ end_x, end_y = _get_xy(end_element_id)
164
+
165
+ action_args = {
166
+ "type": action,
167
+ "x": x,
168
+ "y": y,
169
+ "start_x": start_x,
170
+ "start_y": start_y,
171
+ "end_x": end_x,
172
+ "end_y": end_y,
173
+ "text": text,
174
+ "keys": keys,
175
+ "button": button,
176
+ "scroll_x": scroll_x,
177
+ "scroll_y": scroll_y,
178
+ }
179
+ # Remove None values to keep the JSON clean
180
+ action_args = {k: v for k, v in action_args.items() if v is not None}
181
+
182
+ return [
183
+ {
184
+ "type": "computer_call",
185
+ "action": action_args,
186
+ "id": item_id,
187
+ "call_id": call_id,
188
+ "status": "completed",
189
+ }
190
+ ]
191
+
192
+ return [item]
193
+
194
+
195
+ async def replace_computer_call_with_function(
196
+ item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]
197
+ ):
187
198
  """
188
199
  Convert computer_call back to function_call format.
189
200
  Also handles computer_call_output -> function_call_output conversion.
190
-
201
+
191
202
  Args:
192
203
  item: The item to convert
193
204
  xy2id: Mapping from (x, y) coordinates to element IDs
@@ -202,12 +213,12 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
202
213
 
203
214
  if item_type == "computer_call":
204
215
  action_data = item.get("action", {})
205
-
216
+
206
217
  # Extract coordinates and convert back to element IDs
207
218
  element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
208
219
  start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
209
220
  end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
210
-
221
+
211
222
  # Build function arguments
212
223
  fn_args = {
213
224
  "action": action_data.get("type"),
@@ -218,33 +229,38 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
218
229
  "keys": action_data.get("keys"),
219
230
  "button": action_data.get("button"),
220
231
  "scroll_x": action_data.get("scroll_x"),
221
- "scroll_y": action_data.get("scroll_y")
232
+ "scroll_y": action_data.get("scroll_y"),
222
233
  }
223
-
234
+
224
235
  # Remove None values to keep the JSON clean
225
236
  fn_args = {k: v for k, v in fn_args.items() if v is not None}
226
-
227
- return [{
228
- "type": "function_call",
229
- "name": "computer",
230
- "arguments": json.dumps(fn_args),
231
- "id": item.get("id"),
232
- "call_id": item.get("call_id"),
233
- "status": "completed",
234
-
235
- # Fall back to string representation
236
- "content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})"
237
- }]
238
-
237
+
238
+ return [
239
+ {
240
+ "type": "function_call",
241
+ "name": "computer",
242
+ "arguments": json.dumps(fn_args),
243
+ "id": item.get("id"),
244
+ "call_id": item.get("call_id"),
245
+ "status": "completed",
246
+ }
247
+ ]
248
+
239
249
  elif item_type == "computer_call_output":
240
- # Simple conversion: computer_call_output -> function_call_output
241
- return [{
242
- "type": "function_call_output",
243
- "call_id": item.get("call_id"),
244
- "content": [item.get("output")],
245
- "id": item.get("id"),
246
- "status": "completed"
247
- }]
250
+ output = item.get("output")
251
+
252
+ if isinstance(output, dict):
253
+ output = [output]
254
+
255
+ return [
256
+ {
257
+ "type": "function_call_output",
258
+ "call_id": item.get("call_id"),
259
+ "output": output,
260
+ "id": item.get("id"),
261
+ "status": "completed",
262
+ }
263
+ ]
248
264
 
249
265
  return [item]
250
266
 
@@ -252,7 +268,7 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
252
268
  @register_agent(models=r"omniparser\+.*|omni\+.*", priority=2)
253
269
  class OmniparserConfig(AsyncAgentConfig):
254
270
  """Omniparser agent configuration implementing AsyncAgentConfig protocol."""
255
-
271
+
256
272
  async def predict_step(
257
273
  self,
258
274
  messages: List[Dict[str, Any]],
@@ -266,25 +282,27 @@ class OmniparserConfig(AsyncAgentConfig):
266
282
  _on_api_end=None,
267
283
  _on_usage=None,
268
284
  _on_screenshot=None,
269
- **kwargs
285
+ **kwargs,
270
286
  ) -> Dict[str, Any]:
271
287
  """
272
288
  OpenAI computer-use-preview agent loop using liteLLM responses.
273
-
289
+
274
290
  Supports OpenAI's computer use preview models.
275
291
  """
276
292
  if not OMNIPARSER_AVAILABLE:
277
- raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
278
-
293
+ raise ValueError(
294
+ "omniparser loop requires som to be installed. Install it with `pip install cua-som`."
295
+ )
296
+
279
297
  tools = tools or []
280
-
281
- llm_model = model.split('+')[-1]
298
+
299
+ llm_model = model.split("+")[-1]
282
300
 
283
301
  # Prepare tools for OpenAI API
284
302
  openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
285
303
 
286
304
  # Find last computer_call_output
287
- last_computer_call_output = get_last_computer_call_output(messages) # type: ignore
305
+ last_computer_call_output = get_last_computer_call_output(messages) # type: ignore
288
306
  if last_computer_call_output:
289
307
  image_url = last_computer_call_output.get("output", {}).get("image_url", "")
290
308
  image_data = image_url.split(",")[-1]
@@ -294,14 +312,17 @@ class OmniparserConfig(AsyncAgentConfig):
294
312
  if _on_screenshot:
295
313
  await _on_screenshot(result.annotated_image_base64, "annotated_image")
296
314
  for element in result.elements:
297
- id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
298
-
315
+ id2xy[element.id] = (
316
+ (element.bbox.x1 + element.bbox.x2) / 2,
317
+ (element.bbox.y1 + element.bbox.y2) / 2,
318
+ )
319
+
299
320
  # handle computer calls -> function calls
300
321
  new_messages = []
301
322
  for message in messages:
302
323
  if not isinstance(message, dict):
303
324
  message = message.__dict__
304
- new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
325
+ new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
305
326
  messages = new_messages
306
327
 
307
328
  # Prepare API call kwargs
@@ -312,13 +333,13 @@ class OmniparserConfig(AsyncAgentConfig):
312
333
  "stream": stream,
313
334
  "truncation": "auto",
314
335
  "num_retries": max_retries,
315
- **kwargs
336
+ **kwargs,
316
337
  }
317
-
338
+
318
339
  # Call API start hook
319
340
  if _on_api_start:
320
341
  await _on_api_start(api_kwargs)
321
-
342
+
322
343
  print(str(api_kwargs)[:1000])
323
344
 
324
345
  # Use liteLLM responses
@@ -330,60 +351,50 @@ class OmniparserConfig(AsyncAgentConfig):
330
351
 
331
352
  # Extract usage information
332
353
  usage = {
333
- **response.usage.model_dump(), # type: ignore
334
- "response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
354
+ **response.usage.model_dump(), # type: ignore
355
+ "response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
335
356
  }
336
357
  if _on_usage:
337
358
  await _on_usage(usage)
338
359
 
339
360
  # handle som function calls -> xy computer calls
340
361
  new_output = []
341
- for i in range(len(response.output)): # type: ignore
342
- new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
343
-
344
- return {
345
- "output": new_output,
346
- "usage": usage
347
- }
348
-
362
+ for i in range(len(response.output)): # type: ignore
363
+ new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
364
+
365
+ return {"output": new_output, "usage": usage}
366
+
349
367
  async def predict_click(
350
- self,
351
- model: str,
352
- image_b64: str,
353
- instruction: str,
354
- **kwargs
368
+ self, model: str, image_b64: str, instruction: str, **kwargs
355
369
  ) -> Optional[Tuple[float, float]]:
356
370
  """
357
371
  Predict click coordinates using OmniParser and LLM.
358
-
372
+
359
373
  Uses OmniParser to annotate the image with element IDs, then uses LLM
360
374
  to identify the correct element ID based on the instruction.
361
375
  """
362
376
  if not OMNIPARSER_AVAILABLE:
363
377
  return None
364
-
378
+
365
379
  # Parse the image with OmniParser to get annotated image and elements
366
380
  parser = get_parser()
367
381
  result = parser.parse(image_b64)
368
-
382
+
369
383
  # Extract the LLM model from composed model string
370
- llm_model = model.split('+')[-1]
371
-
384
+ llm_model = model.split("+")[-1]
385
+
372
386
  # Create system prompt for element ID prediction
373
- SYSTEM_PROMPT = f'''
387
+ SYSTEM_PROMPT = """
374
388
  You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element.
375
389
 
376
390
  The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element.
377
391
 
378
392
  Output only the element ID as a single integer.
379
- '''.strip()
380
-
393
+ """.strip()
394
+
381
395
  # Prepare messages for LLM
382
396
  messages = [
383
- {
384
- "role": "system",
385
- "content": SYSTEM_PROMPT
386
- },
397
+ {"role": "system", "content": SYSTEM_PROMPT},
387
398
  {
388
399
  "role": "user",
389
400
  "content": [
@@ -391,31 +402,25 @@ Output only the element ID as a single integer.
391
402
  "type": "image_url",
392
403
  "image_url": {
393
404
  "url": f"data:image/png;base64,{result.annotated_image_base64}"
394
- }
405
+ },
395
406
  },
396
- {
397
- "type": "text",
398
- "text": f"Find the element: {instruction}"
399
- }
400
- ]
401
- }
407
+ {"type": "text", "text": f"Find the element: {instruction}"},
408
+ ],
409
+ },
402
410
  ]
403
-
411
+
404
412
  # Call LLM to predict element ID
405
413
  response = await litellm.acompletion(
406
- model=llm_model,
407
- messages=messages,
408
- max_tokens=10,
409
- temperature=0.1
414
+ model=llm_model, messages=messages, max_tokens=10, temperature=0.1
410
415
  )
411
-
416
+
412
417
  # Extract element ID from response
413
- response_text = response.choices[0].message.content.strip() # type: ignore
414
-
418
+ response_text = response.choices[0].message.content.strip() # type: ignore
419
+
415
420
  # Try to parse the element ID
416
421
  try:
417
422
  element_id = int(response_text)
418
-
423
+
419
424
  # Find the element with this ID and return its center coordinates
420
425
  for element in result.elements:
421
426
  if element.id == element_id:
@@ -425,9 +430,9 @@ Output only the element ID as a single integer.
425
430
  except ValueError:
426
431
  # If we can't parse the ID, return None
427
432
  pass
428
-
433
+
429
434
  return None
430
-
435
+
431
436
  def get_capabilities(self) -> List[AgentCapability]:
432
437
  """Return the capabilities supported by this agent."""
433
438
  return ["step"]