cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent has been flagged as possibly problematic.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/loops/omniparser.py CHANGED
@@ -5,100 +5,108 @@ Code: https://github.com/microsoft/OmniParser
  """

  import asyncio
+ import base64
+ import inspect
  import json
- from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
  import litellm
- import inspect
- import base64

  from ..decorators import register_agent
- from ..types import Messages, AgentResponse, Tools, AgentCapability
  from ..loops.base import AsyncAgentConfig
+ from ..responses import (
+ convert_completion_messages_to_responses_items,
+ convert_responses_items_to_completion_messages,
+ )
+ from ..types import AgentCapability, AgentResponse, Messages, Tools

  SOM_TOOL_SCHEMA = {
- "type": "function",
- "name": "computer",
- "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
- "parameters": {
- "type": "object",
- "properties": {
- "action": {
- "type": "string",
- "enum": [
- "screenshot",
- "click",
- "double_click",
- "drag",
- "type",
- "keypress",
- "scroll",
- "move",
- "wait",
- "get_current_url",
- "get_dimensions",
- "get_environment"
- ],
- "description": "The action to perform"
- },
- "element_id": {
- "type": "integer",
- "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
- },
- "start_element_id": {
- "type": "integer",
- "description": "The ID of the element to start dragging from (required for drag action)"
- },
- "end_element_id": {
- "type": "integer",
- "description": "The ID of the element to drag to (required for drag action)"
- },
- "text": {
- "type": "string",
- "description": "The text to type (required for type action)"
- },
- "keys": {
- "type": "string",
- "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
- },
- "button": {
- "type": "string",
- "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
- },
- "scroll_x": {
- "type": "integer",
- "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
- },
- "scroll_y": {
- "type": "integer",
- "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
- },
+ "type": "function",
+ "function": {
+ "name": "computer",
+ "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "action": {
+ "type": "string",
+ "enum": [
+ "screenshot",
+ "click",
+ "double_click",
+ "drag",
+ "type",
+ "keypress",
+ "scroll",
+ "move",
+ "wait",
+ "get_current_url",
+ "get_dimensions",
+ "get_environment",
+ ],
+ "description": "The action to perform",
+ },
+ "element_id": {
+ "type": "integer",
+ "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
+ },
+ "start_element_id": {
+ "type": "integer",
+ "description": "The ID of the element to start dragging from (required for drag action)",
+ },
+ "end_element_id": {
+ "type": "integer",
+ "description": "The ID of the element to drag to (required for drag action)",
+ },
+ "text": {
+ "type": "string",
+ "description": "The text to type (required for type action)",
+ },
+ "keys": {
+ "type": "string",
+ "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
+ },
+ "button": {
+ "type": "string",
+ "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
+ },
+ "scroll_x": {
+ "type": "integer",
+ "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
+ },
+ "scroll_y": {
+ "type": "integer",
+ "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+ },
+ },
+ "required": ["action", "element_id"],
+ },
  },
- "required": [
- "action"
- ]
- }
  }

  OMNIPARSER_AVAILABLE = False
  try:
  from som import OmniParser
+
  OMNIPARSER_AVAILABLE = True
  except ImportError:
  pass
  OMNIPARSER_SINGLETON = None

+
  def get_parser():
  global OMNIPARSER_SINGLETON
  if OMNIPARSER_SINGLETON is None:
  OMNIPARSER_SINGLETON = OmniParser()
  return OMNIPARSER_SINGLETON
-
+
+
  def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
  """Get the last computer_call_output message from a messages list.
-
+
  Args:
  messages: List of messages to search through
-
+
  Returns:
  The last computer_call_output message dict, or None if not found
  """
@@ -107,11 +115,12 @@ def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Di
  return message
  return None

+
  def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
  """Prepare tools for OpenAI API format"""
  omniparser_tools = []
  id2xy = dict()
-
+
  for schema in tool_schemas:
  if schema["type"] == "computer":
  omniparser_tools.append(SOM_TOOL_SCHEMA)
@@ -122,72 +131,80 @@ def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[T
  elif schema["type"] == "function":
  # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
  # Schema should be: {type, name, description, parameters}
- omniparser_tools.append({ "type": "function", **schema["function"] })
-
+ omniparser_tools.append({"type": "function", **schema["function"]})
+
  return omniparser_tools, id2xy

- async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
- item_type = item.get("type")
-
- def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
- if element_id is None:
- return (None, None)
- return id2xy.get(element_id, (None, None))
-
- if item_type == "function_call":
- fn_name = item.get("name")
- fn_args = json.loads(item.get("arguments", "{}"))
-
- item_id = item.get("id")
- call_id = item.get("call_id")
-
- if fn_name == "computer":
- action = fn_args.get("action")
- element_id = fn_args.get("element_id")
- start_element_id = fn_args.get("start_element_id")
- end_element_id = fn_args.get("end_element_id")
- text = fn_args.get("text")
- keys = fn_args.get("keys")
- button = fn_args.get("button")
- scroll_x = fn_args.get("scroll_x")
- scroll_y = fn_args.get("scroll_y")
-
- x, y = _get_xy(element_id)
- start_x, start_y = _get_xy(start_element_id)
- end_x, end_y = _get_xy(end_element_id)
-
- action_args = {
- "type": action,
- "x": x,
- "y": y,
- "start_x": start_x,
- "start_y": start_y,
- "end_x": end_x,
- "end_y": end_y,
- "text": text,
- "keys": keys,
- "button": button,
- "scroll_x": scroll_x,
- "scroll_y": scroll_y
- }
- # Remove None values to keep the JSON clean
- action_args = {k: v for k, v in action_args.items() if v is not None}

- return [{
- "type": "computer_call",
- "action": action_args,
- "id": item_id,
- "call_id": call_id,
- "status": "completed"
- }]
+ async def replace_function_with_computer_call(
+ item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]
+ ):
+ item_type = item.get("type")
+
+ def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
+ if element_id is None:
+ return (None, None)
+ return id2xy.get(element_id, (None, None))
+
+ if item_type == "function_call":
+ fn_name = item.get("name")
+ fn_args = json.loads(item.get("arguments", "{}"))

- return [item]
+ item_id = item.get("id")
+ call_id = item.get("call_id")

- async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]):
+ if fn_name == "computer":
+ action = fn_args.get("action")
+ element_id = fn_args.get("element_id")
+ start_element_id = fn_args.get("start_element_id")
+ end_element_id = fn_args.get("end_element_id")
+ text = fn_args.get("text")
+ keys = fn_args.get("keys")
+ button = fn_args.get("button")
+ scroll_x = fn_args.get("scroll_x")
+ scroll_y = fn_args.get("scroll_y")
+
+ x, y = _get_xy(element_id)
+ start_x, start_y = _get_xy(start_element_id)
+ end_x, end_y = _get_xy(end_element_id)
+
+ action_args = {
+ "type": action,
+ "x": x,
+ "y": y,
+ "start_x": start_x,
+ "start_y": start_y,
+ "end_x": end_x,
+ "end_y": end_y,
+ "text": text,
+ "keys": keys,
+ "button": button,
+ "scroll_x": scroll_x,
+ "scroll_y": scroll_y,
+ }
+ # Remove None values to keep the JSON clean
+ action_args = {k: v for k, v in action_args.items() if v is not None}
+
+ return [
+ {
+ "type": "computer_call",
+ "action": action_args,
+ "id": item_id,
+ "call_id": call_id,
+ "status": "completed",
+ }
+ ]
+
+ return [item]
+
+
+ async def replace_computer_call_with_function(
+ item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]
+ ):
  """
  Convert computer_call back to function_call format.
  Also handles computer_call_output -> function_call_output conversion.
-
+
  Args:
  item: The item to convert
  xy2id: Mapping from (x, y) coordinates to element IDs
@@ -202,12 +219,12 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[

  if item_type == "computer_call":
  action_data = item.get("action", {})
-
+
  # Extract coordinates and convert back to element IDs
  element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
  start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
  end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
-
+
  # Build function arguments
  fn_args = {
  "action": action_data.get("type"),
@@ -218,33 +235,38 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
  "keys": action_data.get("keys"),
  "button": action_data.get("button"),
  "scroll_x": action_data.get("scroll_x"),
- "scroll_y": action_data.get("scroll_y")
+ "scroll_y": action_data.get("scroll_y"),
  }
-
+
  # Remove None values to keep the JSON clean
  fn_args = {k: v for k, v in fn_args.items() if v is not None}
-
- return [{
- "type": "function_call",
- "name": "computer",
- "arguments": json.dumps(fn_args),
- "id": item.get("id"),
- "call_id": item.get("call_id"),
- "status": "completed",
-
- # Fall back to string representation
- "content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})"
- }]
-
+
+ return [
+ {
+ "type": "function_call",
+ "name": "computer",
+ "arguments": json.dumps(fn_args),
+ "id": item.get("id"),
+ "call_id": item.get("call_id"),
+ "status": "completed",
+ }
+ ]
+
  elif item_type == "computer_call_output":
- # Simple conversion: computer_call_output -> function_call_output
- return [{
- "type": "function_call_output",
- "call_id": item.get("call_id"),
- "content": [item.get("output")],
- "id": item.get("id"),
- "status": "completed"
- }]
+ output = item.get("output")
+
+ if isinstance(output, dict):
+ output = [output]
+
+ return [
+ {
+ "type": "function_call_output",
+ "call_id": item.get("call_id"),
+ "output": item.get("output"),
+ "id": item.get("id"),
+ "status": "completed",
+ }
+ ]

  return [item]

@@ -252,7 +274,7 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
  @register_agent(models=r"omniparser\+.*|omni\+.*", priority=2)
  class OmniparserConfig(AsyncAgentConfig):
  """Omniparser agent configuration implementing AsyncAgentConfig protocol."""
-
+
  async def predict_step(
  self,
  messages: List[Dict[str, Any]],
@@ -266,63 +288,124 @@ class OmniparserConfig(AsyncAgentConfig):
  _on_api_end=None,
  _on_usage=None,
  _on_screenshot=None,
- **kwargs
+ **kwargs,
  ) -> Dict[str, Any]:
  """
  OpenAI computer-use-preview agent loop using liteLLM responses.
-
+
  Supports OpenAI's computer use preview models.
  """
  if not OMNIPARSER_AVAILABLE:
- raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
-
+ raise ValueError(
+ "omniparser loop requires som to be installed. Install it with `pip install cua-som`."
+ )
+
  tools = tools or []
-
- llm_model = model.split('+')[-1]
+
+ llm_model = model.split("+")[-1]
+
+ # Get screen dimensions from computer handler
+ try:
+ width, height = await computer_handler.get_dimensions()
+ except Exception:
+ # Fallback to default dimensions if method fails
+ width, height = 1024, 768

  # Prepare tools for OpenAI API
  openai_tools, id2xy = _prepare_tools_for_omniparser(tools)

- # Find last computer_call_output
- last_computer_call_output = get_last_computer_call_output(messages) # type: ignore
- if last_computer_call_output:
- image_url = last_computer_call_output.get("output", {}).get("image_url", "")
- image_data = image_url.split(",")[-1]
- if image_data:
- parser = get_parser()
+ # Build per-screenshot element mappings for historical consistency
+ screenshot_mappings = [] # (message_index, xy2id)
+
+ parser = get_parser()
+
+ for idx, message in enumerate(messages):
+ if not isinstance(message, dict):
+ message = message.__dict__
+
+ if message.get("type") == "computer_call_output":
+ image_url = message.get("output", {}).get("image_url", "")
+ if not image_url:
+ continue
+
+ image_data = image_url.split(",")[-1]
+ if not image_data:
+ continue
+
  result = parser.parse(image_data)
+
  if _on_screenshot:
  await _on_screenshot(result.annotated_image_base64, "annotated_image")
+
+ local_id2xy = {}
+
  for element in result.elements:
- id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
-
- # handle computer calls -> function calls
- new_messages = []
- for message in messages:
+ norm_x = (element.bbox.x1 + element.bbox.x2) / 2
+ norm_y = (element.bbox.y1 + element.bbox.y2) / 2
+ pixel_x = int(norm_x * width)
+ pixel_y = int(norm_y * height)
+ local_id2xy[element.id] = (pixel_x, pixel_y)
+
+ xy2id = {v: k for k, v in local_id2xy.items()}
+ screenshot_mappings.append((idx, xy2id))
+
+ # Replace screenshot with annotated image
+ message["output"]["image_url"] = (
+ f"data:image/png;base64,{result.annotated_image_base64}"
+ )
+
+ def get_mapping_for_index(index):
+ applicable = [m for i, m in screenshot_mappings if i <= index]
+ return applicable[-1] if applicable else {}
+
+ messages_with_element_ids = []
+
+ for i, message in enumerate(messages):
  if not isinstance(message, dict):
  message = message.__dict__
- new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
- messages = new_messages
+
+ xy2id = get_mapping_for_index(i)
+ converted = await replace_computer_call_with_function(message, xy2id)
+ messages_with_element_ids.extend(converted)
+
+ completion_messages = convert_responses_items_to_completion_messages(
+ messages_with_element_ids, allow_images_in_tool_results=False
+ )

  # Prepare API call kwargs
  api_kwargs = {
  "model": llm_model,
- "input": messages,
+ "messages": completion_messages,
  "tools": openai_tools if openai_tools else None,
  "stream": stream,
- "truncation": "auto",
  "num_retries": max_retries,
- **kwargs
+ **kwargs,
  }
-
+
+ # Add Vertex AI specific parameters if using vertex_ai models
+ if llm_model.startswith("vertex_ai/"):
+ import os
+
+ # Pass vertex_project and vertex_location to liteLLM
+ if "vertex_project" not in api_kwargs:
+ api_kwargs["vertex_project"] = os.getenv("GOOGLE_CLOUD_PROJECT")
+ if "vertex_location" not in api_kwargs:
+ api_kwargs["vertex_location"] = "global"
+
+ # Pass through Gemini 3-specific parameters if provided
+ if "thinking_level" in kwargs:
+ api_kwargs["thinking_level"] = kwargs["thinking_level"]
+ if "media_resolution" in kwargs:
+ api_kwargs["media_resolution"] = kwargs["media_resolution"]
+
  # Call API start hook
  if _on_api_start:
  await _on_api_start(api_kwargs)
-
+
  print(str(api_kwargs)[:1000])

- # Use liteLLM responses
- response = await litellm.aresponses(**api_kwargs)
+ # Use liteLLM completion
+ response = await litellm.acompletion(**api_kwargs)

  # Call API end hook
  if _on_api_end:
@@ -330,60 +413,83 @@ class OmniparserConfig(AsyncAgentConfig):

  # Extract usage information
  usage = {
- **response.usage.model_dump(), # type: ignore
- "response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
+ **response.usage.model_dump(), # type: ignore
+ "response_cost": response._hidden_params.get("response_cost", 0.0), # type: ignore
  }
  if _on_usage:
  await _on_usage(usage)

- # handle som function calls -> xy computer calls
- new_output = []
- for i in range(len(response.output)): # type: ignore
- new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
-
- return {
- "output": new_output,
- "usage": usage
- }
-
+ response_dict = response.model_dump() # type: ignore
+ choice_messages = [choice["message"] for choice in response_dict["choices"]]
+ responses_items = []
+ for choice_message in choice_messages:
+ responses_items.extend(convert_completion_messages_to_responses_items([choice_message]))
+
+ # Convert element_id → x,y (similar to moondream's convert_computer_calls_desc2xy)
+ final_output = []
+ for item in responses_items:
+ if item.get("type") == "computer_call" and "action" in item:
+ action = item["action"].copy()
+
+ # Handle single element_id
+ if "element_id" in action:
+ element_id = action["element_id"]
+ if element_id in id2xy:
+ x, y = id2xy[element_id]
+ action["x"] = x
+ action["y"] = y
+ del action["element_id"]
+
+ # Handle start_element_id and end_element_id for drag operations
+ elif "start_element_id" in action and "end_element_id" in action:
+ start_id = action["start_element_id"]
+ end_id = action["end_element_id"]
+ if start_id in id2xy and end_id in id2xy:
+ start_x, start_y = id2xy[start_id]
+ end_x, end_y = id2xy[end_id]
+ action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
+ del action["start_element_id"]
+ del action["end_element_id"]
+
+ converted_item = item.copy()
+ converted_item["action"] = action
+ final_output.append(converted_item)
+ else:
+ final_output.append(item)
+
+ return {"output": final_output, "usage": usage}
+
  async def predict_click(
- self,
- model: str,
- image_b64: str,
- instruction: str,
- **kwargs
+ self, model: str, image_b64: str, instruction: str, **kwargs
  ) -> Optional[Tuple[float, float]]:
  """
  Predict click coordinates using OmniParser and LLM.
-
+
  Uses OmniParser to annotate the image with element IDs, then uses LLM
  to identify the correct element ID based on the instruction.
  """
  if not OMNIPARSER_AVAILABLE:
  return None
-
+
  # Parse the image with OmniParser to get annotated image and elements
  parser = get_parser()
  result = parser.parse(image_b64)
-
+
  # Extract the LLM model from composed model string
- llm_model = model.split('+')[-1]
-
+ llm_model = model.split("+")[-1]
+
  # Create system prompt for element ID prediction
- SYSTEM_PROMPT = f'''
+ SYSTEM_PROMPT = """
  You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element.

  The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element.

  Output only the element ID as a single integer.
- '''.strip()
-
+ """.strip()
+
  # Prepare messages for LLM
  messages = [
- {
- "role": "system",
- "content": SYSTEM_PROMPT
- },
+ {"role": "system", "content": SYSTEM_PROMPT},
  {
  "role": "user",
  "content": [
@@ -391,31 +497,25 @@ Output only the element ID as a single integer.
  "type": "image_url",
  "image_url": {
  "url": f"data:image/png;base64,{result.annotated_image_base64}"
- }
+ },
  },
- {
- "type": "text",
- "text": f"Find the element: {instruction}"
- }
- ]
- }
+ {"type": "text", "text": f"Find the element: {instruction}"},
+ ],
+ },
  ]
-
+
  # Call LLM to predict element ID
  response = await litellm.acompletion(
- model=llm_model,
- messages=messages,
- max_tokens=10,
- temperature=0.1
+ model=llm_model, messages=messages, max_tokens=10, temperature=0.1
  )
-
+
  # Extract element ID from response
- response_text = response.choices[0].message.content.strip() # type: ignore
-
+ response_text = response.choices[0].message.content.strip() # type: ignore
+
  # Try to parse the element ID
  try:
  element_id = int(response_text)
-
+
  # Find the element with this ID and return its center coordinates
  for element in result.elements:
  if element.id == element_id:
@@ -425,9 +525,9 @@ Output only the element ID as a single integer.
  except ValueError:
  # If we can't parse the ID, return None
  pass
-
+
  return None
-
+
  def get_capabilities(self) -> List[AgentCapability]:
  """Return the capabilities supported by this agent."""
  return ["step"]
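For orientation, here is a minimal sketch (not part of the diff) of the calling convention the reworked loop now relies on: the computer tool is declared with the nested {"type": "function", "function": {...}} schema, the request goes through litellm.acompletion rather than litellm.aresponses, and element IDs returned by the model are translated back to pixel coordinates through the id2xy table the loop builds from OmniParser detections. The model name, messages, tool schema (a trimmed-down stand-in for SOM_TOOL_SCHEMA), and the hard-coded id2xy values below are illustrative assumptions.

import asyncio
import json

import litellm

# Trimmed-down stand-in for SOM_TOOL_SCHEMA: the definition is nested under
# "function", the shape litellm.acompletion expects for tool definitions.
COMPUTER_TOOL = {
    "type": "function",
    "function": {
        "name": "computer",
        "description": "Interact with numbered UI elements on an annotated screenshot.",
        "parameters": {
            "type": "object",
            "properties": {
                "action": {"type": "string", "enum": ["screenshot", "click", "type"]},
                "element_id": {"type": "integer"},
                "text": {"type": "string"},
            },
            "required": ["action", "element_id"],
        },
    },
}


async def main() -> None:
    # In the real loop this mapping comes from OmniParser detections scaled to
    # screen pixels; here it is a hand-written example value.
    id2xy = {3: (412, 227)}

    response = await litellm.acompletion(
        model="gpt-4o",  # assumed model; the loop uses whatever follows the "omniparser+" prefix
        messages=[{"role": "user", "content": "Click the Submit button (element 3)."}],
        tools=[COMPUTER_TOOL],
    )

    for call in response.choices[0].message.tool_calls or []:
        args = json.loads(call.function.arguments)  # e.g. {"action": "click", "element_id": 3}
        # Mirror of the loop's post-processing: map the element ID back to pixel coordinates.
        x, y = id2xy.get(args.get("element_id"), (None, None))
        print(args.get("action"), x, y)


asyncio.run(main())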