cua-agent 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

agent/loops/omniparser.py CHANGED
@@ -1,5 +1,7 @@
  """
  OpenAI computer-use-preview agent loop implementation using liteLLM
+ Paper: https://arxiv.org/abs/2408.00203
+ Code: https://github.com/microsoft/OmniParser
  """

  import asyncio
@@ -9,8 +11,9 @@ import litellm
  import inspect
  import base64

- from ..decorators import agent_loop
- from ..types import Messages, AgentResponse, Tools
+ from ..decorators import register_agent
+ from ..types import Messages, AgentResponse, Tools, AgentCapability
+ from ..loops.base import AsyncAgentConfig

  SOM_TOOL_SCHEMA = {
      "type": "function",
@@ -246,94 +249,185 @@ async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[
      return [item]


- @agent_loop(models=r"omniparser\+.*|omni\+.*", priority=10)
- async def omniparser_loop(
-     messages: Messages,
-     model: str,
-     tools: Optional[List[Dict[str, Any]]] = None,
-     max_retries: Optional[int] = None,
-     stream: bool = False,
-     computer_handler=None,
-     use_prompt_caching: Optional[bool] = False,
-     _on_api_start=None,
-     _on_api_end=None,
-     _on_usage=None,
-     _on_screenshot=None,
-     **kwargs
- ) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
-     """
-     OpenAI computer-use-preview agent loop using liteLLM responses.
-
-     Supports OpenAI's computer use preview models.
-     """
-     if not OMNIPARSER_AVAILABLE:
-         raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
-
-     tools = tools or []
+ @register_agent(models=r"omniparser\+.*|omni\+.*", priority=2)
+ class OmniparserConfig(AsyncAgentConfig):
+     """Omniparser agent configuration implementing AsyncAgentConfig protocol."""

-     llm_model = model.split('+')[-1]
+     async def predict_step(
+         self,
+         messages: List[Dict[str, Any]],
+         model: str,
+         tools: Optional[List[Dict[str, Any]]] = None,
+         max_retries: Optional[int] = None,
+         stream: bool = False,
+         computer_handler=None,
+         use_prompt_caching: Optional[bool] = False,
+         _on_api_start=None,
+         _on_api_end=None,
+         _on_usage=None,
+         _on_screenshot=None,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         OpenAI computer-use-preview agent loop using liteLLM responses.
+
+         Supports OpenAI's computer use preview models.
+         """
+         if not OMNIPARSER_AVAILABLE:
+             raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
+
+         tools = tools or []
+
+         llm_model = model.split('+')[-1]

-     # Prepare tools for OpenAI API
-     openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
+         # Prepare tools for OpenAI API
+         openai_tools, id2xy = _prepare_tools_for_omniparser(tools)

-     # Find last computer_call_output
-     last_computer_call_output = get_last_computer_call_output(messages)
-     if last_computer_call_output:
-         image_url = last_computer_call_output.get("output", {}).get("image_url", "")
-         image_data = image_url.split(",")[-1]
-         if image_data:
-             parser = get_parser()
-             result = parser.parse(image_data)
-             if _on_screenshot:
-                 await _on_screenshot(result.annotated_image_base64, "annotated_image")
-             for element in result.elements:
-                 id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
-
-     # handle computer calls -> function calls
-     new_messages = []
-     for message in messages:
-         if not isinstance(message, dict):
-             message = message.__dict__
-         new_messages += await replace_computer_call_with_function(message, id2xy)
-     messages = new_messages
+         # Find last computer_call_output
+         last_computer_call_output = get_last_computer_call_output(messages)  # type: ignore
+         if last_computer_call_output:
+             image_url = last_computer_call_output.get("output", {}).get("image_url", "")
+             image_data = image_url.split(",")[-1]
+             if image_data:
+                 parser = get_parser()
+                 result = parser.parse(image_data)
+                 if _on_screenshot:
+                     await _on_screenshot(result.annotated_image_base64, "annotated_image")
+                 for element in result.elements:
+                     id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
+
+         # handle computer calls -> function calls
+         new_messages = []
+         for message in messages:
+             if not isinstance(message, dict):
+                 message = message.__dict__
+             new_messages += await replace_computer_call_with_function(message, id2xy)  # type: ignore
+         messages = new_messages

-     # Prepare API call kwargs
-     api_kwargs = {
-         "model": llm_model,
-         "input": messages,
-         "tools": openai_tools if openai_tools else None,
-         "stream": stream,
-         "reasoning": {"summary": "concise"},
-         "truncation": "auto",
-         "num_retries": max_retries,
-         **kwargs
-     }
-
-     # Call API start hook
-     if _on_api_start:
-         await _on_api_start(api_kwargs)
-
-     print(str(api_kwargs)[:1000])
+         # Prepare API call kwargs
+         api_kwargs = {
+             "model": llm_model,
+             "input": messages,
+             "tools": openai_tools if openai_tools else None,
+             "stream": stream,
+             "truncation": "auto",
+             "num_retries": max_retries,
+             **kwargs
+         }
+
+         # Call API start hook
+         if _on_api_start:
+             await _on_api_start(api_kwargs)
+
+         print(str(api_kwargs)[:1000])
+
+         # Use liteLLM responses
+         response = await litellm.aresponses(**api_kwargs)

-     # Use liteLLM responses
-     response = await litellm.aresponses(**api_kwargs)
+         # Call API end hook
+         if _on_api_end:
+             await _on_api_end(api_kwargs, response)

-     # Call API end hook
-     if _on_api_end:
-         await _on_api_end(api_kwargs, response)
+         # Extract usage information
+         usage = {
+             **response.usage.model_dump(),  # type: ignore
+             "response_cost": response._hidden_params.get("response_cost", 0.0),  # type: ignore
+         }
+         if _on_usage:
+             await _on_usage(usage)

-     # Extract usage information
-     response.usage = {
-         **response.usage.model_dump(),
-         "response_cost": response._hidden_params.get("response_cost", 0.0),
-     }
-     if _on_usage:
-         await _on_usage(response.usage)
+         # handle som function calls -> xy computer calls
+         new_output = []
+         for i in range(len(response.output)):  # type: ignore
+             new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)  # type: ignore
+
+         return {
+             "output": new_output,
+             "usage": usage
+         }
+
+     async def predict_click(
+         self,
+         model: str,
+         image_b64: str,
+         instruction: str,
+         **kwargs
+     ) -> Optional[Tuple[float, float]]:
+         """
+         Predict click coordinates using OmniParser and LLM.
+
+         Uses OmniParser to annotate the image with element IDs, then uses LLM
+         to identify the correct element ID based on the instruction.
+         """
+         if not OMNIPARSER_AVAILABLE:
+             return None
+
+         # Parse the image with OmniParser to get annotated image and elements
+         parser = get_parser()
+         result = parser.parse(image_b64)
+
+         # Extract the LLM model from composed model string
+         llm_model = model.split('+')[-1]
+
+         # Create system prompt for element ID prediction
+         SYSTEM_PROMPT = f'''
+         You are an expert UI element locator. Given a GUI image annotated with numerical IDs over each interactable element, along with a user's element description, provide the ID of the specified element.

-     # handle som function calls -> xy computer calls
-     new_output = []
-     for i in range(len(response.output)):
-         new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)
-     response.output = new_output
+         The image shows UI elements with numbered overlays. Each number corresponds to a clickable/interactable element.

-     return response
+         Output only the element ID as a single integer.
+         '''.strip()
+
+         # Prepare messages for LLM
+         messages = [
+             {
+                 "role": "system",
+                 "content": SYSTEM_PROMPT
+             },
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": f"data:image/png;base64,{result.annotated_image_base64}"
+                         }
+                     },
+                     {
+                         "type": "text",
+                         "text": f"Find the element: {instruction}"
+                     }
+                 ]
+             }
+         ]
+
+         # Call LLM to predict element ID
+         response = await litellm.acompletion(
+             model=llm_model,
+             messages=messages,
+             max_tokens=10,
+             temperature=0.1
+         )
+
+         # Extract element ID from response
+         response_text = response.choices[0].message.content.strip()  # type: ignore
+
+         # Try to parse the element ID
+         try:
+             element_id = int(response_text)
+
+             # Find the element with this ID and return its center coordinates
+             for element in result.elements:
+                 if element.id == element_id:
+                     center_x = (element.bbox.x1 + element.bbox.x2) / 2
+                     center_y = (element.bbox.y1 + element.bbox.y2) / 2
+                     return (center_x, center_y)
+         except ValueError:
+             # If we can't parse the ID, return None
+             pass
+
+         return None
+
+     def get_capabilities(self) -> List[AgentCapability]:
+         """Return the capabilities supported by this agent."""
+         return ["step"]
agent/loops/openai.py CHANGED
@@ -3,31 +3,49 @@ OpenAI computer-use-preview agent loop implementation using liteLLM
  """

  import asyncio
+ import base64
  import json
- from typing import Dict, List, Any, AsyncGenerator, Union, Optional
+ from io import BytesIO
+ from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
  import litellm
+ from PIL import Image

- from ..decorators import agent_loop
- from ..types import Messages, AgentResponse, Tools
+ from ..decorators import register_agent
+ from ..types import Messages, AgentResponse, Tools, AgentCapability

- def _map_computer_tool_to_openai(computer_tool: Any) -> Dict[str, Any]:
+ async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
      """Map a computer tool to OpenAI's computer-use-preview tool schema"""
+     # Get dimensions from the computer handler
+     try:
+         width, height = await computer_handler.get_dimensions()
+     except Exception:
+         # Fallback to default dimensions if method fails
+         width, height = 1024, 768
+
+     # Get environment from the computer handler
+     try:
+         environment = await computer_handler.get_environment()
+     except Exception:
+         # Fallback to default environment if method fails
+         environment = "linux"
+
      return {
          "type": "computer_use_preview",
-         "display_width": getattr(computer_tool, 'display_width', 1024),
-         "display_height": getattr(computer_tool, 'display_height', 768),
-         "environment": getattr(computer_tool, 'environment', "linux")  # mac, windows, linux, browser
+         "display_width": width,
+         "display_height": height,
+         "environment": environment  # mac, windows, linux, browser
      }


- def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
+ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
      """Prepare tools for OpenAI API format"""
      openai_tools = []

      for schema in tool_schemas:
          if schema["type"] == "computer":
              # Map computer tool to OpenAI format
-             openai_tools.append(_map_computer_tool_to_openai(schema["computer"]))
+             computer_tool = await _map_computer_tool_to_openai(schema["computer"])
+             openai_tools.append(computer_tool)
          elif schema["type"] == "function":
              # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
              # Schema should be: {type, name, description, parameters}
@@ -36,60 +54,182 @@ def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
      return openai_tools


- @agent_loop(models=r".*computer-use-preview.*", priority=10)
- async def openai_computer_use_loop(
-     messages: Messages,
-     model: str,
-     tools: Optional[List[Dict[str, Any]]] = None,
-     max_retries: Optional[int] = None,
-     stream: bool = False,
-     computer_handler=None,
-     use_prompt_caching: Optional[bool] = False,
-     _on_api_start=None,
-     _on_api_end=None,
-     _on_usage=None,
-     _on_screenshot=None,
-     **kwargs
- ) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+ @register_agent(models=r".*computer-use-preview.*")
+ class OpenAIComputerUseConfig:
      """
-     OpenAI computer-use-preview agent loop using liteLLM responses.
+     OpenAI computer-use-preview agent configuration using liteLLM responses.

      Supports OpenAI's computer use preview models.
      """
-     tools = tools or []

-     # Prepare tools for OpenAI API
-     openai_tools = _prepare_tools_for_openai(tools)
-
-     # Prepare API call kwargs
-     api_kwargs = {
-         "model": model,
-         "input": messages,
-         "tools": openai_tools if openai_tools else None,
-         "stream": stream,
-         "reasoning": {"summary": "concise"},
-         "truncation": "auto",
-         "num_retries": max_retries,
+     async def predict_step(
+         self,
+         messages: List[Dict[str, Any]],
+         model: str,
+         tools: Optional[List[Dict[str, Any]]] = None,
+         max_retries: Optional[int] = None,
+         stream: bool = False,
+         computer_handler=None,
+         use_prompt_caching: Optional[bool] = False,
+         _on_api_start=None,
+         _on_api_end=None,
+         _on_usage=None,
+         _on_screenshot=None,
          **kwargs
-     }
-
-     # Call API start hook
-     if _on_api_start:
-         await _on_api_start(api_kwargs)
-
-     # Use liteLLM responses
-     response = await litellm.aresponses(**api_kwargs)
-
-     # Call API end hook
-     if _on_api_end:
-         await _on_api_end(api_kwargs, response)
+     ) -> Dict[str, Any]:
+         """
+         Predict the next step based on input items.
+
+         Args:
+             messages: Input items following Responses format
+             model: Model name to use
+             tools: Optional list of tool schemas
+             max_retries: Maximum number of retries
+             stream: Whether to stream responses
+             computer_handler: Computer handler instance
+             _on_api_start: Callback for API start
+             _on_api_end: Callback for API end
+             _on_usage: Callback for usage tracking
+             _on_screenshot: Callback for screenshot events
+             **kwargs: Additional arguments
+
+         Returns:
+             Dictionary with "output" (output items) and "usage" array
+         """
+         tools = tools or []
+
+         # Prepare tools for OpenAI API
+         openai_tools = await _prepare_tools_for_openai(tools)

-     # Extract usage information
-     response.usage = {
-         **response.usage.model_dump(),
-         "response_cost": response._hidden_params.get("response_cost", 0.0),
-     }
-     if _on_usage:
-         await _on_usage(response.usage)
+         # Prepare API call kwargs
+         api_kwargs = {
+             "model": model,
+             "input": messages,
+             "tools": openai_tools if openai_tools else None,
+             "stream": stream,
+             "reasoning": {"summary": "concise"},
+             "truncation": "auto",
+             "num_retries": max_retries,
+             **kwargs
+         }
+
+         # Call API start hook
+         if _on_api_start:
+             await _on_api_start(api_kwargs)
+
+         # Use liteLLM responses
+         response = await litellm.aresponses(**api_kwargs)
+
+         # Call API end hook
+         if _on_api_end:
+             await _on_api_end(api_kwargs, response)
+
+         # Extract usage information
+         usage = {
+             **response.usage.model_dump(),
+             "response_cost": response._hidden_params.get("response_cost", 0.0),
+         }
+         if _on_usage:
+             await _on_usage(usage)
+
+         # Return in the expected format
+         output_dict = response.model_dump()
+         output_dict["usage"] = usage
+         return output_dict
+
+     async def predict_click(
+         self,
+         model: str,
+         image_b64: str,
+         instruction: str
+     ) -> Optional[Tuple[int, int]]:
+         """
+         Predict click coordinates based on image and instruction.
+
+         Uses OpenAI computer-use-preview with manually constructed input items
+         and a prompt that instructs the agent to only output clicks.
+
+         Args:
+             model: Model name to use
+             image_b64: Base64 encoded image
+             instruction: Instruction for where to click
+
+         Returns:
+             Tuple of (x, y) coordinates or None if prediction fails
+         """
+         # TODO: use computer tool to get dimensions + environment
+         # Manually construct input items with image and click instruction
+         input_items = [
+             {
+                 "role": "user",
+                 "content": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+             },
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "input_image",
+                         "image_url": f"data:image/png;base64,{image_b64}"
+                     }
+                 ]
+             }
+         ]
+
+         # Get image dimensions from base64 data
+         try:
+             image_data = base64.b64decode(image_b64)
+             image = Image.open(BytesIO(image_data))
+             display_width, display_height = image.size
+         except Exception:
+             # Fallback to default dimensions if image parsing fails
+             display_width, display_height = 1024, 768
+
+         # Prepare computer tool for click actions
+         computer_tool = {
+             "type": "computer_use_preview",
+             "display_width": display_width,
+             "display_height": display_height,
+             "environment": "windows"
+         }
+
+         # Prepare API call kwargs
+         api_kwargs = {
+             "model": model,
+             "input": input_items,
+             "tools": [computer_tool],
+             "stream": False,
+             "reasoning": {"summary": "concise"},
+             "truncation": "auto",
+             "max_tokens": 100  # Keep response short for click prediction
+         }
+
+         # Use liteLLM responses
+         response = await litellm.aresponses(**api_kwargs)
+
+         # Extract click coordinates from response output
+         output_dict = response.model_dump()
+         output_items = output_dict.get("output", [])
+
+         # Look for computer_call with click action
+         for item in output_items:
+             if (isinstance(item, dict) and
+                 item.get("type") == "computer_call" and
+                 isinstance(item.get("action"), dict)):
+
+                 action = item["action"]
+                 if action.get("type") == "click":
+                     x = action.get("x")
+                     y = action.get("y")
+                     if x is not None and y is not None:
+                         return (int(x), int(y))
+
+         return None

-     return response
+     def get_capabilities(self) -> List[AgentCapability]:
+         """
+         Get list of capabilities supported by this agent config.
+
+         Returns:
+             List of capability strings
+         """
+         return ["click", "step"]