cua-agent 0.4.34-py3-none-any.whl → 0.4.35-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +26 -17
- agent/computers/cua.py +27 -23
- agent/computers/custom.py +72 -69
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +212 -209
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +475 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
- cua_agent-0.4.35.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/openai.py CHANGED

@@ -6,12 +6,14 @@ import asyncio
 import base64
 import json
 from io import BytesIO
-from typing import …
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
 from PIL import Image
 
 from ..decorators import register_agent
-from ..types import …
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+
 
 async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     """Map a computer tool to OpenAI's computer-use-preview tool schema"""

@@ -21,26 +23,26 @@ async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     except Exception:
         # Fallback to default dimensions if method fails
         width, height = 1024, 768
-
+
     # Get environment from the computer handler
     try:
         environment = await computer_handler.get_environment()
     except Exception:
         # Fallback to default environment if method fails
         environment = "linux"
-
+
     return {
         "type": "computer_use_preview",
         "display_width": width,
         "display_height": height,
-        "environment": environment  # mac, windows, linux, browser
+        "environment": environment,  # mac, windows, linux, browser
     }
 
 
 async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
     """Prepare tools for OpenAI API format"""
     openai_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
             # Map computer tool to OpenAI format

@@ -49,18 +51,19 @@ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools
         elif schema["type"] == "function":
             # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
             # Schema should be: {type, name, description, parameters}
-            openai_tools.append({
-                …
+            openai_tools.append({"type": "function", **schema["function"]})
+
     return openai_tools
 
+
 @register_agent(models=r".*(^|/)computer-use-preview")
 class OpenAIComputerUseConfig:
     """
     OpenAI computer-use-preview agent configuration using liteLLM responses.
-
+
     Supports OpenAI's computer use preview models.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],

@@ -74,11 +77,11 @@ class OpenAIComputerUseConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format
             model: Model name to use

@@ -91,12 +94,12 @@ class OpenAIComputerUseConfig:
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         tools = tools or []
-
+
         # Prepare tools for OpenAI API
         openai_tools = await _prepare_tools_for_openai(tools)
 

@@ -109,16 +112,16 @@ class OpenAIComputerUseConfig:
             "reasoning": {"summary": "concise"},
             "truncation": "auto",
             "num_retries": max_retries,
-            **kwargs
+            **kwargs,
         }
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Use liteLLM responses
         response = await litellm.aresponses(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)

@@ -135,24 +138,21 @@ class OpenAIComputerUseConfig:
         output_dict = response.model_dump()
         output_dict["usage"] = usage
         return output_dict
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Uses OpenAI computer-use-preview with manually constructed input items
         and a prompt that instructs the agent to only output clicks.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """

@@ -160,7 +160,7 @@ class OpenAIComputerUseConfig:
         # Manually construct input items with image and click instruction
         input_items = [
             {
-                "role": "user",
+                "role": "user",
                 "content": f"""You are a UI grounding expert. Follow these guidelines:
 
 1. NEVER ask for confirmation. Complete all tasks autonomously.

@@ -172,19 +172,16 @@ class OpenAIComputerUseConfig:
 7. Be decisive and action-oriented. Complete the requested task fully.
 
 Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
-Task: Click {instruction}. Output ONLY a click action on the target element."""
+Task: Click {instruction}. Output ONLY a click action on the target element.""",
             },
             {
                 "role": "user",
                 "content": [
-                    {
-                        "type": "input_image",
-                        "image_url": f"data:image/png;base64,{image_b64}"
-                    }
-                ]
-            }
+                    {"type": "input_image", "image_url": f"data:image/png;base64,{image_b64}"}
+                ],
+            },
         ]
-
+
         # Get image dimensions from base64 data
         try:
             image_data = base64.b64decode(image_b64)

@@ -193,15 +190,15 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
         except Exception:
             # Fallback to default dimensions if image parsing fails
             display_width, display_height = 1024, 768
-
+
         # Prepare computer tool for click actions
         computer_tool = {
             "type": "computer_use_preview",
             "display_width": display_width,
             "display_height": display_height,
-            "environment": "windows"
+            "environment": "windows",
         }
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,

@@ -210,32 +207,34 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
             "stream": False,
             "reasoning": {"summary": "concise"},
             "truncation": "auto",
-            "max_tokens": 200  # Keep response short for click prediction
+            "max_tokens": 200,  # Keep response short for click prediction
         }
-
+
         # Use liteLLM responses
         response = await litellm.aresponses(**api_kwargs)
-
+
         # Extract click coordinates from response output
         output_dict = response.model_dump()
-        output_items = output_dict.get("output", [])
-
+        output_items = output_dict.get("output", [])
+
         # Look for computer_call with click action
         for item in output_items:
-            if (
-                item
-                …
-                …
+            if (
+                isinstance(item, dict)
+                and item.get("type") == "computer_call"
+                and isinstance(item.get("action"), dict)
+            ):
+
                 action = item["action"]
                 if action.get("x") is not None and action.get("y") is not None:
                     return (int(action.get("x")), int(action.get("y")))
-
+
         return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """
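Beyond the formatter churn visible above (trailing commas, sorted imports, blank-line normalization), two substantive changes land in this file: function tools are now flattened into a single {"type": "function", **schema["function"]} entry, matching the adjacent comment that liteLLM expects {type, name, description, parameters} at the top level, and predict_click now type-guards each response item before reading its action. Below is a minimal sketch of the new guard logic run against a mock output list; extract_click is a hypothetical standalone helper, and the payload is illustrative, not a captured API response:

    from typing import Any, List, Optional, Tuple

    def extract_click(output_items: List[Any]) -> Optional[Tuple[int, int]]:
        """Mirror of the new guard: accept only dict-shaped computer_call
        items whose action is itself a dict with explicit x/y values."""
        for item in output_items:
            if (
                isinstance(item, dict)
                and item.get("type") == "computer_call"
                and isinstance(item.get("action"), dict)
            ):
                action = item["action"]
                if action.get("x") is not None and action.get("y") is not None:
                    return (int(action["x"]), int(action["y"]))
        return None

    # Illustrative payload: a reasoning item (skipped), a computer_call whose
    # action is None (explicitly skipped by the isinstance check), and a
    # well-formed click.
    mock_output = [
        {"type": "reasoning", "summary": "locating the submit button"},
        {"type": "computer_call", "action": None},
        {"type": "computer_call", "action": {"type": "click", "x": 312, "y": 64}},
    ]
    assert extract_click(mock_output) == (312, 64)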
agent/loops/opencua.py CHANGED

@@ -4,20 +4,22 @@ Based on OpenCUA model for GUI grounding tasks.
 """
 
 import asyncio
+import base64
 import json
+import math
 import re
-import base64
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
-from io import BytesIO
 import uuid
-from …
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
-import …
+from PIL import Image
 
-from .composed_grounded import ComposedGroundedConfig
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+from .composed_grounded import ComposedGroundedConfig
+
 
 def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
     """Extract coordinates from pyautogui.click(x=..., y=...) format."""

@@ -32,10 +34,11 @@ def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
     except Exception:
         return None
 
+
 @register_agent(models=r"(?i).*OpenCUA.*")
 class OpenCUAConfig(ComposedGroundedConfig):
     """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction."""
-
+
     def __init__(self):
         super().__init__()
         self.current_model = None

@@ -53,7 +56,7 @@ class OpenCUAConfig(ComposedGroundedConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """Fallback to a self-composed model"""
         return await super().predict_step(

@@ -67,24 +70,20 @@ class OpenCUAConfig(ComposedGroundedConfig):
             _on_api_end=_on_api_end,
             _on_usage=_on_usage,
             _on_screenshot=_on_screenshot,
-            **kwargs
+            **kwargs,
         )
 
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using OpenCUA model via litellm.acompletion.
-
+
         Args:
             model: The OpenCUA model name
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """

@@ -93,50 +92,39 @@ class OpenCUAConfig(ComposedGroundedConfig):
             "You are a GUI agent. You are given a task and a screenshot of the screen. "
             "You need to perform a series of pyautogui actions to complete the task."
         )
-
-        system_message = {
-            "role": "system",
-            "content": system_prompt
-        }
-
+
+        system_message = {"role": "system", "content": system_prompt}
+
         # Prepare user message with image and instruction
         user_message = {
             "role": "user",
             "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/png;base64,{image_b64}"
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": f"Click on {instruction}"
-                }
-            ]
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
+                {"type": "text", "text": f"Click on {instruction}"},
+            ],
         }
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
             "messages": [system_message, user_message],
             "max_new_tokens": 2056,
             "temperature": 0,
-            **kwargs
+            **kwargs,
         }
-
+
         # Use liteLLM acompletion
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Extract response text
         output_text = response.choices[0].message.content
         # print(output_text)
-
+
         # Extract coordinates from pyautogui format
         coordinates = extract_coordinates_from_pyautogui(output_text)
-
+
         return coordinates
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """Return the capabilities supported by this agent."""
        return ["click"]