cua-agent 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent has been flagged as potentially problematic.
- agent/adapters/__init__.py +2 -0
- agent/adapters/mlxvlm_adapter.py +359 -0
- agent/agent.py +14 -3
- agent/callbacks/__init__.py +2 -0
- agent/callbacks/operator_validator.py +138 -0
- agent/callbacks/trajectory_saver.py +87 -5
- agent/integrations/hud/__init__.py +223 -72
- agent/integrations/hud/proxy.py +183 -0
- agent/loops/anthropic.py +12 -1
- agent/loops/composed_grounded.py +26 -14
- agent/loops/openai.py +15 -7
- agent/loops/uitars.py +17 -8
- agent/proxy/examples.py +192 -0
- agent/proxy/handlers.py +248 -0
- {cua_agent-0.4.17.dist-info → cua_agent-0.4.19.dist-info}/METADATA +3 -3
- {cua_agent-0.4.17.dist-info → cua_agent-0.4.19.dist-info}/RECORD +18 -16
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/agent.py +0 -373
- agent/integrations/hud/computer_handler.py +0 -187
- {cua_agent-0.4.17.dist-info → cua_agent-0.4.19.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.17.dist-info → cua_agent-0.4.19.dist-info}/entry_points.txt +0 -0
agent/loops/composed_grounded.py
CHANGED
@@ -48,11 +48,11 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
                "get_dimensions",
                "get_environment"
            ],
-            "description": "The action to perform"
+            "description": "The action to perform (required for all actions)"
        },
        "element_description": {
            "type": "string",
-            "description": "Description of the element to interact with (required for click, double_click, move, scroll actions
+            "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)"
        },
        "start_element_description": {
            "type": "string",
@@ -67,20 +67,30 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
            "description": "The text to type (required for type action)"
        },
        "keys": {
-            "type": "
-            "
+            "type": "array",
+            "items": {
+                "type": "string"
+            },
+            "description": "Key(s) to press (required for keypress action)"
        },
        "button": {
            "type": "string",
-            "
+            "enum": [
+                "left",
+                "right",
+                "wheel",
+                "back",
+                "forward"
+            ],
+            "description": "The mouse button to use for click action (required for click and double_click action)",
        },
        "scroll_x": {
            "type": "integer",
-            "description": "Horizontal scroll amount for scroll action (
+            "description": "Horizontal scroll amount for scroll action (required for scroll action)",
        },
        "scroll_y": {
            "type": "integer",
-            "description": "Vertical scroll amount for scroll action (
+            "description": "Vertical scroll amount for scroll action (required for scroll action)",
        },
    },
    "required": [
@@ -266,13 +276,15 @@ class ComposedGroundedConfig:
        grounding_agent = grounding_agent_conf.agent_class()

        for desc in element_descriptions:
-
-
-
-
-
-
-
+            for _ in range(3):  # try 3 times
+                coords = await grounding_agent.predict_click(
+                    model=grounding_model,
+                    image_b64=last_image_b64,
+                    instruction=desc
+                )
+                if coords:
+                    self.desc2xy[desc] = coords
+                    break

        # Step 6: Convert computer calls from descriptions back to xy coordinates
        final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
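The last hunk above replaces a single grounding attempt with a bounded retry: predict_click is called up to three times per element description and the first non-empty coordinate pair is kept in desc2xy. A minimal self-contained sketch of that pattern follows; the stub predict_click and its return type are illustrative assumptions, not the project's actual grounding agent.

import asyncio
import random
from typing import Optional, Tuple

async def predict_click(instruction: str) -> Optional[Tuple[int, int]]:
    # Stand-in for the grounding model: returns (x, y) or None when grounding fails.
    return (120, 340) if random.random() > 0.5 else None

async def ground(descriptions: list) -> dict:
    desc2xy = {}
    for desc in descriptions:
        for _ in range(3):  # same bound as the hunk: try 3 times
            coords = await predict_click(desc)
            if coords:
                desc2xy[desc] = coords
                break
    return desc2xy

print(asyncio.run(ground(["Submit button", "Search field"])))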
agent/loops/openai.py
CHANGED
@@ -162,7 +162,18 @@ class OpenAIComputerUseConfig:
        input_items = [
            {
                "role": "user",
-                "content": f"You are a UI grounding expert.
+                "content": f"""You are a UI grounding expert. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+6. The user has already given you permission by running this agent. No further confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+Task: Click {instruction}. Output ONLY a click action on the target element."""
            },
            {
                "role": "user",
@@ -200,7 +211,7 @@ class OpenAIComputerUseConfig:
            "stream": False,
            "reasoning": {"summary": "concise"},
            "truncation": "auto",
-            "max_tokens":
+            "max_tokens": 200  # Keep response short for click prediction
        }

        # Use liteLLM responses
@@ -217,11 +228,8 @@
                isinstance(item.get("action"), dict)):

                action = item["action"]
-                if action.get("
-
-                y = action.get("y")
-                if x is not None and y is not None:
-                    return (int(x), int(y))
+                if action.get("x") is not None and action.get("y") is not None:
+                    return (int(action.get("x")), int(action.get("y")))

        return None
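The extraction change at the bottom of this file collapses the x/y handling into a single check: the response output is scanned for an action dict carrying both "x" and "y", which are returned as an int tuple. A compact standalone version of that scan is sketched below; the computer_call type check and the sample output list are assumptions for illustration, since the diff only shows part of the surrounding condition.

from typing import Optional, Tuple

def extract_click(output: list) -> Optional[Tuple[int, int]]:
    # Return the first (x, y) found on a computer_call action, mirroring the simplified check above.
    for item in output:
        if item.get("type") == "computer_call" and isinstance(item.get("action"), dict):
            action = item["action"]
            if action.get("x") is not None and action.get("y") is not None:
                return (int(action["x"]), int(action["y"]))
    return None

sample = [
    {"type": "reasoning"},
    {"type": "computer_call", "action": {"type": "click", "x": 412, "y": 97}},
]
print(extract_click(sample))  # (412, 97)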
agent/loops/uitars.py
CHANGED
@@ -228,15 +228,24 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> Lis

        # Handle coordinate parameters
        if "start_box" in param_name or "end_box" in param_name:
-            # Parse coordinates like '(x,y)' or '(
-
-
+            # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'
+            # First, remove special tokens
+            clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
+            # Then remove parentheses and split
+            numbers = clean_param.replace("(", "").replace(")", "").split(",")

-
-
-
-
-
+            try:
+                float_numbers = [float(num.strip()) / 1000 for num in numbers]  # Normalize to 0-1 range
+
+                if len(float_numbers) == 2:
+                    # Single point, duplicate for box format
+                    float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
+
+                action_inputs[param_name.strip()] = str(float_numbers)
+            except ValueError as e:
+                # If parsing fails, keep the original parameter value
+                print(f"Warning: Could not parse coordinates '{param}': {e}")
+                action_inputs[param_name.strip()] = param

    return [{
        "thought": thought,
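UI-TARS reports coordinates on a 0-1000 grid, so the new parsing code divides by 1000 to get normalized 0-1 values and duplicates a lone (x, y) point into [x, y, x, y] box form. A small worked example of that normalization follows; the final pixel-scaling step is an assumption about how a caller might consume the normalized box, not code taken from this diff.

def parse_box(param: str) -> list:
    # Strip UI-TARS special tokens and parentheses, then normalize to the 0-1 range.
    clean = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
    numbers = clean.replace("(", "").replace(")", "").split(",")
    floats = [float(n.strip()) / 1000 for n in numbers]
    if len(floats) == 2:  # single point -> duplicate into box form
        floats = [floats[0], floats[1], floats[0], floats[1]]
    return floats

box = parse_box("<|box_start|>(500,250)<|box_end|>")
print(box)  # [0.5, 0.25, 0.5, 0.25]

# Assumed downstream use: scale the box center back to pixels for a 1920x1080 screenshot.
image_width, image_height = 1920, 1080
x = int((box[0] + box[2]) / 2 * image_width)
y = int((box[1] + box[3]) / 2 * image_height)
print((x, y))  # (960, 270)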
agent/proxy/examples.py
ADDED
@@ -0,0 +1,192 @@
"""
Example usage of the proxy server and client requests.
"""
import dotenv
dotenv.load_dotenv()

import asyncio
import json
import os
import aiohttp
from typing import Dict, Any


async def test_http_endpoint():
    """Test the HTTP /responses endpoint."""

    anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
    assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set"

    # Example 1: Simple text request
    simple_request = {
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "input": "Tell me a three sentence bedtime story about a unicorn.",
        "env": {
            "ANTHROPIC_API_KEY": anthropic_api_key
        }
    }

    # Example 2: Multi-modal request with image
    multimodal_request = {
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "input": [
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": "what is in this image?"},
                    {
                        "type": "input_image",
                        "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
                    }
                ]
            }
        ],
        "env": {
            "ANTHROPIC_API_KEY": anthropic_api_key
        }
    }

    # Example 3: Request with custom agent and computer kwargs
    custom_request = {
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "input": "Take a screenshot and tell me what you see",
        "env": {
            "ANTHROPIC_API_KEY": anthropic_api_key
        }
    }

    # Test requests
    base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443"
    # base_url = "http://localhost:8000"
    api_key = os.getenv("CUA_API_KEY")
    assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set"

    async with aiohttp.ClientSession() as session:
        for i, request_data in enumerate([
            simple_request,
            # multimodal_request,
            custom_request
        ], 1):
            print(f"\n--- Test {i} ---")
            print(f"Request: {json.dumps(request_data, indent=2)}")

            try:
                print(f"Sending request to {base_url}/responses")
                async with session.post(
                    f"{base_url}/responses",
                    json=request_data,
                    headers={"Content-Type": "application/json", "X-API-Key": api_key}
                ) as response:
                    result = await response.json()
                    print(f"Status: {response.status}")
                    print(f"Response: {json.dumps(result, indent=2)}")

            except Exception as e:
                print(f"Error: {e}")


def curl_examples():
    """Print curl command examples."""

    print("=== CURL Examples ===\n")

    print("1. Simple text request:")
    print("""curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "input": "Tell me a three sentence bedtime story about a unicorn."
  }'""")

    print("\n2. Multi-modal request with image:")
    print("""curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "input": [
      {
        "role": "user",
        "content": [
          {"type": "input_text", "text": "what is in this image?"},
          {
            "type": "input_image",
            "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
          }
        ]
      }
    ]
  }'""")

    print("\n3. Request with custom configuration:")
    print("""curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "input": "Take a screenshot and tell me what you see",
    "agent_kwargs": {
      "save_trajectory": true,
      "verbosity": 20
    },
    "computer_kwargs": {
      "os_type": "linux",
      "provider_type": "cloud"
    }
  }'""")


async def test_p2p_client():
    """Example P2P client using peerjs-python."""
    try:
        from peerjs import Peer, PeerOptions, ConnectionEventType
        from aiortc import RTCConfiguration, RTCIceServer

        # Set up client peer
        options = PeerOptions(
            host="0.peerjs.com",
            port=443,
            secure=True,
            config=RTCConfiguration(
                iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]
            )
        )

        client_peer = Peer(id="test-client", peer_options=options)
        await client_peer.start()

        # Connect to proxy server
        connection = client_peer.connect("computer-agent-proxy")

        @connection.on(ConnectionEventType.Open)
        async def connection_open():
            print("Connected to proxy server")

            # Send a test request
            request = {
                "model": "anthropic/claude-3-5-sonnet-20241022",
                "input": "Hello from P2P client!"
            }
            await connection.send(json.dumps(request))

        @connection.on(ConnectionEventType.Data)
        async def connection_data(data):
            print(f"Received response: {data}")
            await client_peer.destroy()

        # Wait for connection
        await asyncio.sleep(10)

    except ImportError:
        print("P2P dependencies not available. Install peerjs-python for P2P testing.")
    except Exception as e:
        print(f"P2P test error: {e}")


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "curl":
        curl_examples()
    elif len(sys.argv) > 1 and sys.argv[1] == "p2p":
        asyncio.run(test_p2p_client())
    else:
        asyncio.run(test_http_endpoint())
agent/proxy/handlers.py
ADDED
@@ -0,0 +1,248 @@
"""
Request handlers for the proxy endpoints.
"""

import asyncio
import json
import logging
import os
from contextlib import contextmanager
from typing import Dict, Any, List, Union, Optional

from ..agent import ComputerAgent
from computer import Computer

logger = logging.getLogger(__name__)


class ResponsesHandler:
    """Handler for /responses endpoint that processes agent requests."""

    def __init__(self):
        self.computer = None
        self.agent = None
        # Simple in-memory caches
        self._computer_cache: Dict[str, Any] = {}
        self._agent_cache: Dict[str, Any] = {}

    async def setup_computer_agent(
        self,
        model: str,
        agent_kwargs: Optional[Dict[str, Any]] = None,
        computer_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Set up (and cache) computer and agent instances.

        Caching keys:
        - Computer cache key: computer_kwargs
        - Agent cache key: {"model": model, **agent_kwargs}
        """
        agent_kwargs = agent_kwargs or {}
        computer_kwargs = computer_kwargs or {}

        def _stable_key(obj: Dict[str, Any]) -> str:
            try:
                return json.dumps(obj, sort_keys=True, separators=(",", ":"))
            except Exception:
                # Fallback: stringify non-serializable values
                safe_obj = {}
                for k, v in obj.items():
                    try:
                        json.dumps(v)
                        safe_obj[k] = v
                    except Exception:
                        safe_obj[k] = str(v)
                return json.dumps(safe_obj, sort_keys=True, separators=(",", ":"))

        # Determine if custom tools are supplied; if so, skip computer setup entirely
        has_custom_tools = bool(agent_kwargs.get("tools"))

        computer = None
        if not has_custom_tools:
            # ---------- Computer setup (with cache) ----------
            comp_key = _stable_key(computer_kwargs)

            computer = self._computer_cache.get(comp_key)
            if computer is None:
                # Default computer configuration
                default_c_config = {
                    "os_type": "linux",
                    "provider_type": "cloud",
                    "name": os.getenv("CUA_CONTAINER_NAME"),
                    "api_key": os.getenv("CUA_API_KEY"),
                }
                default_c_config.update(computer_kwargs)
                computer = Computer(**default_c_config)
                await computer.__aenter__()
                self._computer_cache[comp_key] = computer
                logger.info(f"Computer created and cached with key={comp_key} config={default_c_config}")
            else:
                logger.info(f"Reusing cached computer for key={comp_key}")

        # Bind current computer reference (None if custom tools supplied)
        self.computer = computer

        # ---------- Agent setup (with cache) ----------
        # Build agent cache key from {model} + agent_kwargs (excluding tools unless explicitly passed)
        agent_kwargs_for_key = dict(agent_kwargs)
        agent_key_payload = {"model": model, **agent_kwargs_for_key}
        agent_key = _stable_key(agent_key_payload)

        agent = self._agent_cache.get(agent_key)
        if agent is None:
            # Default agent configuration
            default_a_config: Dict[str, Any] = {"model": model}
            if not has_custom_tools:
                default_a_config["tools"] = [computer]
            # Apply user overrides, but keep tools unless user explicitly sets
            if agent_kwargs:
                if not has_custom_tools:
                    agent_kwargs.setdefault("tools", [computer])
                default_a_config.update(agent_kwargs)
            # JSON-derived kwargs may have loose types; ignore static arg typing here
            agent = ComputerAgent(**default_a_config)  # type: ignore[arg-type]
            self._agent_cache[agent_key] = agent
            logger.info(f"Agent created and cached with key={agent_key} model={model}")
        else:
            # Ensure cached agent uses the current computer tool (in case object differs)
            # Only update if tools not explicitly provided in agent_kwargs
            if not has_custom_tools:
                try:
                    agent.tools = [computer]
                except Exception:
                    pass
            logger.info(f"Reusing cached agent for key={agent_key}")

        # Bind current agent reference
        self.agent = agent

    async def process_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a /responses request and return the result.

        Args:
            request_data: Dictionary containing model, input, and optional kwargs

        Returns:
            Dictionary with the agent's response
        """
        try:
            # Extract request parameters
            model = request_data.get("model")
            input_data = request_data.get("input")
            agent_kwargs = request_data.get("agent_kwargs", {})
            computer_kwargs = request_data.get("computer_kwargs", {})
            env_overrides = request_data.get("env", {}) or {}

            if not model:
                raise ValueError("Model is required")
            if not input_data:
                raise ValueError("Input is required")

            # Apply env overrides for the duration of this request
            with self._env_overrides(env_overrides):
                # Set up (and possibly reuse) computer and agent via caches
                await self.setup_computer_agent(model, agent_kwargs, computer_kwargs)

                # Defensive: ensure agent is initialized for type checkers
                agent = self.agent
                if agent is None:
                    raise RuntimeError("Agent failed to initialize")

                # Convert input to messages format
                messages = self._convert_input_to_messages(input_data)

                # Run agent and get first result
                async for result in agent.run(messages):
                    # Return the first result and break
                    return {
                        "success": True,
                        "result": result,
                        "model": model
                    }

                # If no results were yielded
                return {
                    "success": False,
                    "error": "No results from agent",
                    "model": model
                }

        except Exception as e:
            logger.error(f"Error processing request: {e}")
            return {
                "success": False,
                "error": str(e),
                "model": request_data.get("model", "unknown")
            }

    def _convert_input_to_messages(self, input_data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """Convert input data to messages format."""
        if isinstance(input_data, str):
            # Simple string input
            return [{"role": "user", "content": input_data}]
        elif isinstance(input_data, list):
            # Already in messages format
            messages = []
            for msg in input_data:
                # Convert content array format if needed
                if isinstance(msg.get("content"), list):
                    content_parts = []
                    for part in msg["content"]:
                        if part.get("type") == "input_text":
                            content_parts.append({"type": "text", "text": part["text"]})
                        elif part.get("type") == "input_image":
                            content_parts.append({
                                "type": "image_url",
                                "image_url": {"url": part["image_url"]}
                            })
                        else:
                            content_parts.append(part)
                    messages.append({
                        "role": msg["role"],
                        "content": content_parts
                    })
                else:
                    messages.append(msg)
            return messages
        else:
            raise ValueError("Input must be string or list of messages")

    async def cleanup(self):
        """Clean up resources."""
        if self.computer:
            try:
                await self.computer.__aexit__(None, None, None)
            except Exception as e:
                logger.error(f"Error cleaning up computer: {e}")
            finally:
                self.computer = None
                self.agent = None

    @staticmethod
    @contextmanager
    def _env_overrides(env: Dict[str, str]):
        """Temporarily apply environment variable overrides for the current process.
        Restores previous values after the context exits.

        Args:
            env: Mapping of env var names to override for this request.
        """
        if not env:
            # No-op context
            yield
            return

        original: Dict[str, Optional[str]] = {}
        try:
            for k, v in env.items():
                original[k] = os.environ.get(k)
                os.environ[k] = str(v)
            yield
        finally:
            for k, old in original.items():
                if old is None:
                    # Was not set before
                    os.environ.pop(k, None)
                else:
                    os.environ[k] = old
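ResponsesHandler keys its in-memory caches on a stable JSON serialization of the kwargs, so repeated requests with the same model, agent_kwargs, and computer_kwargs reuse one agent and one cloud computer session. A minimal usage sketch follows, assuming the process has valid CUA_API_KEY / CUA_CONTAINER_NAME environment variables; the placeholder API key is illustrative and the request shape mirrors examples.py.

import asyncio
from agent.proxy.handlers import ResponsesHandler

async def main():
    handler = ResponsesHandler()
    try:
        result = await handler.process_request({
            "model": "anthropic/claude-3-5-sonnet-20241022",
            "input": "Take a screenshot and tell me what you see",
            # Per-request env overrides are applied only while this request runs.
            "env": {"ANTHROPIC_API_KEY": "sk-..."},  # placeholder key, not a real credential
        })
        print(result["success"], result.get("error"))
    finally:
        await handler.cleanup()  # tears down the cached Computer session

asyncio.run(main())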
{cua_agent-0.4.17.dist-info → cua_agent-0.4.19.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.4.17
+Version: 0.4.19
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.12
@@ -38,7 +38,7 @@ Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
 Provides-Extra: cli
 Requires-Dist: yaspin>=3.1.0; extra == "cli"
 Provides-Extra: hud
-Requires-Dist: hud-python
+Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "hud"
 Provides-Extra: all
 Requires-Dist: ultralytics>=8.0.0; extra == "all"
 Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "all"
@@ -49,7 +49,7 @@ Requires-Dist: transformers>=4.54.0; extra == "all"
 Requires-Dist: gradio>=5.23.3; extra == "all"
 Requires-Dist: python-dotenv>=1.0.1; extra == "all"
 Requires-Dist: yaspin>=3.1.0; extra == "all"
-Requires-Dist: hud-python
+Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "all"
 Description-Content-Type: text/markdown

 <div align="center">