cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent has been flagged as possibly problematic. (The original page links to more details at this point; the link target is not preserved in this text.)
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +26 -17
- agent/computers/cua.py +27 -23
- agent/computers/custom.py +72 -69
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +212 -209
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +475 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
- cua_agent-0.4.35.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/proxy/examples.py
CHANGED
|
@@ -1,19 +1,22 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Example usage of the proxy server and client requests.
|
|
3
3
|
"""
|
|
4
|
+
|
|
4
5
|
import dotenv
|
|
6
|
+
|
|
5
7
|
dotenv.load_dotenv()
|
|
6
8
|
|
|
7
9
|
import asyncio
|
|
8
10
|
import json
|
|
9
11
|
import os
|
|
12
|
+
from typing import Any, Dict
|
|
13
|
+
|
|
10
14
|
import aiohttp
|
|
11
|
-
from typing import Dict, Any
|
|
12
15
|
|
|
13
16
|
|
|
14
17
|
async def test_http_endpoint():
|
|
15
18
|
"""Test the HTTP /responses endpoint."""
|
|
16
|
-
|
|
19
|
+
|
|
17
20
|
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
|
|
18
21
|
assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set"
|
|
19
22
|
|
|
@@ -21,11 +24,9 @@ async def test_http_endpoint():
|
|
|
21
24
|
simple_request = {
|
|
22
25
|
"model": "anthropic/claude-3-5-sonnet-20241022",
|
|
23
26
|
"input": "Tell me a three sentence bedtime story about a unicorn.",
|
|
24
|
-
"env": {
|
|
25
|
-
"ANTHROPIC_API_KEY": anthropic_api_key
|
|
26
|
-
}
|
|
27
|
+
"env": {"ANTHROPIC_API_KEY": anthropic_api_key},
|
|
27
28
|
}
|
|
28
|
-
|
|
29
|
+
|
|
29
30
|
# Example 2: Multi-modal request with image
|
|
30
31
|
multimodal_request = {
|
|
31
32
|
"model": "anthropic/claude-3-5-sonnet-20241022",
|
|
@@ -36,70 +37,72 @@ async def test_http_endpoint():
|
|
|
36
37
|
{"type": "input_text", "text": "what is in this image?"},
|
|
37
38
|
{
|
|
38
39
|
"type": "input_image",
|
|
39
|
-
"image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
|
40
|
-
}
|
|
41
|
-
]
|
|
40
|
+
"image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
|
41
|
+
},
|
|
42
|
+
],
|
|
42
43
|
}
|
|
43
44
|
],
|
|
44
|
-
"env": {
|
|
45
|
-
"ANTHROPIC_API_KEY": anthropic_api_key
|
|
46
|
-
}
|
|
45
|
+
"env": {"ANTHROPIC_API_KEY": anthropic_api_key},
|
|
47
46
|
}
|
|
48
|
-
|
|
47
|
+
|
|
49
48
|
# Example 3: Request with custom agent and computer kwargs
|
|
50
49
|
custom_request = {
|
|
51
50
|
"model": "anthropic/claude-3-5-sonnet-20241022",
|
|
52
51
|
"input": "Take a screenshot and tell me what you see",
|
|
53
|
-
"env": {
|
|
54
|
-
"ANTHROPIC_API_KEY": anthropic_api_key
|
|
55
|
-
}
|
|
52
|
+
"env": {"ANTHROPIC_API_KEY": anthropic_api_key},
|
|
56
53
|
}
|
|
57
|
-
|
|
54
|
+
|
|
58
55
|
# Test requests
|
|
59
56
|
base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443"
|
|
60
57
|
# base_url = "http://localhost:8000"
|
|
61
58
|
api_key = os.getenv("CUA_API_KEY")
|
|
62
59
|
assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set"
|
|
63
|
-
|
|
60
|
+
|
|
64
61
|
async with aiohttp.ClientSession() as session:
|
|
65
|
-
for i, request_data in enumerate(
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
62
|
+
for i, request_data in enumerate(
|
|
63
|
+
[
|
|
64
|
+
simple_request,
|
|
65
|
+
# multimodal_request,
|
|
66
|
+
custom_request,
|
|
67
|
+
],
|
|
68
|
+
1,
|
|
69
|
+
):
|
|
70
70
|
print(f"\n--- Test {i} ---")
|
|
71
71
|
print(f"Request: {json.dumps(request_data, indent=2)}")
|
|
72
|
-
|
|
72
|
+
|
|
73
73
|
try:
|
|
74
74
|
print(f"Sending request to {base_url}/responses")
|
|
75
75
|
async with session.post(
|
|
76
76
|
f"{base_url}/responses",
|
|
77
77
|
json=request_data,
|
|
78
|
-
headers={"Content-Type": "application/json", "X-API-Key": api_key}
|
|
78
|
+
headers={"Content-Type": "application/json", "X-API-Key": api_key},
|
|
79
79
|
) as response:
|
|
80
80
|
result = await response.json()
|
|
81
81
|
print(f"Status: {response.status}")
|
|
82
82
|
print(f"Response: {json.dumps(result, indent=2)}")
|
|
83
|
-
|
|
83
|
+
|
|
84
84
|
except Exception as e:
|
|
85
85
|
print(f"Error: {e}")
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
def curl_examples():
|
|
89
89
|
"""Print curl command examples."""
|
|
90
|
-
|
|
90
|
+
|
|
91
91
|
print("=== CURL Examples ===\n")
|
|
92
|
-
|
|
92
|
+
|
|
93
93
|
print("1. Simple text request:")
|
|
94
|
-
print(
|
|
94
|
+
print(
|
|
95
|
+
"""curl http://localhost:8000/responses \\
|
|
95
96
|
-H "Content-Type: application/json" \\
|
|
96
97
|
-d '{
|
|
97
98
|
"model": "anthropic/claude-3-5-sonnet-20241022",
|
|
98
99
|
"input": "Tell me a three sentence bedtime story about a unicorn."
|
|
99
|
-
}'"""
|
|
100
|
-
|
|
100
|
+
}'"""
|
|
101
|
+
)
|
|
102
|
+
|
|
101
103
|
print("\n2. Multi-modal request with image:")
|
|
102
|
-
print(
|
|
104
|
+
print(
|
|
105
|
+
"""curl http://localhost:8000/responses \\
|
|
103
106
|
-H "Content-Type: application/json" \\
|
|
104
107
|
-d '{
|
|
105
108
|
"model": "anthropic/claude-3-5-sonnet-20241022",
|
|
@@ -115,10 +118,12 @@ def curl_examples():
|
|
|
115
118
|
]
|
|
116
119
|
}
|
|
117
120
|
]
|
|
118
|
-
}'"""
|
|
119
|
-
|
|
121
|
+
}'"""
|
|
122
|
+
)
|
|
123
|
+
|
|
120
124
|
print("\n3. Request with custom configuration:")
|
|
121
|
-
print(
|
|
125
|
+
print(
|
|
126
|
+
"""curl http://localhost:8000/responses \\
|
|
122
127
|
-H "Content-Type: application/json" \\
|
|
123
128
|
-d '{
|
|
124
129
|
"model": "anthropic/claude-3-5-sonnet-20241022",
|
|
@@ -131,50 +136,49 @@ def curl_examples():
|
|
|
131
136
|
"os_type": "linux",
|
|
132
137
|
"provider_type": "cloud"
|
|
133
138
|
}
|
|
134
|
-
}'"""
|
|
139
|
+
}'"""
|
|
140
|
+
)
|
|
135
141
|
|
|
136
142
|
|
|
137
143
|
async def test_p2p_client():
|
|
138
144
|
"""Example P2P client using peerjs-python."""
|
|
139
145
|
try:
|
|
140
|
-
from peerjs import Peer, PeerOptions, ConnectionEventType
|
|
141
146
|
from aiortc import RTCConfiguration, RTCIceServer
|
|
142
|
-
|
|
147
|
+
from peerjs import ConnectionEventType, Peer, PeerOptions
|
|
148
|
+
|
|
143
149
|
# Set up client peer
|
|
144
150
|
options = PeerOptions(
|
|
145
151
|
host="0.peerjs.com",
|
|
146
152
|
port=443,
|
|
147
153
|
secure=True,
|
|
148
|
-
config=RTCConfiguration(
|
|
149
|
-
iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]
|
|
150
|
-
)
|
|
154
|
+
config=RTCConfiguration(iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]),
|
|
151
155
|
)
|
|
152
|
-
|
|
156
|
+
|
|
153
157
|
client_peer = Peer(id="test-client", peer_options=options)
|
|
154
158
|
await client_peer.start()
|
|
155
|
-
|
|
159
|
+
|
|
156
160
|
# Connect to proxy server
|
|
157
161
|
connection = client_peer.connect("computer-agent-proxy")
|
|
158
|
-
|
|
162
|
+
|
|
159
163
|
@connection.on(ConnectionEventType.Open)
|
|
160
164
|
async def connection_open():
|
|
161
165
|
print("Connected to proxy server")
|
|
162
|
-
|
|
166
|
+
|
|
163
167
|
# Send a test request
|
|
164
168
|
request = {
|
|
165
169
|
"model": "anthropic/claude-3-5-sonnet-20241022",
|
|
166
|
-
"input": "Hello from P2P client!"
|
|
170
|
+
"input": "Hello from P2P client!",
|
|
167
171
|
}
|
|
168
172
|
await connection.send(json.dumps(request))
|
|
169
|
-
|
|
173
|
+
|
|
170
174
|
@connection.on(ConnectionEventType.Data)
|
|
171
175
|
async def connection_data(data):
|
|
172
176
|
print(f"Received response: {data}")
|
|
173
177
|
await client_peer.destroy()
|
|
174
|
-
|
|
178
|
+
|
|
175
179
|
# Wait for connection
|
|
176
180
|
await asyncio.sleep(10)
|
|
177
|
-
|
|
181
|
+
|
|
178
182
|
except ImportError:
|
|
179
183
|
print("P2P dependencies not available. Install peerjs-python for P2P testing.")
|
|
180
184
|
except Exception as e:
|
|
@@ -183,7 +187,7 @@ async def test_p2p_client():
|
|
|
183
187
|
|
|
184
188
|
if __name__ == "__main__":
|
|
185
189
|
import sys
|
|
186
|
-
|
|
190
|
+
|
|
187
191
|
if len(sys.argv) > 1 and sys.argv[1] == "curl":
|
|
188
192
|
curl_examples()
|
|
189
193
|
elif len(sys.argv) > 1 and sys.argv[1] == "p2p":
|
agent/proxy/handlers.py
CHANGED
|
@@ -7,24 +7,25 @@ import json
|
|
|
7
7
|
import logging
|
|
8
8
|
import os
|
|
9
9
|
from contextlib import contextmanager
|
|
10
|
-
from typing import
|
|
10
|
+
from typing import Any, Dict, List, Optional, Union
|
|
11
11
|
|
|
12
|
-
from ..agent import ComputerAgent
|
|
13
12
|
from computer import Computer
|
|
14
13
|
|
|
14
|
+
from ..agent import ComputerAgent
|
|
15
|
+
|
|
15
16
|
logger = logging.getLogger(__name__)
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
class ResponsesHandler:
|
|
19
20
|
"""Handler for /responses endpoint that processes agent requests."""
|
|
20
|
-
|
|
21
|
+
|
|
21
22
|
def __init__(self):
|
|
22
23
|
self.computer = None
|
|
23
24
|
self.agent = None
|
|
24
25
|
# Simple in-memory caches
|
|
25
26
|
self._computer_cache: Dict[str, Any] = {}
|
|
26
27
|
self._agent_cache: Dict[str, Any] = {}
|
|
27
|
-
|
|
28
|
+
|
|
28
29
|
async def setup_computer_agent(
|
|
29
30
|
self,
|
|
30
31
|
model: str,
|
|
@@ -75,7 +76,9 @@ class ResponsesHandler:
|
|
|
75
76
|
computer = Computer(**default_c_config)
|
|
76
77
|
await computer.__aenter__()
|
|
77
78
|
self._computer_cache[comp_key] = computer
|
|
78
|
-
logger.info(
|
|
79
|
+
logger.info(
|
|
80
|
+
f"Computer created and cached with key={comp_key} config={default_c_config}"
|
|
81
|
+
)
|
|
79
82
|
else:
|
|
80
83
|
logger.info(f"Reusing cached computer for key={comp_key}")
|
|
81
84
|
|
|
@@ -115,14 +118,14 @@ class ResponsesHandler:
|
|
|
115
118
|
|
|
116
119
|
# Bind current agent reference
|
|
117
120
|
self.agent = agent
|
|
118
|
-
|
|
121
|
+
|
|
119
122
|
async def process_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
120
123
|
"""
|
|
121
124
|
Process a /responses request and return the result.
|
|
122
|
-
|
|
125
|
+
|
|
123
126
|
Args:
|
|
124
127
|
request_data: Dictionary containing model, input, and optional kwargs
|
|
125
|
-
|
|
128
|
+
|
|
126
129
|
Returns:
|
|
127
130
|
Dictionary with the agent's response
|
|
128
131
|
"""
|
|
@@ -133,12 +136,12 @@ class ResponsesHandler:
|
|
|
133
136
|
agent_kwargs = request_data.get("agent_kwargs", {})
|
|
134
137
|
computer_kwargs = request_data.get("computer_kwargs", {})
|
|
135
138
|
env_overrides = request_data.get("env", {}) or {}
|
|
136
|
-
|
|
139
|
+
|
|
137
140
|
if not model:
|
|
138
141
|
raise ValueError("Model is required")
|
|
139
142
|
if not input_data:
|
|
140
143
|
raise ValueError("Input is required")
|
|
141
|
-
|
|
144
|
+
|
|
142
145
|
# Apply env overrides for the duration of this request
|
|
143
146
|
with self._env_overrides(env_overrides):
|
|
144
147
|
# Set up (and possibly reuse) computer and agent via caches
|
|
@@ -155,28 +158,22 @@ class ResponsesHandler:
|
|
|
155
158
|
# Run agent and get first result
|
|
156
159
|
async for result in agent.run(messages):
|
|
157
160
|
# Return the first result and break
|
|
158
|
-
return {
|
|
159
|
-
|
|
160
|
-
"result": result,
|
|
161
|
-
"model": model
|
|
162
|
-
}
|
|
163
|
-
|
|
161
|
+
return {"success": True, "result": result, "model": model}
|
|
162
|
+
|
|
164
163
|
# If no results were yielded
|
|
165
|
-
return {
|
|
166
|
-
|
|
167
|
-
"error": "No results from agent",
|
|
168
|
-
"model": model
|
|
169
|
-
}
|
|
170
|
-
|
|
164
|
+
return {"success": False, "error": "No results from agent", "model": model}
|
|
165
|
+
|
|
171
166
|
except Exception as e:
|
|
172
167
|
logger.error(f"Error processing request: {e}")
|
|
173
168
|
return {
|
|
174
169
|
"success": False,
|
|
175
170
|
"error": str(e),
|
|
176
|
-
"model": request_data.get("model", "unknown")
|
|
171
|
+
"model": request_data.get("model", "unknown"),
|
|
177
172
|
}
|
|
178
|
-
|
|
179
|
-
def _convert_input_to_messages(
|
|
173
|
+
|
|
174
|
+
def _convert_input_to_messages(
|
|
175
|
+
self, input_data: Union[str, List[Dict[str, Any]]]
|
|
176
|
+
) -> List[Dict[str, Any]]:
|
|
180
177
|
"""Convert input data to messages format."""
|
|
181
178
|
if isinstance(input_data, str):
|
|
182
179
|
# Simple string input
|
|
@@ -192,22 +189,18 @@ class ResponsesHandler:
|
|
|
192
189
|
if part.get("type") == "input_text":
|
|
193
190
|
content_parts.append({"type": "text", "text": part["text"]})
|
|
194
191
|
elif part.get("type") == "input_image":
|
|
195
|
-
content_parts.append(
|
|
196
|
-
"type": "image_url",
|
|
197
|
-
|
|
198
|
-
})
|
|
192
|
+
content_parts.append(
|
|
193
|
+
{"type": "image_url", "image_url": {"url": part["image_url"]}}
|
|
194
|
+
)
|
|
199
195
|
else:
|
|
200
196
|
content_parts.append(part)
|
|
201
|
-
messages.append({
|
|
202
|
-
"role": msg["role"],
|
|
203
|
-
"content": content_parts
|
|
204
|
-
})
|
|
197
|
+
messages.append({"role": msg["role"], "content": content_parts})
|
|
205
198
|
else:
|
|
206
199
|
messages.append(msg)
|
|
207
200
|
return messages
|
|
208
201
|
else:
|
|
209
202
|
raise ValueError("Input must be string or list of messages")
|
|
210
|
-
|
|
203
|
+
|
|
211
204
|
async def cleanup(self):
|
|
212
205
|
"""Clean up resources."""
|
|
213
206
|
if self.computer:
|