cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/playground/__init__.py
@@ -0,0 +1,5 @@
+"""Playground server for Cua agents."""
+
+from .server import PlaygroundServer
+
+__all__ = ["PlaygroundServer"]
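
The hunk above adds agent/playground/__init__.py, which simply re-exports the PlaygroundServer class implemented in agent/playground/server.py (the next hunk). A minimal usage sketch, not taken from the package itself, assuming the wheel is installed so that the agent package is importable:

    # Sketch: start the playground HTTP server locally (assumes cua-agent is installed).
    from agent.playground import PlaygroundServer

    server = PlaygroundServer()   # optionally pass a pre-configured agent instance
    server.start(port=8000)       # blocking; a free port is chosen automatically when port is None
    # From inside an existing event loop, the async variant can be awaited instead:
    #     await server.start_async(port=8000, open_browser=False)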
agent/playground/server.py
@@ -0,0 +1,301 @@
+"""Playground server implementation for Cua agents."""
+
+import asyncio
+import logging
+import os
+import platform
+import socket
+import traceback
+import webbrowser
+from typing import Any, Dict, List, Optional, Union
+from urllib.parse import quote
+
+import uvicorn
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+
+logger = logging.getLogger(__name__)
+
+
+class PlaygroundServer:
+    """Playground server for running Cua agents via HTTP API."""
+
+    def __init__(self, agent_instance=None):
+        """
+        Initialize the playground server.
+
+        Args:
+            agent_instance: Optional pre-configured agent instance to use
+        """
+        self.agent_instance = agent_instance
+        self.app = FastAPI(
+            title="Cua Playground Server",
+            description="Playground server for Cua agents",
+            version="0.1.0",
+        )
+        self._setup_middleware()
+        self._setup_routes()
+        self.server = None
+        self.port = None
+
+    def _setup_middleware(self):
+        """Setup CORS middleware."""
+        self.app.add_middleware(
+            CORSMiddleware,
+            allow_origins=["*"],
+            allow_credentials=True,
+            allow_methods=["*"],
+            allow_headers=["*"],
+        )
+
+    def _setup_routes(self):
+        """Setup API routes."""
+
+        @self.app.get("/status")
+        async def status():
+            """Health check endpoint."""
+            sys = platform.system().lower()
+            if "darwin" in sys or sys in ("macos", "mac"):
+                os_type = "macos"
+            elif "windows" in sys:
+                os_type = "windows"
+            else:
+                os_type = "linux"
+
+            return {
+                "status": "ok",
+                "os_type": os_type,
+                "features": ["agent", "playground"],
+            }
+
+        @self.app.post("/responses")
+        async def responses_endpoint(request: Request):
+            """
+            Run ComputerAgent for up to 2 turns.
+
+            Body JSON:
+            {
+                "model": "...",                # required
+                "input": "... or messages[]",  # required
+                "agent_kwargs": { ... },       # optional, passed directly to ComputerAgent
+                "env": { ... }                 # optional env overrides for agent
+            }
+            """
+            # Import here to avoid circular imports
+            try:
+                from agent import ComputerAgent
+            except ImportError:
+                raise HTTPException(status_code=501, detail="ComputerAgent not available")
+
+            # Parse request body
+            try:
+                body = await request.json()
+            except Exception as e:
+                raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
+
+            model = body.get("model")
+            input_data = body.get("input")
+            if not model or input_data is None:
+                raise HTTPException(status_code=400, detail="'model' and 'input' are required")
+
+            agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {}
+            env_overrides: Dict[str, str] = body.get("env") or {}
+
+            # Simple env override context
+            class _EnvOverride:
+                def __init__(self, overrides: Dict[str, str]):
+                    self.overrides = overrides
+                    self._original: Dict[str, Optional[str]] = {}
+
+                def __enter__(self):
+                    for k, v in (self.overrides or {}).items():
+                        self._original[k] = os.environ.get(k)
+                        os.environ[k] = str(v)
+
+                def __exit__(self, exc_type, exc, tb):
+                    for k, old in self._original.items():
+                        if old is None:
+                            os.environ.pop(k, None)
+                        else:
+                            os.environ[k] = old
+
+            # Convert input to messages
+            def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+                if isinstance(data, str):
+                    return [{"role": "user", "content": data}]
+                if isinstance(data, list):
+                    return data
+                return []
+
+            messages = _to_messages(input_data)
+
+            error = None
+
+            with _EnvOverride(env_overrides):
+                # Use pre-configured agent if available, otherwise create new one
+                if self.agent_instance:
+                    agent = self.agent_instance
+                else:
+                    agent = ComputerAgent(model=model, **agent_kwargs)  # type: ignore[arg-type]
+
+                total_output: List[Any] = []
+                total_usage: Dict[str, Any] = {}
+
+                pending_computer_call_ids = set()
+                try:
+                    async for result in agent.run(messages):
+                        total_output += result["output"]
+                        # Try to collect usage if present
+                        if (
+                            isinstance(result, dict)
+                            and "usage" in result
+                            and isinstance(result["usage"], dict)
+                        ):
+                            # Merge usage counters
+                            for k, v in result["usage"].items():
+                                if isinstance(v, (int, float)):
+                                    total_usage[k] = total_usage.get(k, 0) + v
+                                else:
+                                    total_usage[k] = v
+                        for msg in result.get("output", []):
+                            if msg.get("type") == "computer_call":
+                                pending_computer_call_ids.add(msg["call_id"])
+                            elif msg.get("type") == "computer_call_output":
+                                pending_computer_call_ids.discard(msg["call_id"])
+                            elif msg.get("type") == "function_call":
+                                pending_computer_call_ids.add(msg["call_id"])
+                            elif msg.get("type") == "function_call_output":
+                                pending_computer_call_ids.discard(msg["call_id"])
+                        # exit if no pending computer calls
+                        if not pending_computer_call_ids:
+                            break
+                except Exception as e:
+                    logger.error(f"Error running agent: {str(e)}")
+                    logger.error(traceback.format_exc())
+                    error = str(e)
+
+            # Build response payload
+            payload = {
+                "model": model,
+                "error": error,
+                "output": total_output,
+                "usage": total_usage,
+                "status": "completed" if not error else "failed",
+            }
+
+            # CORS: allow any origin
+            headers = {
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+            }
+
+            return JSONResponse(content=payload, headers=headers)
+
+    def _find_available_port(self, start_port: int = 8000, max_attempts: int = 100) -> int:
+        """Find an available port starting from start_port."""
+        for port in range(start_port, start_port + max_attempts):
+            try:
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.bind(("127.0.0.1", port))
+                    return port
+            except OSError:
+                continue
+        raise RuntimeError(
+            f"Could not find an available port in range {start_port}-{start_port + max_attempts}"
+        )
+
+    async def start_async(self, port: Optional[int] = None, open_browser: bool = False):
+        """
+        Start the playground server asynchronously.
+
+        Args:
+            port: Port to run the server on. If None, finds an available port.
+            open_browser: Whether to open the browser automatically.
+        """
+        if port is None:
+            port = self._find_available_port()
+
+        self.port = port
+        host = f"http://localhost:{port}"
+
+        logger.info(f"Starting playground server on {host}")
+
+        if open_browser:
+            # Construct the playground URL
+            encoded_host = quote(host, safe="")
+            encoded_model = quote(self.agent_instance.model, safe="")
+            encoded_vnc_url = quote("http://localhost:8006/?autoconnect=true", safe="")
+
+            # Build URL with custom_model if agent instance is configured
+            playground_url = (
+                # f"http://cua.ai/dashboard/playground"
+                f"http://localhost:3000/dashboard/playground"
+                f"?host={encoded_host}"
+                f"&port={port}"
+                f"&id=localhost"
+                f"&name=localhost"
+                f"&custom_model={encoded_model}"
+                f"&custom_vnc_url={encoded_vnc_url}"
+                f"&vnc_password=null"
+                f"&resize=scale"
+                f"&fullscreen=true"
+            )
+
+            logger.info(f"Opening browser at: {playground_url}")
+            webbrowser.open(playground_url)
+
+        config = uvicorn.Config(
+            self.app,
+            host="0.0.0.0",
+            port=port,
+            log_level="info",
+        )
+        self.server = uvicorn.Server(config)
+        await self.server.serve()
+
+    def start(self, port: Optional[int] = None, open_browser: bool = False):
+        """
+        Start the playground server (blocking).
+
+        Args:
+            port: Port to run the server on. If None, finds an available port.
+            open_browser: Whether to open the browser automatically.
+        """
+        # Check if there's already a running event loop
+        try:
+            loop = asyncio.get_running_loop()
+            # If we're in an async context, schedule as a task
+            import threading
+
+            # Run the server in a separate thread to avoid blocking
+            server_thread = threading.Thread(
+                target=self._run_in_new_loop,
+                args=(port, open_browser),
+                daemon=True,
+            )
+            server_thread.start()
+
+            # Give the server a moment to start and open browser
+            import time
+
+            time.sleep(1)
+
+        except RuntimeError:
+            # No running loop, can use asyncio.run() safely
+            asyncio.run(self.start_async(port=port, open_browser=open_browser))
+
+    def _run_in_new_loop(self, port: Optional[int] = None, open_browser: bool = False):
+        """Helper to run server in a new event loop (for threading)."""
+        new_loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(new_loop)
+        try:
+            new_loop.run_until_complete(self.start_async(port=port, open_browser=open_browser))
+        finally:
+            new_loop.close()
+
+    async def stop(self):
+        """Stop the playground server."""
+        if self.server:
+            logger.info("Stopping playground server")
+            await self.server.shutdown()
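
For reference, the /responses route above expects a JSON body with "model" and "input" (a string or a list of messages), plus optional "agent_kwargs" and "env" keys, and returns model, error, output, usage, and status fields. A hedged client sketch against a locally running PlaygroundServer, not part of the package; it assumes the server is listening on port 8000 and that the requests library is available:

    # Sketch: call the playground /responses endpoint documented in the route's docstring.
    import requests

    resp = requests.post(
        "http://localhost:8000/responses",
        json={
            "model": "anthropic/claude-sonnet-4-5-20250929",
            "input": "Tell me a three sentence bedtime story about a unicorn.",
            "agent_kwargs": {"verbosity": 20},        # forwarded to ComputerAgent
            "env": {"ANTHROPIC_API_KEY": "sk-..."},   # temporary env overrides for this request
        },
        timeout=300,
    )
    payload = resp.json()
    print(payload["status"], payload.get("error"))
    for item in payload["output"]:
        print(item.get("type"))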
agent/proxy/examples.py
@@ -0,0 +1,196 @@
+"""
+Example usage of the proxy server and client requests.
+"""
+
+import dotenv
+
+dotenv.load_dotenv()
+
+import asyncio
+import json
+import os
+from typing import Any, Dict
+
+import aiohttp
+
+
+async def test_http_endpoint():
+    """Test the HTTP /responses endpoint."""
+
+    anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
+    assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set"
+
+    # Example 1: Simple text request
+    simple_request = {
+        "model": "anthropic/claude-sonnet-4-5-20250929",
+        "input": "Tell me a three sentence bedtime story about a unicorn.",
+        "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
+    }
+
+    # Example 2: Multi-modal request with image
+    multimodal_request = {
+        "model": "anthropic/claude-sonnet-4-5-20250929",
+        "input": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "input_text", "text": "what is in this image?"},
+                    {
+                        "type": "input_image",
+                        "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+                    },
+                ],
+            }
+        ],
+        "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
+    }
+
+    # Example 3: Request with custom agent and computer kwargs
+    custom_request = {
+        "model": "anthropic/claude-sonnet-4-5-20250929",
+        "input": "Take a screenshot and tell me what you see",
+        "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
+    }
+
+    # Test requests
+    base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443"
+    # base_url = "http://localhost:8000"
+    api_key = os.getenv("CUA_API_KEY")
+    assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set"
+
+    async with aiohttp.ClientSession() as session:
+        for i, request_data in enumerate(
+            [
+                simple_request,
+                # multimodal_request,
+                custom_request,
+            ],
+            1,
+        ):
+            print(f"\n--- Test {i} ---")
+            print(f"Request: {json.dumps(request_data, indent=2)}")
+
+            try:
+                print(f"Sending request to {base_url}/responses")
+                async with session.post(
+                    f"{base_url}/responses",
+                    json=request_data,
+                    headers={"Content-Type": "application/json", "X-API-Key": api_key},
+                ) as response:
+                    result = await response.json()
+                    print(f"Status: {response.status}")
+                    print(f"Response: {json.dumps(result, indent=2)}")
+
+            except Exception as e:
+                print(f"Error: {e}")
+
+
+def curl_examples():
+    """Print curl command examples."""
+
+    print("=== CURL Examples ===\n")
+
+    print("1. Simple text request:")
+    print(
+        """curl http://localhost:8000/responses \\
+  -H "Content-Type: application/json" \\
+  -d '{
+    "model": "anthropic/claude-sonnet-4-5-20250929",
+    "input": "Tell me a three sentence bedtime story about a unicorn."
+  }'"""
+    )
+
+    print("\n2. Multi-modal request with image:")
+    print(
+        """curl http://localhost:8000/responses \\
+  -H "Content-Type: application/json" \\
+  -d '{
+    "model": "anthropic/claude-sonnet-4-5-20250929",
+    "input": [
+      {
+        "role": "user",
+        "content": [
+          {"type": "input_text", "text": "what is in this image?"},
+          {
+            "type": "input_image",
+            "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+          }
+        ]
+      }
+    ]
+  }'"""
+    )
+
+    print("\n3. Request with custom configuration:")
+    print(
+        """curl http://localhost:8000/responses \\
+  -H "Content-Type: application/json" \\
+  -d '{
+    "model": "anthropic/claude-sonnet-4-5-20250929",
+    "input": "Take a screenshot and tell me what you see",
+    "agent_kwargs": {
+      "save_trajectory": true,
+      "verbosity": 20
+    },
+    "computer_kwargs": {
+      "os_type": "linux",
+      "provider_type": "cloud"
+    }
+  }'"""
+    )
+
+
+async def test_p2p_client():
+    """Example P2P client using peerjs-python."""
+    try:
+        from aiortc import RTCConfiguration, RTCIceServer
+        from peerjs import ConnectionEventType, Peer, PeerOptions
+
+        # Set up client peer
+        options = PeerOptions(
+            host="0.peerjs.com",
+            port=443,
+            secure=True,
+            config=RTCConfiguration(iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]),
+        )
+
+        client_peer = Peer(id="test-client", peer_options=options)
+        await client_peer.start()
+
+        # Connect to proxy server
+        connection = client_peer.connect("computer-agent-proxy")
+
+        @connection.on(ConnectionEventType.Open)
+        async def connection_open():
+            print("Connected to proxy server")
+
+            # Send a test request
+            request = {
+                "model": "anthropic/claude-sonnet-4-5-20250929",
+                "input": "Hello from P2P client!",
+            }
+            await connection.send(json.dumps(request))
+
+        @connection.on(ConnectionEventType.Data)
+        async def connection_data(data):
+            print(f"Received response: {data}")
+            await client_peer.destroy()
+
+        # Wait for connection
+        await asyncio.sleep(10)
+
+    except ImportError:
+        print("P2P dependencies not available. Install peerjs-python for P2P testing.")
+    except Exception as e:
+        print(f"P2P test error: {e}")
+
+
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) > 1 and sys.argv[1] == "curl":
+        curl_examples()
+    elif len(sys.argv) > 1 and sys.argv[1] == "p2p":
+        asyncio.run(test_p2p_client())
+    else:
+        asyncio.run(test_http_endpoint())
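
The examples above target the cloud proxy (note the X-API-Key header). Against a local PlaygroundServer, the /status route from server.py makes a convenient smoke test. A minimal stdlib-only sketch, assuming the playground server is up on localhost:8000:

    # Sketch: health check against the playground server's /status route.
    import json
    from urllib.request import urlopen

    with urlopen("http://localhost:8000/status", timeout=10) as r:
        info = json.load(r)
    print(info["status"], info["os_type"], info["features"])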