cua-agent: 0.4.22 (py3-none-any.whl) → 0.7.16 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (79):
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,5 @@
1
+ """Playground server for Cua agents."""
2
+
3
+ from .server import PlaygroundServer
4
+
5
+ __all__ = ["PlaygroundServer"]
@@ -0,0 +1,301 @@
1
+ """Playground server implementation for Cua agents."""
2
+
3
+ import asyncio
4
+ import logging
5
+ import os
6
+ import platform
7
+ import socket
8
+ import traceback
9
+ import webbrowser
10
+ from typing import Any, Dict, List, Optional, Union
11
+ from urllib.parse import quote
12
+
13
+ import uvicorn
14
+ from fastapi import FastAPI, HTTPException, Request
15
+ from fastapi.middleware.cors import CORSMiddleware
16
+ from fastapi.responses import JSONResponse
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class PlaygroundServer:
    """Playground server for running Cua agents via HTTP API.

    Exposes two routes on a FastAPI app:

    * ``GET /status``     - health check reporting the host OS type.
    * ``POST /responses`` - runs an agent over the supplied input and returns
      the accumulated output and usage.

    NOTE(review): the server binds to ``0.0.0.0``, allows every CORS origin
    with credentials, and lets a request override process environment
    variables. That combination is only appropriate on a trusted machine or
    network - confirm before exposing it more widely.
    """

    class _EnvOverride:
        """Context manager that temporarily applies environment overrides.

        On exit every touched variable is restored to its previous value, or
        removed if it did not exist before.

        NOTE(review): ``os.environ`` is process-global, and the endpoint awaits
        while the override is active, so overlapping requests can observe each
        other's overrides.
        """

        def __init__(self, overrides: Optional[Dict[str, str]]):
            self.overrides = overrides or {}
            self._original: Dict[str, Optional[str]] = {}

        def __enter__(self):
            for key, value in self.overrides.items():
                self._original[key] = os.environ.get(key)
                os.environ[key] = str(value)
            return self

        def __exit__(self, exc_type, exc, tb):
            for key, previous in self._original.items():
                if previous is None:
                    os.environ.pop(key, None)
                else:
                    os.environ[key] = previous
            self._original.clear()
            return False

    def __init__(self, agent_instance=None):
        """
        Initialize the playground server.

        Args:
            agent_instance: Optional pre-configured agent instance to use
        """
        self.agent_instance = agent_instance
        self.app = FastAPI(
            title="Cua Playground Server",
            description="Playground server for Cua agents",
            version="0.1.0",
        )
        self._setup_middleware()
        self._setup_routes()
        self.server = None
        self.port = None

    def _setup_middleware(self):
        """Setup CORS middleware."""
        # NOTE(review): wildcard origins combined with allow_credentials=True is
        # maximally permissive; kept as-is so existing front-ends keep working.
        self.app.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_credentials=True,
            allow_methods=["*"],
            allow_headers=["*"],
        )

    @staticmethod
    def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """Convert the request ``input`` into a message list.

        A string becomes a single user message, a list is passed through
        unchanged, and any other type yields an empty list.
        """
        if isinstance(data, str):
            return [{"role": "user", "content": data}]
        if isinstance(data, list):
            return data
        return []

    def _setup_routes(self):
        """Setup API routes."""

        @self.app.get("/status")
        async def status():
            """Health check endpoint."""
            system_name = platform.system().lower()
            if "darwin" in system_name or system_name in ("macos", "mac"):
                os_type = "macos"
            elif "windows" in system_name:
                os_type = "windows"
            else:
                os_type = "linux"

            return {
                "status": "ok",
                "os_type": os_type,
                "features": ["agent", "playground"],
            }

        @self.app.post("/responses")
        async def responses_endpoint(request: Request):
            """
            Run the agent until no computer/function calls remain pending.

            Body JSON:
            {
              "model": "...",                 # required
              "input": "... or messages[]",   # required
              "agent_kwargs": { ... },        # optional, passed directly to ComputerAgent
              "env": { ... }                  # optional env overrides for agent
            }
            """
            # Import here to avoid circular imports
            try:
                from agent import ComputerAgent
            except ImportError as exc:
                raise HTTPException(
                    status_code=501, detail="ComputerAgent not available"
                ) from exc

            # Parse request body
            try:
                body = await request.json()
            except Exception as e:
                raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")

            # A JSON array/string/number parses fine but has no .get(); reject
            # it as a client error instead of failing with a 500.
            if not isinstance(body, dict):
                raise HTTPException(status_code=400, detail="Request body must be a JSON object")

            model = body.get("model")
            input_data = body.get("input")
            if not model or input_data is None:
                raise HTTPException(status_code=400, detail="'model' and 'input' are required")

            agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {}
            env_overrides: Dict[str, str] = body.get("env") or {}
            if not isinstance(agent_kwargs, dict) or not isinstance(env_overrides, dict):
                raise HTTPException(
                    status_code=400,
                    detail="'agent_kwargs' and 'env' must be JSON objects",
                )

            messages = self._to_messages(input_data)

            error = None
            total_output: List[Any] = []
            total_usage: Dict[str, Any] = {}
            # call_ids of computer/function calls that have no output yet
            pending_call_ids = set()

            with self._EnvOverride(env_overrides):
                try:
                    # Use pre-configured agent if available, otherwise create new one.
                    # Construction happens inside the try so a bad model/kwargs is
                    # reported through the same "failed" payload as run errors.
                    if self.agent_instance:
                        agent = self.agent_instance
                    else:
                        agent = ComputerAgent(model=model, **agent_kwargs)  # type: ignore[arg-type]

                    async for result in agent.run(messages):
                        is_mapping = isinstance(result, dict)
                        output_items = result.get("output", []) if is_mapping else []
                        total_output += output_items

                        # Merge usage counters: numbers are summed, anything
                        # else is overwritten by the latest value.
                        usage = result.get("usage") if is_mapping else None
                        if isinstance(usage, dict):
                            for key, value in usage.items():
                                if isinstance(value, (int, float)):
                                    total_usage[key] = total_usage.get(key, 0) + value
                                else:
                                    total_usage[key] = value

                        for msg in output_items:
                            msg_type = msg.get("type")
                            if msg_type in ("computer_call", "function_call"):
                                pending_call_ids.add(msg["call_id"])
                            elif msg_type in ("computer_call_output", "function_call_output"):
                                pending_call_ids.discard(msg["call_id"])

                        # exit if no pending computer calls
                        if not pending_call_ids:
                            break
                except Exception as e:
                    logger.error(f"Error running agent: {str(e)}")
                    logger.error(traceback.format_exc())
                    error = str(e)

            # Build response payload
            payload = {
                "model": model,
                "error": error,
                "output": total_output,
                "usage": total_usage,
                "status": "completed" if not error else "failed",
            }

            # Response headers (CORS itself is handled by the middleware)
            headers = {
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
            }

            return JSONResponse(content=payload, headers=headers)

    def _find_available_port(self, start_port: int = 8000, max_attempts: int = 100) -> int:
        """Find an available port starting from start_port.

        Tries ``max_attempts`` consecutive ports by binding a TCP socket on
        127.0.0.1 and returns the first one that binds.

        Raises:
            RuntimeError: If none of the probed ports could be bound.
        """
        for port in range(start_port, start_port + max_attempts):
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(("127.0.0.1", port))
                    return port
            except OSError:
                continue
        # The last port actually probed is start_port + max_attempts - 1.
        raise RuntimeError(
            f"Could not find an available port in range "
            f"{start_port}-{start_port + max_attempts - 1}"
        )

    def _build_playground_url(self, host: str, port: int) -> str:
        """Build the browser URL that points the playground UI at this server.

        ``custom_model`` is only included when an agent instance exposing a
        ``model`` attribute is configured, so a server created without an
        agent can still open the browser.
        """
        encoded_host = quote(host, safe="")
        encoded_vnc_url = quote("http://localhost:8006/?autoconnect=true", safe="")

        # NOTE(review): the base URL targets a local front-end on port 3000; the
        # original source carried a commented-out "http://cua.ai/dashboard/playground"
        # alternative - confirm which one is intended for releases.
        parts = [
            "http://localhost:3000/dashboard/playground",
            f"?host={encoded_host}",
            f"&port={port}",
            "&id=localhost",
            "&name=localhost",
        ]

        model = getattr(self.agent_instance, "model", None)
        if model is not None:
            parts.append(f"&custom_model={quote(str(model), safe='')}")

        parts.extend(
            [
                f"&custom_vnc_url={encoded_vnc_url}",
                "&vnc_password=null",
                "&resize=scale",
                "&fullscreen=true",
            ]
        )
        return "".join(parts)

    async def start_async(self, port: Optional[int] = None, open_browser: bool = False):
        """
        Start the playground server asynchronously.

        Args:
            port: Port to run the server on. If None, finds an available port.
            open_browser: Whether to open the browser automatically.
        """
        if port is None:
            port = self._find_available_port()

        self.port = port
        host = f"http://localhost:{port}"

        logger.info(f"Starting playground server on {host}")

        if open_browser:
            # The browser is opened before the server starts serving.
            playground_url = self._build_playground_url(host, port)
            logger.info(f"Opening browser at: {playground_url}")
            webbrowser.open(playground_url)

        config = uvicorn.Config(
            self.app,
            host="0.0.0.0",
            port=port,
            log_level="info",
        )
        self.server = uvicorn.Server(config)
        await self.server.serve()

    def start(self, port: Optional[int] = None, open_browser: bool = False):
        """
        Start the playground server.

        Without a running event loop this blocks until the server exits. When
        called from inside a running loop, the server is started on a daemon
        thread with its own loop and this returns after a short pause.

        Args:
            port: Port to run the server on. If None, finds an available port.
            open_browser: Whether to open the browser automatically.
        """
        # Only the loop probe is guarded: a RuntimeError from starting the
        # thread must not be mistaken for "no running loop".
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running loop, can use asyncio.run() safely
            asyncio.run(self.start_async(port=port, open_browser=open_browser))
            return

        import threading
        import time

        # Run the server in a separate thread to avoid blocking the caller's loop
        server_thread = threading.Thread(
            target=self._run_in_new_loop,
            args=(port, open_browser),
            daemon=True,
        )
        server_thread.start()

        # Give the server a moment to start and open browser
        time.sleep(1)

    def _run_in_new_loop(self, port: Optional[int] = None, open_browser: bool = False):
        """Helper to run server in a new event loop (for threading)."""
        new_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(new_loop)
        try:
            new_loop.run_until_complete(self.start_async(port=port, open_browser=open_browser))
        finally:
            new_loop.close()

    async def stop(self):
        """Stop the playground server.

        Signals the uvicorn server to exit; ``serve()`` then performs its own
        shutdown sequence. Setting the flag (rather than awaiting
        ``shutdown()`` from here) also works when the server runs on a
        different thread's event loop.
        """
        if self.server:
            logger.info("Stopping playground server")
            self.server.should_exit = True
agent/proxy/examples.py CHANGED
@@ -1,34 +1,35 @@
1
1
  """
2
2
  Example usage of the proxy server and client requests.
3
3
  """
4
+
4
5
  import dotenv
6
+
5
7
  dotenv.load_dotenv()
6
8
 
7
9
  import asyncio
8
10
  import json
9
11
  import os
12
+ from typing import Any, Dict
13
+
10
14
  import aiohttp
11
- from typing import Dict, Any
12
15
 
13
16
 
14
17
  async def test_http_endpoint():
15
18
  """Test the HTTP /responses endpoint."""
16
-
19
+
17
20
  anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
18
21
  assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set"
19
22
 
20
23
  # Example 1: Simple text request
21
24
  simple_request = {
22
- "model": "anthropic/claude-3-5-sonnet-20241022",
25
+ "model": "anthropic/claude-sonnet-4-5-20250929",
23
26
  "input": "Tell me a three sentence bedtime story about a unicorn.",
24
- "env": {
25
- "ANTHROPIC_API_KEY": anthropic_api_key
26
- }
27
+ "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
27
28
  }
28
-
29
+
29
30
  # Example 2: Multi-modal request with image
30
31
  multimodal_request = {
31
- "model": "anthropic/claude-3-5-sonnet-20241022",
32
+ "model": "anthropic/claude-sonnet-4-5-20250929",
32
33
  "input": [
33
34
  {
34
35
  "role": "user",
@@ -36,73 +37,75 @@ async def test_http_endpoint():
36
37
  {"type": "input_text", "text": "what is in this image?"},
37
38
  {
38
39
  "type": "input_image",
39
- "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
40
- }
41
- ]
40
+ "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
41
+ },
42
+ ],
42
43
  }
43
44
  ],
44
- "env": {
45
- "ANTHROPIC_API_KEY": anthropic_api_key
46
- }
45
+ "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
47
46
  }
48
-
47
+
49
48
  # Example 3: Request with custom agent and computer kwargs
50
49
  custom_request = {
51
- "model": "anthropic/claude-3-5-sonnet-20241022",
50
+ "model": "anthropic/claude-sonnet-4-5-20250929",
52
51
  "input": "Take a screenshot and tell me what you see",
53
- "env": {
54
- "ANTHROPIC_API_KEY": anthropic_api_key
55
- }
52
+ "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
56
53
  }
57
-
54
+
58
55
  # Test requests
59
56
  base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443"
60
57
  # base_url = "http://localhost:8000"
61
58
  api_key = os.getenv("CUA_API_KEY")
62
59
  assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set"
63
-
60
+
64
61
  async with aiohttp.ClientSession() as session:
65
- for i, request_data in enumerate([
66
- simple_request,
67
- # multimodal_request,
68
- custom_request
69
- ], 1):
62
+ for i, request_data in enumerate(
63
+ [
64
+ simple_request,
65
+ # multimodal_request,
66
+ custom_request,
67
+ ],
68
+ 1,
69
+ ):
70
70
  print(f"\n--- Test {i} ---")
71
71
  print(f"Request: {json.dumps(request_data, indent=2)}")
72
-
72
+
73
73
  try:
74
74
  print(f"Sending request to {base_url}/responses")
75
75
  async with session.post(
76
76
  f"{base_url}/responses",
77
77
  json=request_data,
78
- headers={"Content-Type": "application/json", "X-API-Key": api_key}
78
+ headers={"Content-Type": "application/json", "X-API-Key": api_key},
79
79
  ) as response:
80
80
  result = await response.json()
81
81
  print(f"Status: {response.status}")
82
82
  print(f"Response: {json.dumps(result, indent=2)}")
83
-
83
+
84
84
  except Exception as e:
85
85
  print(f"Error: {e}")
86
86
 
87
87
 
88
88
  def curl_examples():
89
89
  """Print curl command examples."""
90
-
90
+
91
91
  print("=== CURL Examples ===\n")
92
-
92
+
93
93
  print("1. Simple text request:")
94
- print("""curl http://localhost:8000/responses \\
94
+ print(
95
+ """curl http://localhost:8000/responses \\
95
96
  -H "Content-Type: application/json" \\
96
97
  -d '{
97
- "model": "anthropic/claude-3-5-sonnet-20241022",
98
+ "model": "anthropic/claude-sonnet-4-5-20250929",
98
99
  "input": "Tell me a three sentence bedtime story about a unicorn."
99
- }'""")
100
-
100
+ }'"""
101
+ )
102
+
101
103
  print("\n2. Multi-modal request with image:")
102
- print("""curl http://localhost:8000/responses \\
104
+ print(
105
+ """curl http://localhost:8000/responses \\
103
106
  -H "Content-Type: application/json" \\
104
107
  -d '{
105
- "model": "anthropic/claude-3-5-sonnet-20241022",
108
+ "model": "anthropic/claude-sonnet-4-5-20250929",
106
109
  "input": [
107
110
  {
108
111
  "role": "user",
@@ -115,13 +118,15 @@ def curl_examples():
115
118
  ]
116
119
  }
117
120
  ]
118
- }'""")
119
-
121
+ }'"""
122
+ )
123
+
120
124
  print("\n3. Request with custom configuration:")
121
- print("""curl http://localhost:8000/responses \\
125
+ print(
126
+ """curl http://localhost:8000/responses \\
122
127
  -H "Content-Type: application/json" \\
123
128
  -d '{
124
- "model": "anthropic/claude-3-5-sonnet-20241022",
129
+ "model": "anthropic/claude-sonnet-4-5-20250929",
125
130
  "input": "Take a screenshot and tell me what you see",
126
131
  "agent_kwargs": {
127
132
  "save_trajectory": true,
@@ -131,50 +136,49 @@ def curl_examples():
131
136
  "os_type": "linux",
132
137
  "provider_type": "cloud"
133
138
  }
134
- }'""")
139
+ }'"""
140
+ )
135
141
 
136
142
 
137
143
  async def test_p2p_client():
138
144
  """Example P2P client using peerjs-python."""
139
145
  try:
140
- from peerjs import Peer, PeerOptions, ConnectionEventType
141
146
  from aiortc import RTCConfiguration, RTCIceServer
142
-
147
+ from peerjs import ConnectionEventType, Peer, PeerOptions
148
+
143
149
  # Set up client peer
144
150
  options = PeerOptions(
145
151
  host="0.peerjs.com",
146
152
  port=443,
147
153
  secure=True,
148
- config=RTCConfiguration(
149
- iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]
150
- )
154
+ config=RTCConfiguration(iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]),
151
155
  )
152
-
156
+
153
157
  client_peer = Peer(id="test-client", peer_options=options)
154
158
  await client_peer.start()
155
-
159
+
156
160
  # Connect to proxy server
157
161
  connection = client_peer.connect("computer-agent-proxy")
158
-
162
+
159
163
  @connection.on(ConnectionEventType.Open)
160
164
  async def connection_open():
161
165
  print("Connected to proxy server")
162
-
166
+
163
167
  # Send a test request
164
168
  request = {
165
- "model": "anthropic/claude-3-5-sonnet-20241022",
166
- "input": "Hello from P2P client!"
169
+ "model": "anthropic/claude-sonnet-4-5-20250929",
170
+ "input": "Hello from P2P client!",
167
171
  }
168
172
  await connection.send(json.dumps(request))
169
-
173
+
170
174
  @connection.on(ConnectionEventType.Data)
171
175
  async def connection_data(data):
172
176
  print(f"Received response: {data}")
173
177
  await client_peer.destroy()
174
-
178
+
175
179
  # Wait for connection
176
180
  await asyncio.sleep(10)
177
-
181
+
178
182
  except ImportError:
179
183
  print("P2P dependencies not available. Install peerjs-python for P2P testing.")
180
184
  except Exception as e:
@@ -183,7 +187,7 @@ async def test_p2p_client():
183
187
 
184
188
  if __name__ == "__main__":
185
189
  import sys
186
-
190
+
187
191
  if len(sys.argv) > 1 and sys.argv[1] == "curl":
188
192
  curl_examples()
189
193
  elif len(sys.argv) > 1 and sys.argv[1] == "p2p":