cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +32 -19
  24. agent/computers/cua.py +33 -25
  25. agent/computers/custom.py +78 -71
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +215 -210
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +510 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.36.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/proxy/examples.py CHANGED
@@ -1,19 +1,22 @@
 """
 Example usage of the proxy server and client requests.
 """
+
 import dotenv
+
 dotenv.load_dotenv()
 
 import asyncio
 import json
 import os
+from typing import Any, Dict
+
 import aiohttp
-from typing import Dict, Any
 
 
 async def test_http_endpoint():
     """Test the HTTP /responses endpoint."""
-
+
     anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
     assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set"
 
@@ -21,11 +24,9 @@ async def test_http_endpoint():
     simple_request = {
         "model": "anthropic/claude-3-5-sonnet-20241022",
         "input": "Tell me a three sentence bedtime story about a unicorn.",
-        "env": {
-            "ANTHROPIC_API_KEY": anthropic_api_key
-        }
+        "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
     }
-
+
     # Example 2: Multi-modal request with image
     multimodal_request = {
         "model": "anthropic/claude-3-5-sonnet-20241022",
@@ -36,70 +37,72 @@ async def test_http_endpoint():
                     {"type": "input_text", "text": "what is in this image?"},
                     {
                         "type": "input_image",
-                        "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-                    }
-                ]
+                        "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+                    },
+                ],
             }
         ],
-        "env": {
-            "ANTHROPIC_API_KEY": anthropic_api_key
-        }
+        "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
     }
-
+
     # Example 3: Request with custom agent and computer kwargs
     custom_request = {
         "model": "anthropic/claude-3-5-sonnet-20241022",
         "input": "Take a screenshot and tell me what you see",
-        "env": {
-            "ANTHROPIC_API_KEY": anthropic_api_key
-        }
+        "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
     }
-
+
     # Test requests
     base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443"
     # base_url = "http://localhost:8000"
     api_key = os.getenv("CUA_API_KEY")
     assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set"
-
+
     async with aiohttp.ClientSession() as session:
-        for i, request_data in enumerate([
-            simple_request,
-            # multimodal_request,
-            custom_request
-        ], 1):
+        for i, request_data in enumerate(
+            [
+                simple_request,
+                # multimodal_request,
+                custom_request,
+            ],
+            1,
+        ):
             print(f"\n--- Test {i} ---")
             print(f"Request: {json.dumps(request_data, indent=2)}")
-
+
             try:
                 print(f"Sending request to {base_url}/responses")
                 async with session.post(
                     f"{base_url}/responses",
                     json=request_data,
-                    headers={"Content-Type": "application/json", "X-API-Key": api_key}
+                    headers={"Content-Type": "application/json", "X-API-Key": api_key},
                 ) as response:
                     result = await response.json()
                     print(f"Status: {response.status}")
                    print(f"Response: {json.dumps(result, indent=2)}")
-
+
             except Exception as e:
                 print(f"Error: {e}")
 
 
 def curl_examples():
     """Print curl command examples."""
-
+
     print("=== CURL Examples ===\n")
-
+
     print("1. Simple text request:")
-    print("""curl http://localhost:8000/responses \\
+    print(
+        """curl http://localhost:8000/responses \\
   -H "Content-Type: application/json" \\
   -d '{
     "model": "anthropic/claude-3-5-sonnet-20241022",
     "input": "Tell me a three sentence bedtime story about a unicorn."
-  }'""")
-
+  }'"""
+    )
+
     print("\n2. Multi-modal request with image:")
-    print("""curl http://localhost:8000/responses \\
+    print(
+        """curl http://localhost:8000/responses \\
   -H "Content-Type: application/json" \\
   -d '{
     "model": "anthropic/claude-3-5-sonnet-20241022",
@@ -115,10 +118,12 @@ def curl_examples():
       ]
     }
   ]
-  }'""")
-
+  }'"""
+    )
+
     print("\n3. Request with custom configuration:")
-    print("""curl http://localhost:8000/responses \\
+    print(
+        """curl http://localhost:8000/responses \\
   -H "Content-Type: application/json" \\
   -d '{
     "model": "anthropic/claude-3-5-sonnet-20241022",
@@ -131,50 +136,49 @@ def curl_examples():
       "os_type": "linux",
      "provider_type": "cloud"
     }
-  }'""")
+  }'"""
+    )
 
 
 async def test_p2p_client():
     """Example P2P client using peerjs-python."""
     try:
-        from peerjs import Peer, PeerOptions, ConnectionEventType
         from aiortc import RTCConfiguration, RTCIceServer
-
+        from peerjs import ConnectionEventType, Peer, PeerOptions
+
         # Set up client peer
         options = PeerOptions(
             host="0.peerjs.com",
             port=443,
             secure=True,
-            config=RTCConfiguration(
-                iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]
-            )
+            config=RTCConfiguration(iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]),
         )
-
+
         client_peer = Peer(id="test-client", peer_options=options)
         await client_peer.start()
-
+
         # Connect to proxy server
         connection = client_peer.connect("computer-agent-proxy")
-
+
         @connection.on(ConnectionEventType.Open)
         async def connection_open():
             print("Connected to proxy server")
-
+
             # Send a test request
             request = {
                 "model": "anthropic/claude-3-5-sonnet-20241022",
-                "input": "Hello from P2P client!"
+                "input": "Hello from P2P client!",
             }
             await connection.send(json.dumps(request))
-
+
         @connection.on(ConnectionEventType.Data)
         async def connection_data(data):
            print(f"Received response: {data}")
             await client_peer.destroy()
-
+
         # Wait for connection
         await asyncio.sleep(10)
-
+
     except ImportError:
         print("P2P dependencies not available. Install peerjs-python for P2P testing.")
     except Exception as e:
@@ -183,7 +187,7 @@ async def test_p2p_client():
 
 if __name__ == "__main__":
     import sys
-
+
     if len(sys.argv) > 1 and sys.argv[1] == "curl":
         curl_examples()
     elif len(sys.argv) > 1 and sys.argv[1] == "p2p":
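
For orientation, the request shape exercised above is small: a JSON body with model, input, and an env block, POSTed to /responses with Content-Type and X-API-Key headers. A minimal standalone sketch of the same call follows, assuming a proxy reachable at http://localhost:8000 and the ANTHROPIC_API_KEY / CUA_API_KEY environment variables are set; the base URL and credentials are placeholders, not part of this release.

# Minimal sketch of a /responses call, mirroring the request shape shown in the
# examples above. Base URL and credentials are placeholders.
import asyncio
import json
import os

import aiohttp


async def send_simple_request() -> None:
    payload = {
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "input": "Tell me a three sentence bedtime story about a unicorn.",
        # Provider credentials are forwarded to the proxy via the "env" block.
        "env": {"ANTHROPIC_API_KEY": os.environ["ANTHROPIC_API_KEY"]},
    }
    headers = {"Content-Type": "application/json", "X-API-Key": os.environ["CUA_API_KEY"]}
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "http://localhost:8000/responses", json=payload, headers=headers
        ) as response:
            print(response.status)
            print(json.dumps(await response.json(), indent=2))


if __name__ == "__main__":
    asyncio.run(send_simple_request())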
agent/proxy/handlers.py CHANGED
@@ -7,24 +7,25 @@ import json
 import logging
 import os
 from contextlib import contextmanager
-from typing import Dict, Any, List, Union, Optional
+from typing import Any, Dict, List, Optional, Union
 
-from ..agent import ComputerAgent
 from computer import Computer
 
+from ..agent import ComputerAgent
+
 logger = logging.getLogger(__name__)
 
 
 class ResponsesHandler:
     """Handler for /responses endpoint that processes agent requests."""
-
+
     def __init__(self):
         self.computer = None
         self.agent = None
         # Simple in-memory caches
         self._computer_cache: Dict[str, Any] = {}
         self._agent_cache: Dict[str, Any] = {}
-
+
     async def setup_computer_agent(
         self,
         model: str,
@@ -75,7 +76,9 @@ class ResponsesHandler:
                 computer = Computer(**default_c_config)
                 await computer.__aenter__()
                 self._computer_cache[comp_key] = computer
-                logger.info(f"Computer created and cached with key={comp_key} config={default_c_config}")
+                logger.info(
+                    f"Computer created and cached with key={comp_key} config={default_c_config}"
+                )
             else:
                 logger.info(f"Reusing cached computer for key={comp_key}")
 
@@ -115,14 +118,14 @@
 
         # Bind current agent reference
         self.agent = agent
-
+
     async def process_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
         """
         Process a /responses request and return the result.
-
+
         Args:
             request_data: Dictionary containing model, input, and optional kwargs
-
+
         Returns:
             Dictionary with the agent's response
         """
@@ -133,12 +136,12 @@
             agent_kwargs = request_data.get("agent_kwargs", {})
             computer_kwargs = request_data.get("computer_kwargs", {})
             env_overrides = request_data.get("env", {}) or {}
-
+
             if not model:
                 raise ValueError("Model is required")
             if not input_data:
                 raise ValueError("Input is required")
-
+
             # Apply env overrides for the duration of this request
             with self._env_overrides(env_overrides):
                 # Set up (and possibly reuse) computer and agent via caches
@@ -155,28 +158,22 @@
                 # Run agent and get first result
                 async for result in agent.run(messages):
                     # Return the first result and break
-                    return {
-                        "success": True,
-                        "result": result,
-                        "model": model
-                    }
-
+                    return {"success": True, "result": result, "model": model}
+
                 # If no results were yielded
-                return {
-                    "success": False,
-                    "error": "No results from agent",
-                    "model": model
-                }
-
+                return {"success": False, "error": "No results from agent", "model": model}
+
         except Exception as e:
             logger.error(f"Error processing request: {e}")
             return {
                 "success": False,
                 "error": str(e),
-                "model": request_data.get("model", "unknown")
+                "model": request_data.get("model", "unknown"),
             }
-
-    def _convert_input_to_messages(self, input_data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+
+    def _convert_input_to_messages(
+        self, input_data: Union[str, List[Dict[str, Any]]]
+    ) -> List[Dict[str, Any]]:
         """Convert input data to messages format."""
         if isinstance(input_data, str):
             # Simple string input
@@ -192,22 +189,18 @@ class ResponsesHandler:
                     if part.get("type") == "input_text":
                         content_parts.append({"type": "text", "text": part["text"]})
                     elif part.get("type") == "input_image":
-                        content_parts.append({
-                            "type": "image_url",
-                            "image_url": {"url": part["image_url"]}
-                        })
+                        content_parts.append(
+                            {"type": "image_url", "image_url": {"url": part["image_url"]}}
+                        )
                     else:
                         content_parts.append(part)
-                messages.append({
-                    "role": msg["role"],
-                    "content": content_parts
-                })
+                messages.append({"role": msg["role"], "content": content_parts})
             else:
                 messages.append(msg)
             return messages
         else:
             raise ValueError("Input must be string or list of messages")
-
+
     async def cleanup(self):
         """Clean up resources."""
         if self.computer:
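
The last two hunks show how ResponsesHandler._convert_input_to_messages normalizes the /responses input field: a bare string becomes a single user message, and input_text / input_image parts are mapped to text / image_url content parts. A standalone sketch of that conversion, reconstructed from the hunks above (the string branch is not visible in the diff, so its exact return value is assumed):

# Illustrative reimplementation of the input -> messages normalization shown in
# the hunks above; not imported from the packaged module.
from typing import Any, Dict, List, Union


def convert_input_to_messages(
    input_data: Union[str, List[Dict[str, Any]]],
) -> List[Dict[str, Any]]:
    if isinstance(input_data, str):
        # Assumed shape for the string branch (not visible in the hunk).
        return [{"role": "user", "content": input_data}]
    if isinstance(input_data, list):
        messages: List[Dict[str, Any]] = []
        for msg in input_data:
            if isinstance(msg.get("content"), list):
                content_parts: List[Dict[str, Any]] = []
                for part in msg["content"]:
                    if part.get("type") == "input_text":
                        content_parts.append({"type": "text", "text": part["text"]})
                    elif part.get("type") == "input_image":
                        # Responses-style image parts become chat-style image_url parts.
                        content_parts.append(
                            {"type": "image_url", "image_url": {"url": part["image_url"]}}
                        )
                    else:
                        content_parts.append(part)
                messages.append({"role": msg["role"], "content": content_parts})
            else:
                messages.append(msg)
        return messages
    raise ValueError("Input must be string or list of messages")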