cua_agent-0.4.34-py3-none-any.whl → cua_agent-0.4.35-py3-none-any.whl

This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the registry.

This release has been flagged as potentially problematic.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
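
The diff reproduced below covers item 4 above, agent/adapters/human_adapter.py. Nearly every hunk is mechanical formatter output: import sorting, single-to-double quote normalization, wrapping long calls with trailing commas, and stripping trailing whitespace from blank lines. The adapter's behavior is unchanged; two illustrative sketches of how it fits together follow the diff.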
--- a/agent/adapters/human_adapter.py
+++ b/agent/adapters/human_adapter.py
@@ -1,22 +1,23 @@
-import os
 import asyncio
+import os
+from typing import Any, AsyncIterator, Dict, Iterator, List
+
 import requests
-from typing import List, Dict, Any, Iterator, AsyncIterator
-from litellm.types.utils import GenericStreamingChunk, ModelResponse
+from litellm import acompletion, completion
 from litellm.llms.custom_llm import CustomLLM
-from litellm import completion, acompletion
+from litellm.types.utils import GenericStreamingChunk, ModelResponse
 
 
 class HumanAdapter(CustomLLM):
     """Human Adapter for human-in-the-loop completions.
-
+
     This adapter sends completion requests to a human completion server
     where humans can review and respond to AI requests.
     """
-
+
     def __init__(self, base_url: str | None = None, timeout: float = 300.0, **kwargs):
         """Initialize the human adapter.
-
+
         Args:
             base_url: Base URL for the human completion server.
                 Defaults to HUMAN_BASE_URL environment variable or http://localhost:8002
@@ -24,60 +25,58 @@ class HumanAdapter(CustomLLM):
             **kwargs: Additional arguments
         """
         super().__init__()
-        self.base_url = base_url or os.getenv('HUMAN_BASE_URL', 'http://localhost:8002')
+        self.base_url = base_url or os.getenv("HUMAN_BASE_URL", "http://localhost:8002")
         self.timeout = timeout
-
+
         # Ensure base_url doesn't end with slash
-        self.base_url = self.base_url.rstrip('/')
-
+        self.base_url = self.base_url.rstrip("/")
+
     def _queue_completion(self, messages: List[Dict[str, Any]], model: str) -> str:
         """Queue a completion request and return the call ID.
-
+
         Args:
             messages: Messages in OpenAI format
            model: Model name
-
+
         Returns:
             Call ID for tracking the request
-
+
         Raises:
             Exception: If queueing fails
         """
         try:
             response = requests.post(
-                f"{self.base_url}/queue",
-                json={"messages": messages, "model": model},
-                timeout=10
+                f"{self.base_url}/queue", json={"messages": messages, "model": model}, timeout=10
             )
             response.raise_for_status()
             return response.json()["id"]
         except requests.RequestException as e:
             raise Exception(f"Failed to queue completion request: {e}")
-
+
     def _wait_for_completion(self, call_id: str) -> Dict[str, Any]:
         """Wait for human to complete the call.
-
+
         Args:
             call_id: ID of the queued completion call
-
+
         Returns:
             Dict containing response and/or tool_calls
-
+
         Raises:
             TimeoutError: If timeout is exceeded
             Exception: If completion fails
         """
         import time
-
+
         start_time = time.time()
-
+
         while True:
             try:
                 # Check status
                 status_response = requests.get(f"{self.base_url}/status/{call_id}")
                 status_response.raise_for_status()
                 status_data = status_response.json()
-
+
                 if status_data["status"] == "completed":
                     result = {}
                     if "response" in status_data and status_data["response"]:
@@ -88,38 +87,41 @@ class HumanAdapter(CustomLLM):
                 elif status_data["status"] == "failed":
                     error_msg = status_data.get("error", "Unknown error")
                     raise Exception(f"Completion failed: {error_msg}")
-
+
                 # Check timeout
                 if time.time() - start_time > self.timeout:
-                    raise TimeoutError(f"Timeout waiting for human response after {self.timeout} seconds")
-
+                    raise TimeoutError(
+                        f"Timeout waiting for human response after {self.timeout} seconds"
+                    )
+
                 # Wait before checking again
                 time.sleep(1.0)
-
+
             except requests.RequestException as e:
                 if time.time() - start_time > self.timeout:
                     raise TimeoutError(f"Timeout waiting for human response: {e}")
                 # Continue trying if we haven't timed out
                 time.sleep(1.0)
-
+
     async def _async_wait_for_completion(self, call_id: str) -> Dict[str, Any]:
         """Async version of wait_for_completion.
-
+
         Args:
             call_id: ID of the queued completion call
-
+
         Returns:
             Dict containing response and/or tool_calls
-
+
         Raises:
             TimeoutError: If timeout is exceeded
             Exception: If completion fails
         """
-        import aiohttp
         import time
-
+
+        import aiohttp
+
         start_time = time.time()
-
+
         async with aiohttp.ClientSession() as session:
             while True:
                 try:
@@ -127,7 +129,7 @@ class HumanAdapter(CustomLLM):
                     async with session.get(f"{self.base_url}/status/{call_id}") as response:
                         response.raise_for_status()
                         status_data = await response.json()
-
+
                     if status_data["status"] == "completed":
                         result = {}
                         if "response" in status_data and status_data["response"]:
@@ -138,166 +140,158 @@ class HumanAdapter(CustomLLM):
                     elif status_data["status"] == "failed":
                         error_msg = status_data.get("error", "Unknown error")
                         raise Exception(f"Completion failed: {error_msg}")
-
+
                     # Check timeout
                     if time.time() - start_time > self.timeout:
-                        raise TimeoutError(f"Timeout waiting for human response after {self.timeout} seconds")
-
+                        raise TimeoutError(
+                            f"Timeout waiting for human response after {self.timeout} seconds"
+                        )
+
                     # Wait before checking again
                     await asyncio.sleep(1.0)
-
+
                 except Exception as e:
                     if time.time() - start_time > self.timeout:
                         raise TimeoutError(f"Timeout waiting for human response: {e}")
                     # Continue trying if we haven't timed out
                     await asyncio.sleep(1.0)
-
+
     def _generate_response(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:
         """Generate a human response for the given messages.
-
+
         Args:
             messages: Messages in OpenAI format
             model: Model name
-
+
         Returns:
             Dict containing response and/or tool_calls
         """
         # Queue the completion request
         call_id = self._queue_completion(messages, model)
-
+
         # Wait for human response
         response = self._wait_for_completion(call_id)
-
+
         return response
-
-    async def _async_generate_response(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:
+
+    async def _async_generate_response(
+        self, messages: List[Dict[str, Any]], model: str
+    ) -> Dict[str, Any]:
         """Async version of _generate_response.
-
+
         Args:
             messages: Messages in OpenAI format
             model: Model name
-
+
         Returns:
             Dict containing response and/or tool_calls
         """
         # Queue the completion request (sync operation)
         call_id = self._queue_completion(messages, model)
-
+
         # Wait for human response (async)
         response = await self._async_wait_for_completion(call_id)
-
+
         return response
-
+
     def completion(self, *args, **kwargs) -> ModelResponse:
         """Synchronous completion method.
-
+
         Returns:
             ModelResponse with human-generated text or tool calls
         """
-        messages = kwargs.get('messages', [])
-        model = kwargs.get('model', 'human')
-
+        messages = kwargs.get("messages", [])
+        model = kwargs.get("model", "human")
+
         # Generate human response
         human_response_data = self._generate_response(messages, model)
-
+
         # Create ModelResponse with proper structure
-        from litellm.types.utils import ModelResponse, Choices, Message
-        import uuid
         import time
-
+        import uuid
+
+        from litellm.types.utils import Choices, Message, ModelResponse
+
         # Create message content based on response type
         if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
             # Tool calls response
             message = Message(
                 role="assistant",
                 content=human_response_data.get("response", ""),
-                tool_calls=human_response_data["tool_calls"]
+                tool_calls=human_response_data["tool_calls"],
             )
         else:
             # Text response
-            message = Message(
-                role="assistant",
-                content=human_response_data.get("response", "")
-            )
-
-        choice = Choices(
-            finish_reason="stop",
-            index=0,
-            message=message
-        )
-
+            message = Message(role="assistant", content=human_response_data.get("response", ""))
+
+        choice = Choices(finish_reason="stop", index=0, message=message)
+
         result = ModelResponse(
             id=f"human-{uuid.uuid4()}",
             choices=[choice],
             created=int(time.time()),
             model=f"human/{model}",
-            object="chat.completion"
+            object="chat.completion",
         )
-
+
         return result
-
+
     async def acompletion(self, *args, **kwargs) -> ModelResponse:
         """Asynchronous completion method.
-
+
         Returns:
             ModelResponse with human-generated text or tool calls
         """
-        messages = kwargs.get('messages', [])
-        model = kwargs.get('model', 'human')
-
+        messages = kwargs.get("messages", [])
+        model = kwargs.get("model", "human")
+
         # Generate human response
         human_response_data = await self._async_generate_response(messages, model)
-
+
         # Create ModelResponse with proper structure
-        from litellm.types.utils import ModelResponse, Choices, Message
-        import uuid
         import time
-
+        import uuid
+
+        from litellm.types.utils import Choices, Message, ModelResponse
+
         # Create message content based on response type
         if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
             # Tool calls response
             message = Message(
                 role="assistant",
                 content=human_response_data.get("response", ""),
-                tool_calls=human_response_data["tool_calls"]
+                tool_calls=human_response_data["tool_calls"],
            )
         else:
             # Text response
-            message = Message(
-                role="assistant",
-                content=human_response_data.get("response", "")
-            )
-
-        choice = Choices(
-            finish_reason="stop",
-            index=0,
-            message=message
-        )
-
+            message = Message(role="assistant", content=human_response_data.get("response", ""))
+
+        choice = Choices(finish_reason="stop", index=0, message=message)
+
         result = ModelResponse(
             id=f"human-{uuid.uuid4()}",
             choices=[choice],
             created=int(time.time()),
             model=f"human/{model}",
-            object="chat.completion"
+            object="chat.completion",
         )
-
+
         return result
-
+
     def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
         """Synchronous streaming method.
-
+
         Yields:
             Streaming chunks with human-generated text or tool calls
         """
-        messages = kwargs.get('messages', [])
-        model = kwargs.get('model', 'human')
-
+        messages = kwargs.get("messages", [])
+        model = kwargs.get("model", "human")
+
         # Generate human response
         human_response_data = self._generate_response(messages, model)
-
+
         import time
-
+
         # Handle tool calls vs text response
         if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
             # Stream tool calls as a single chunk
@@ -319,22 +313,26 @@ class HumanAdapter(CustomLLM):
                 "is_finished": True,
                 "text": response_text,
                 "tool_use": None,
-                "usage": {"completion_tokens": len(response_text.split()), "prompt_tokens": 0, "total_tokens": len(response_text.split())},
+                "usage": {
+                    "completion_tokens": len(response_text.split()),
+                    "prompt_tokens": 0,
+                    "total_tokens": len(response_text.split()),
+                },
             }
             yield generic_chunk
-
+
     async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
         """Asynchronous streaming method.
-
+
         Yields:
             Streaming chunks with human-generated text or tool calls
         """
-        messages = kwargs.get('messages', [])
-        model = kwargs.get('model', 'human')
-
+        messages = kwargs.get("messages", [])
+        model = kwargs.get("model", "human")
+
         # Generate human response
         human_response = await self._async_generate_response(messages, model)
-
+
         # Return as single streaming chunk
         generic_streaming_chunk: GenericStreamingChunk = {
             "finish_reason": "stop",
@@ -342,7 +340,11 @@ class HumanAdapter(CustomLLM):
             "is_finished": True,
             "text": human_response,
             "tool_use": None,
-            "usage": {"completion_tokens": len(human_response.split()), "prompt_tokens": 0, "total_tokens": len(human_response.split())},
+            "usage": {
+                "completion_tokens": len(human_response.split()),
+                "prompt_tokens": 0,
+                "total_tokens": len(human_response.split()),
+            },
         }
-
-        yield generic_streaming_chunk
+
+        yield generic_streaming_chunk
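
Taken together, the diff pins down the adapter's entire contract with the human completion server: POST /queue with {"messages", "model"} returns a call ID, and GET /status/{call_id} is polled once a second until "status" becomes "completed" (carrying "response" and/or "tool_calls") or "failed" (carrying "error"). A minimal in-memory stub of that contract, handy for exercising the adapter without the real agent/human_tool/server.py (whose implementation may differ), might look like the sketch below; the FastAPI choice and the /respond helper route are assumptions, not part of the package:

```python
# Hypothetical stand-in for the human completion server the adapter polls.
# Only POST /queue and GET /status/{id} are taken from the diff above;
# the /respond route and FastAPI itself are illustrative choices.
import uuid
from typing import Any, Dict

from fastapi import FastAPI, HTTPException

app = FastAPI()
calls: Dict[str, Dict[str, Any]] = {}  # call_id -> status record


@app.post("/queue")
def queue_completion(payload: Dict[str, Any]) -> Dict[str, str]:
    # The adapter sends {"messages": [...], "model": "..."} and reads back "id".
    call_id = str(uuid.uuid4())
    calls[call_id] = {"status": "pending", "request": payload}
    return {"id": call_id}


@app.get("/status/{call_id}")
def get_status(call_id: str) -> Dict[str, Any]:
    # Polled by the adapter until "status" is "completed" or "failed";
    # it then reads "response" and/or "tool_calls" (or "error" on failure).
    record = calls.get(call_id)
    if record is None:
        raise HTTPException(status_code=404, detail="unknown call id")
    return record


@app.post("/respond/{call_id}")
def respond(call_id: str, payload: Dict[str, Any]) -> Dict[str, str]:
    # Hypothetical helper so a human (or a test) can resolve a pending call.
    calls[call_id].update(status="completed", response=payload.get("response", ""))
    return {"id": call_id}
```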
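On the client side, because HumanAdapter subclasses litellm's CustomLLM, it should plug into litellm's custom-provider registration. A minimal sketch under that assumption; the "human" provider prefix and the model name are illustrative, not taken from the diff:

```python
# Route completions through the adapter via litellm's custom_provider_map hook.
import litellm

from agent.adapters.human_adapter import HumanAdapter

litellm.custom_provider_map = [
    {"provider": "human", "custom_handler": HumanAdapter(base_url="http://localhost:8002")}
]

response = litellm.completion(
    model="human/operator",  # hypothetical model name; the suffix is passed through
    messages=[{"role": "user", "content": "Approve launching the deploy script?"}],
)
print(response.choices[0].message.content)
```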