cua-agent 0.4.33__py3-none-any.whl → 0.4.35__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +26 -17
- agent/computers/cua.py +27 -23
- agent/computers/custom.py +72 -69
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +49 -20
- agent/loops/omniparser.py +212 -209
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +475 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/METADATA +22 -10
- cua_agent-0.4.35.dist-info/RECORD +64 -0
- cua_agent-0.4.33.dist-info/RECORD +0 -63
- {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/adapters/human_adapter.py
CHANGED
@@ -1,22 +1,23 @@
-import os
 import asyncio
+import os
+from typing import Any, AsyncIterator, Dict, Iterator, List
+
 import requests
-from
-from litellm.types.utils import GenericStreamingChunk, ModelResponse
+from litellm import acompletion, completion
 from litellm.llms.custom_llm import CustomLLM
-from litellm import
+from litellm.types.utils import GenericStreamingChunk, ModelResponse
 
 
 class HumanAdapter(CustomLLM):
     """Human Adapter for human-in-the-loop completions.
-
+
     This adapter sends completion requests to a human completion server
     where humans can review and respond to AI requests.
     """
-
+
     def __init__(self, base_url: str | None = None, timeout: float = 300.0, **kwargs):
         """Initialize the human adapter.
-
+
         Args:
             base_url: Base URL for the human completion server.
                 Defaults to HUMAN_BASE_URL environment variable or http://localhost:8002
@@ -24,60 +25,58 @@ class HumanAdapter(CustomLLM):
             **kwargs: Additional arguments
         """
         super().__init__()
-        self.base_url = base_url or os.getenv(
+        self.base_url = base_url or os.getenv("HUMAN_BASE_URL", "http://localhost:8002")
         self.timeout = timeout
-
+
         # Ensure base_url doesn't end with slash
-        self.base_url = self.base_url.rstrip(
-
+        self.base_url = self.base_url.rstrip("/")
+
     def _queue_completion(self, messages: List[Dict[str, Any]], model: str) -> str:
         """Queue a completion request and return the call ID.
-
+
         Args:
             messages: Messages in OpenAI format
            model: Model name
-
+
        Returns:
            Call ID for tracking the request
-
+
        Raises:
            Exception: If queueing fails
        """
        try:
            response = requests.post(
-                f"{self.base_url}/queue",
-                json={"messages": messages, "model": model},
-                timeout=10
+                f"{self.base_url}/queue", json={"messages": messages, "model": model}, timeout=10
            )
            response.raise_for_status()
            return response.json()["id"]
        except requests.RequestException as e:
            raise Exception(f"Failed to queue completion request: {e}")
-
+
    def _wait_for_completion(self, call_id: str) -> Dict[str, Any]:
        """Wait for human to complete the call.
-
+
        Args:
            call_id: ID of the queued completion call
-
+
        Returns:
            Dict containing response and/or tool_calls
-
+
        Raises:
            TimeoutError: If timeout is exceeded
            Exception: If completion fails
        """
        import time
-
+
        start_time = time.time()
-
+
        while True:
            try:
                # Check status
                status_response = requests.get(f"{self.base_url}/status/{call_id}")
                status_response.raise_for_status()
                status_data = status_response.json()
-
+
                if status_data["status"] == "completed":
                    result = {}
                    if "response" in status_data and status_data["response"]:
@@ -88,38 +87,41 @@ class HumanAdapter(CustomLLM):
                elif status_data["status"] == "failed":
                    error_msg = status_data.get("error", "Unknown error")
                    raise Exception(f"Completion failed: {error_msg}")
-
+
                # Check timeout
                if time.time() - start_time > self.timeout:
-                    raise TimeoutError(
-
+                    raise TimeoutError(
+                        f"Timeout waiting for human response after {self.timeout} seconds"
+                    )
+
                # Wait before checking again
                time.sleep(1.0)
-
+
            except requests.RequestException as e:
                if time.time() - start_time > self.timeout:
                    raise TimeoutError(f"Timeout waiting for human response: {e}")
                # Continue trying if we haven't timed out
                time.sleep(1.0)
-
+
    async def _async_wait_for_completion(self, call_id: str) -> Dict[str, Any]:
        """Async version of wait_for_completion.
-
+
        Args:
            call_id: ID of the queued completion call
-
+
        Returns:
            Dict containing response and/or tool_calls
-
+
        Raises:
            TimeoutError: If timeout is exceeded
            Exception: If completion fails
        """
-        import aiohttp
        import time
-
+
+        import aiohttp
+
        start_time = time.time()
-
+
        async with aiohttp.ClientSession() as session:
            while True:
                try:
@@ -127,7 +129,7 @@ class HumanAdapter(CustomLLM):
                    async with session.get(f"{self.base_url}/status/{call_id}") as response:
                        response.raise_for_status()
                        status_data = await response.json()
-
+
                        if status_data["status"] == "completed":
                            result = {}
                            if "response" in status_data and status_data["response"]:
@@ -138,166 +140,158 @@ class HumanAdapter(CustomLLM):
                        elif status_data["status"] == "failed":
                            error_msg = status_data.get("error", "Unknown error")
                            raise Exception(f"Completion failed: {error_msg}")
-
+
                        # Check timeout
                        if time.time() - start_time > self.timeout:
-                            raise TimeoutError(
-
+                            raise TimeoutError(
+                                f"Timeout waiting for human response after {self.timeout} seconds"
+                            )
+
                        # Wait before checking again
                        await asyncio.sleep(1.0)
-
+
                except Exception as e:
                    if time.time() - start_time > self.timeout:
                        raise TimeoutError(f"Timeout waiting for human response: {e}")
                    # Continue trying if we haven't timed out
                    await asyncio.sleep(1.0)
-
+
    def _generate_response(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:
        """Generate a human response for the given messages.
-
+
        Args:
            messages: Messages in OpenAI format
            model: Model name
-
+
        Returns:
            Dict containing response and/or tool_calls
        """
        # Queue the completion request
        call_id = self._queue_completion(messages, model)
-
+
        # Wait for human response
        response = self._wait_for_completion(call_id)
-
+
        return response
-
-    async def _async_generate_response(
+
+    async def _async_generate_response(
+        self, messages: List[Dict[str, Any]], model: str
+    ) -> Dict[str, Any]:
        """Async version of _generate_response.
-
+
        Args:
            messages: Messages in OpenAI format
            model: Model name
-
+
        Returns:
            Dict containing response and/or tool_calls
        """
        # Queue the completion request (sync operation)
        call_id = self._queue_completion(messages, model)
-
+
        # Wait for human response (async)
        response = await self._async_wait_for_completion(call_id)
-
+
        return response
-
+
    def completion(self, *args, **kwargs) -> ModelResponse:
        """Synchronous completion method.
-
+
        Returns:
            ModelResponse with human-generated text or tool calls
        """
-        messages = kwargs.get(
-        model = kwargs.get(
-
+        messages = kwargs.get("messages", [])
+        model = kwargs.get("model", "human")
+
        # Generate human response
        human_response_data = self._generate_response(messages, model)
-
+
        # Create ModelResponse with proper structure
-        from litellm.types.utils import ModelResponse, Choices, Message
-        import uuid
        import time
-
+        import uuid
+
+        from litellm.types.utils import Choices, Message, ModelResponse
+
        # Create message content based on response type
        if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
            # Tool calls response
            message = Message(
                role="assistant",
                content=human_response_data.get("response", ""),
-                tool_calls=human_response_data["tool_calls"]
+                tool_calls=human_response_data["tool_calls"],
            )
        else:
            # Text response
-            message = Message(
-
-
-
-
-        choice = Choices(
-            finish_reason="stop",
-            index=0,
-            message=message
-        )
-
+            message = Message(role="assistant", content=human_response_data.get("response", ""))
+
+        choice = Choices(finish_reason="stop", index=0, message=message)
+
        result = ModelResponse(
            id=f"human-{uuid.uuid4()}",
            choices=[choice],
            created=int(time.time()),
            model=f"human/{model}",
-            object="chat.completion"
+            object="chat.completion",
        )
-
+
        return result
-
+
    async def acompletion(self, *args, **kwargs) -> ModelResponse:
        """Asynchronous completion method.
-
+
        Returns:
            ModelResponse with human-generated text or tool calls
        """
-        messages = kwargs.get(
-        model = kwargs.get(
-
+        messages = kwargs.get("messages", [])
+        model = kwargs.get("model", "human")
+
        # Generate human response
        human_response_data = await self._async_generate_response(messages, model)
-
+
        # Create ModelResponse with proper structure
-        from litellm.types.utils import ModelResponse, Choices, Message
-        import uuid
        import time
-
+        import uuid
+
+        from litellm.types.utils import Choices, Message, ModelResponse
+
        # Create message content based on response type
        if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
            # Tool calls response
            message = Message(
                role="assistant",
                content=human_response_data.get("response", ""),
-                tool_calls=human_response_data["tool_calls"]
+                tool_calls=human_response_data["tool_calls"],
            )
        else:
            # Text response
-            message = Message(
-
-
-
-
-        choice = Choices(
-            finish_reason="stop",
-            index=0,
-            message=message
-        )
-
+            message = Message(role="assistant", content=human_response_data.get("response", ""))
+
+        choice = Choices(finish_reason="stop", index=0, message=message)
+
        result = ModelResponse(
            id=f"human-{uuid.uuid4()}",
            choices=[choice],
            created=int(time.time()),
            model=f"human/{model}",
-            object="chat.completion"
+            object="chat.completion",
        )
-
+
        return result
-
+
    def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
        """Synchronous streaming method.
-
+
        Yields:
            Streaming chunks with human-generated text or tool calls
        """
-        messages = kwargs.get(
-        model = kwargs.get(
-
+        messages = kwargs.get("messages", [])
+        model = kwargs.get("model", "human")
+
        # Generate human response
        human_response_data = self._generate_response(messages, model)
-
+
        import time
-
+
        # Handle tool calls vs text response
        if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
            # Stream tool calls as a single chunk
@@ -319,22 +313,26 @@ class HumanAdapter(CustomLLM):
            "is_finished": True,
            "text": response_text,
            "tool_use": None,
-            "usage": {
+            "usage": {
+                "completion_tokens": len(response_text.split()),
+                "prompt_tokens": 0,
+                "total_tokens": len(response_text.split()),
+            },
        }
        yield generic_chunk
-
+
    async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
        """Asynchronous streaming method.
-
+
        Yields:
            Streaming chunks with human-generated text or tool calls
        """
-        messages = kwargs.get(
-        model = kwargs.get(
-
+        messages = kwargs.get("messages", [])
+        model = kwargs.get("model", "human")
+
        # Generate human response
        human_response = await self._async_generate_response(messages, model)
-
+
        # Return as single streaming chunk
        generic_streaming_chunk: GenericStreamingChunk = {
            "finish_reason": "stop",
@@ -342,7 +340,11 @@ class HumanAdapter(CustomLLM):
            "is_finished": True,
            "text": human_response,
            "tool_use": None,
-            "usage": {
+            "usage": {
+                "completion_tokens": len(human_response.split()),
+                "prompt_tokens": 0,
+                "total_tokens": len(human_response.split()),
+            },
        }
-
-        yield generic_streaming_chunk
+
+        yield generic_streaming_chunk