cua-agent: cua_agent-0.4.22-py3-none-any.whl → cua_agent-0.7.16-py3-none-any.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/integrations/hud/proxy.py
CHANGED
|
@@ -7,26 +7,33 @@ OpenAI-like response blocks. We intentionally only support a single-step call
|
|
|
7
7
|
by consuming the first yielded result from `ComputerAgent.run()`.
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
|
-
import traceback
|
|
11
10
|
import time
|
|
11
|
+
import traceback
|
|
12
12
|
import uuid
|
|
13
13
|
from typing import Any, Dict, List, Optional
|
|
14
14
|
|
|
15
15
|
from agent.agent import ComputerAgent as BaseComputerAgent
|
|
16
|
+
from agent.callbacks import PromptInstructionsCallback
|
|
17
|
+
from hud.agents import OperatorAgent
|
|
18
|
+
from hud.tools.computer.settings import computer_settings
|
|
16
19
|
|
|
17
20
|
# OpenAI Responses typed models (required)
|
|
18
21
|
from openai.types.responses import (
|
|
19
22
|
Response,
|
|
23
|
+
ResponseComputerToolCall,
|
|
20
24
|
ResponseInputParam,
|
|
21
25
|
ResponseOutputItem,
|
|
22
|
-
ResponseComputerToolCall,
|
|
23
26
|
ResponseOutputMessage,
|
|
24
27
|
ResponseOutputText,
|
|
25
28
|
ResponseReasoningItem,
|
|
26
29
|
ResponseUsage,
|
|
27
30
|
)
|
|
31
|
+
from PIL import Image
|
|
32
|
+
|
|
28
33
|
|
|
29
|
-
def _map_agent_output_to_openai_blocks(
|
|
34
|
+
def _map_agent_output_to_openai_blocks(
|
|
35
|
+
output_items: List[Dict[str, Any]],
|
|
36
|
+
) -> List[ResponseOutputItem]:
|
|
30
37
|
"""Map our agent output items to OpenAI ResponseOutputItem typed models.
|
|
31
38
|
|
|
32
39
|
Only a subset is supported: computer_call, assistant message (text), and reasoning.
|
|
@@ -36,14 +43,16 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
|
|
|
36
43
|
for item in output_items or []:
|
|
37
44
|
t = item.get("type")
|
|
38
45
|
if t == "computer_call":
|
|
39
|
-
comp = ResponseComputerToolCall.model_validate(
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
46
|
+
comp = ResponseComputerToolCall.model_validate(
|
|
47
|
+
{
|
|
48
|
+
"id": item.get("id") or f"cu_{uuid.uuid4().hex}",
|
|
49
|
+
"type": "computer_call",
|
|
50
|
+
"call_id": item["call_id"],
|
|
51
|
+
"action": item["action"],
|
|
52
|
+
"pending_safety_checks": item.get("pending_safety_checks", []),
|
|
53
|
+
"status": "completed",
|
|
54
|
+
}
|
|
55
|
+
)
|
|
47
56
|
blocks.append(comp)
|
|
48
57
|
# we will exit early here as the responses api only supports a single step
|
|
49
58
|
break
|
|
@@ -51,31 +60,38 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
|
|
|
51
60
|
content_blocks: List[ResponseOutputText] = []
|
|
52
61
|
for c in item.get("content", []) or []:
|
|
53
62
|
content_blocks.append(
|
|
54
|
-
ResponseOutputText.model_validate(
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
63
|
+
ResponseOutputText.model_validate(
|
|
64
|
+
{
|
|
65
|
+
"type": "output_text",
|
|
66
|
+
"text": c["text"],
|
|
67
|
+
"annotations": [],
|
|
68
|
+
}
|
|
69
|
+
)
|
|
59
70
|
)
|
|
60
71
|
if content_blocks:
|
|
61
|
-
msg = ResponseOutputMessage.model_validate(
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
72
|
+
msg = ResponseOutputMessage.model_validate(
|
|
73
|
+
{
|
|
74
|
+
"id": item.get("id") or f"msg_{uuid.uuid4()}",
|
|
75
|
+
"type": "message",
|
|
76
|
+
"role": "assistant",
|
|
77
|
+
"status": "completed",
|
|
78
|
+
"content": [ct.model_dump() for ct in content_blocks],
|
|
79
|
+
}
|
|
80
|
+
)
|
|
68
81
|
blocks.append(msg)
|
|
69
82
|
elif t == "reasoning":
|
|
70
|
-
reasoning = ResponseReasoningItem.model_validate(
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
83
|
+
reasoning = ResponseReasoningItem.model_validate(
|
|
84
|
+
{
|
|
85
|
+
"id": item.get("id") or f"rsn_{uuid.uuid4()}",
|
|
86
|
+
"type": "reasoning",
|
|
87
|
+
"summary": item["summary"],
|
|
88
|
+
}
|
|
89
|
+
)
|
|
75
90
|
blocks.append(reasoning)
|
|
76
91
|
# Unhandled types are ignored
|
|
77
92
|
return blocks
|
|
78
93
|
|
|
94
|
+
|
|
79
95
|
def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
|
|
80
96
|
out: List[Dict[str, Any]] = []
|
|
81
97
|
for it in list(items):
|
|
@@ -88,6 +104,7 @@ def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
|
|
|
88
104
|
out.append(dict(it)) # may raise if not mapping
|
|
89
105
|
return out
|
|
90
106
|
|
|
107
|
+
|
|
91
108
|
class FakeAsyncOpenAI:
|
|
92
109
|
"""Minimal fake OpenAI client with only `responses.create` implemented.
|
|
93
110
|
|
|
@@ -128,10 +145,12 @@ class FakeAsyncOpenAI:
|
|
|
128
145
|
# Pre-pend instructions message
|
|
129
146
|
effective_input = full_input
|
|
130
147
|
if instructions:
|
|
131
|
-
effective_input = [
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
148
|
+
effective_input = [
|
|
149
|
+
{
|
|
150
|
+
"role": "user",
|
|
151
|
+
"content": instructions,
|
|
152
|
+
}
|
|
153
|
+
] + full_input
|
|
135
154
|
|
|
136
155
|
# Run a single iteration of the ComputerAgent
|
|
137
156
|
agent_result: Optional[Dict[str, Any]] = None
|
|
@@ -148,36 +167,131 @@ class FakeAsyncOpenAI:
|
|
|
148
167
|
blocks_to_cache = full_input + output
|
|
149
168
|
for b in blocks_to_cache:
|
|
150
169
|
bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
|
|
151
|
-
self.blocks_cache[bid] = b
|
|
170
|
+
self.blocks_cache[bid] = b # type: ignore[assignment]
|
|
152
171
|
block_ids.append(bid)
|
|
153
172
|
response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
|
|
154
173
|
self.context_cache[response_id] = block_ids
|
|
155
174
|
|
|
156
175
|
try:
|
|
157
|
-
return Response.model_validate(
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
"
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
176
|
+
return Response.model_validate(
|
|
177
|
+
{
|
|
178
|
+
"id": response_id,
|
|
179
|
+
"created_at": time.time(),
|
|
180
|
+
"object": "response",
|
|
181
|
+
"model": model,
|
|
182
|
+
"output": output,
|
|
183
|
+
"parallel_tool_calls": False,
|
|
184
|
+
"tool_choice": "auto",
|
|
185
|
+
"tools": [],
|
|
186
|
+
"previous_response_id": previous_response_id,
|
|
187
|
+
"usage": ResponseUsage.model_validate(
|
|
188
|
+
{
|
|
189
|
+
"input_tokens": usage.get("input_tokens", 0),
|
|
190
|
+
"output_tokens": usage.get("output_tokens", 0),
|
|
191
|
+
"total_tokens": usage.get("total_tokens", 0),
|
|
192
|
+
"input_tokens_details": usage.get(
|
|
193
|
+
"input_tokens_details", {"cached_tokens": 0}
|
|
194
|
+
),
|
|
195
|
+
"output_tokens_details": usage.get(
|
|
196
|
+
"output_tokens_details", {"reasoning_tokens": 0}
|
|
197
|
+
),
|
|
198
|
+
}
|
|
199
|
+
),
|
|
200
|
+
}
|
|
201
|
+
)
|
|
175
202
|
except Exception as e:
|
|
176
|
-
print(
|
|
203
|
+
print(
|
|
204
|
+
f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ",
|
|
205
|
+
e,
|
|
206
|
+
)
|
|
177
207
|
if attempt == max_retries - 1:
|
|
178
208
|
print(traceback.format_exc())
|
|
179
209
|
raise e
|
|
180
210
|
|
|
211
|
+
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
# Proxy OperatorAgent (moved from __init__.py)
|
|
214
|
+
# ---------------------------------------------------------------------------
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class ProxyOperatorAgent(OperatorAgent):
|
|
218
|
+
"""OperatorAgent that proxies model calls through our ComputerAgent.
|
|
219
|
+
|
|
220
|
+
Accepts the same config keys we pass via hud.run_dataset `agent_config`:
|
|
221
|
+
- model: str | None
|
|
222
|
+
- allowed_tools: list[str] | None
|
|
223
|
+
Additional kwargs are forwarded to OperatorAgent (if any are supported).
|
|
224
|
+
"""
|
|
225
|
+
|
|
226
|
+
def __init__(
|
|
227
|
+
self,
|
|
228
|
+
*,
|
|
229
|
+
model: str | None = None,
|
|
230
|
+
allowed_tools: list[str] | None = None,
|
|
231
|
+
trajectory_dir: str | dict | None = None,
|
|
232
|
+
# === ComputerAgent kwargs ===
|
|
233
|
+
tools: list[Any] | None = None,
|
|
234
|
+
custom_loop: Any | None = None,
|
|
235
|
+
only_n_most_recent_images: int | None = None,
|
|
236
|
+
callbacks: list[Any] | None = None,
|
|
237
|
+
instructions: str | None = None,
|
|
238
|
+
verbosity: int | None = None,
|
|
239
|
+
max_retries: int | None = 3,
|
|
240
|
+
screenshot_delay: float | int = 0.5,
|
|
241
|
+
use_prompt_caching: bool | None = False,
|
|
242
|
+
max_trajectory_budget: float | dict | None = None,
|
|
243
|
+
telemetry_enabled: bool | None = True,
|
|
244
|
+
**kwargs: Any,
|
|
245
|
+
) -> None:
|
|
246
|
+
model = model or "computer-use-preview"
|
|
247
|
+
allowed_tools = allowed_tools or ["openai_computer"]
|
|
248
|
+
|
|
249
|
+
computer_shim = {
|
|
250
|
+
"screenshot": lambda: Image.new(
|
|
251
|
+
"RGB",
|
|
252
|
+
(computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT),
|
|
253
|
+
),
|
|
254
|
+
"environment": "linux",
|
|
255
|
+
"dimensions": (
|
|
256
|
+
computer_settings.OPENAI_COMPUTER_WIDTH,
|
|
257
|
+
computer_settings.OPENAI_COMPUTER_HEIGHT,
|
|
258
|
+
),
|
|
259
|
+
}
|
|
260
|
+
# Build tools ensuring the computer_shim is included
|
|
261
|
+
agent_tools: list[Any] = [computer_shim]
|
|
262
|
+
if tools:
|
|
263
|
+
agent_tools.extend(tools)
|
|
264
|
+
|
|
265
|
+
# Build callbacks, injecting prompt instructions if provided
|
|
266
|
+
agent_callbacks = list(callbacks or [])
|
|
267
|
+
if instructions:
|
|
268
|
+
agent_callbacks.append(PromptInstructionsCallback(instructions))
|
|
269
|
+
|
|
270
|
+
computer_agent = BaseComputerAgent(
|
|
271
|
+
model=model,
|
|
272
|
+
tools=agent_tools,
|
|
273
|
+
custom_loop=custom_loop,
|
|
274
|
+
only_n_most_recent_images=only_n_most_recent_images,
|
|
275
|
+
callbacks=agent_callbacks,
|
|
276
|
+
verbosity=verbosity,
|
|
277
|
+
trajectory_dir=trajectory_dir,
|
|
278
|
+
max_retries=max_retries,
|
|
279
|
+
screenshot_delay=screenshot_delay,
|
|
280
|
+
use_prompt_caching=use_prompt_caching,
|
|
281
|
+
max_trajectory_budget=max_trajectory_budget,
|
|
282
|
+
telemetry_enabled=telemetry_enabled,
|
|
283
|
+
)
|
|
284
|
+
model_client = FakeAsyncOpenAI(computer_agent)
|
|
285
|
+
|
|
286
|
+
super().__init__(
|
|
287
|
+
model_client=model_client, # type: ignore[arg-type]
|
|
288
|
+
model=model,
|
|
289
|
+
allowed_tools=allowed_tools,
|
|
290
|
+
**kwargs,
|
|
291
|
+
)
|
|
292
|
+
|
|
293
|
+
|
|
181
294
|
__all__ = [
|
|
182
295
|
"FakeAsyncOpenAI",
|
|
296
|
+
"ProxyOperatorAgent",
|
|
183
297
|
]
|
agent/loops/__init__.py
CHANGED
|
@@ -1,14 +1,44 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Agent loops for agent
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
# Import the loops to register them
|
|
6
|
-
from . import
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
1
|
+
"""
|
|
2
|
+
Agent loops for agent
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# Import the loops to register them
|
|
6
|
+
from . import (
|
|
7
|
+
anthropic,
|
|
8
|
+
composed_grounded,
|
|
9
|
+
fara,
|
|
10
|
+
gelato,
|
|
11
|
+
gemini,
|
|
12
|
+
generic_vlm,
|
|
13
|
+
glm45v,
|
|
14
|
+
gta1,
|
|
15
|
+
holo,
|
|
16
|
+
internvl,
|
|
17
|
+
moondream3,
|
|
18
|
+
omniparser,
|
|
19
|
+
openai,
|
|
20
|
+
opencua,
|
|
21
|
+
uiins,
|
|
22
|
+
uitars,
|
|
23
|
+
uitars2,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"anthropic",
|
|
28
|
+
"composed_grounded",
|
|
29
|
+
"gelato",
|
|
30
|
+
"gemini",
|
|
31
|
+
"generic_vlm",
|
|
32
|
+
"fara",
|
|
33
|
+
"glm45v",
|
|
34
|
+
"gta1",
|
|
35
|
+
"holo",
|
|
36
|
+
"internvl",
|
|
37
|
+
"moondream3",
|
|
38
|
+
"omniparser",
|
|
39
|
+
"openai",
|
|
40
|
+
"opencua",
|
|
41
|
+
"uiins",
|
|
42
|
+
"uitars",
|
|
43
|
+
"uitars2",
|
|
44
|
+
]
|