cua-agent 0.4.28__py3-none-any.whl → 0.4.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/integrations/hud/__init__.py +14 -97
- agent/integrations/hud/agent.py +351 -0
- agent/integrations/hud/proxy.py +81 -0
- {cua_agent-0.4.28.dist-info → cua_agent-0.4.30.dist-info}/METADATA +3 -3
- {cua_agent-0.4.28.dist-info → cua_agent-0.4.30.dist-info}/RECORD +7 -6
- {cua_agent-0.4.28.dist-info → cua_agent-0.4.30.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.28.dist-info → cua_agent-0.4.30.dist-info}/entry_points.txt +0 -0
|
@@ -1,102 +1,21 @@
|
|
|
1
|
-
"""HUD integration:
|
|
1
|
+
"""HUD integration: dataset runners and MCP-based computer agent export.
|
|
2
2
|
|
|
3
|
-
This module exposes
|
|
4
|
-
|
|
5
|
-
`FakeAsyncOpenAI` (see `agent/integrations/hud/agent.py`).
|
|
3
|
+
This module exposes helpers to evaluate HUD-compatible datasets and exports
|
|
4
|
+
the MCP-compatible computer agent implementation.
|
|
6
5
|
|
|
7
6
|
Exports:
|
|
8
|
-
- run_single_task(
|
|
9
|
-
- run_full_dataset(
|
|
7
|
+
- run_single_task(dataset, ...)
|
|
8
|
+
- run_full_dataset(dataset, ...)
|
|
9
|
+
- MCPComputerAgent
|
|
10
10
|
"""
|
|
11
11
|
import time
|
|
12
12
|
from typing import Any, Optional
|
|
13
13
|
|
|
14
|
-
from PIL import Image
|
|
15
14
|
from datasets import load_dataset, Dataset
|
|
16
|
-
from hud.agents import OperatorAgent
|
|
17
15
|
from hud.datasets import Task, run_dataset
|
|
18
|
-
from hud.tools.computer.settings import computer_settings
|
|
19
16
|
from hud import trace
|
|
20
17
|
|
|
21
|
-
from agent.agent import ComputerAgent as BaseComputerAgent
|
|
22
|
-
from .proxy import FakeAsyncOpenAI
|
|
23
|
-
from agent.callbacks import PromptInstructionsCallback
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
# ---------------------------------------------------------------------------
|
|
27
|
-
# Proxy OperatorAgent
|
|
28
|
-
# ---------------------------------------------------------------------------
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
class ProxyOperatorAgent(OperatorAgent):
|
|
32
|
-
"""OperatorAgent that proxies model calls through our ComputerAgent.
|
|
33
|
-
|
|
34
|
-
Accepts the same config keys we pass via hud.run_dataset `agent_config`:
|
|
35
|
-
- model: str | None
|
|
36
|
-
- allowed_tools: list[str] | None
|
|
37
|
-
Additional kwargs are forwarded to OperatorAgent (if any are supported).
|
|
38
|
-
"""
|
|
39
|
-
|
|
40
|
-
def __init__(
|
|
41
|
-
self,
|
|
42
|
-
*,
|
|
43
|
-
model: str | None = None,
|
|
44
|
-
allowed_tools: list[str] | None = None,
|
|
45
|
-
trajectory_dir: str | dict | None = None,
|
|
46
|
-
# === ComputerAgent kwargs ===
|
|
47
|
-
tools: list[Any] | None = None,
|
|
48
|
-
custom_loop: Any | None = None,
|
|
49
|
-
only_n_most_recent_images: int | None = None,
|
|
50
|
-
callbacks: list[Any] | None = None,
|
|
51
|
-
instructions: str | None = None,
|
|
52
|
-
verbosity: int | None = None,
|
|
53
|
-
max_retries: int | None = 3,
|
|
54
|
-
screenshot_delay: float | int = 0.5,
|
|
55
|
-
use_prompt_caching: bool | None = False,
|
|
56
|
-
max_trajectory_budget: float | dict | None = None,
|
|
57
|
-
telemetry_enabled: bool | None = True,
|
|
58
|
-
**kwargs: Any,
|
|
59
|
-
) -> None:
|
|
60
|
-
model = model or "computer-use-preview"
|
|
61
|
-
allowed_tools = allowed_tools or ["openai_computer"]
|
|
62
|
-
|
|
63
|
-
computer_shim = {
|
|
64
|
-
'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
|
|
65
|
-
'environment': 'linux',
|
|
66
|
-
'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
|
|
67
|
-
}
|
|
68
|
-
# Build tools ensuring the computer_shim is included
|
|
69
|
-
agent_tools: list[Any] = [computer_shim]
|
|
70
|
-
if tools:
|
|
71
|
-
agent_tools.extend(tools)
|
|
72
|
-
|
|
73
|
-
# Build callbacks, injecting prompt instructions if provided
|
|
74
|
-
agent_callbacks = list(callbacks or [])
|
|
75
|
-
if instructions:
|
|
76
|
-
agent_callbacks.append(PromptInstructionsCallback(instructions))
|
|
77
|
-
|
|
78
|
-
computer_agent = BaseComputerAgent(
|
|
79
|
-
model=model,
|
|
80
|
-
tools=agent_tools,
|
|
81
|
-
custom_loop=custom_loop,
|
|
82
|
-
only_n_most_recent_images=only_n_most_recent_images,
|
|
83
|
-
callbacks=agent_callbacks,
|
|
84
|
-
verbosity=verbosity,
|
|
85
|
-
trajectory_dir=trajectory_dir,
|
|
86
|
-
max_retries=max_retries,
|
|
87
|
-
screenshot_delay=screenshot_delay,
|
|
88
|
-
use_prompt_caching=use_prompt_caching,
|
|
89
|
-
max_trajectory_budget=max_trajectory_budget,
|
|
90
|
-
telemetry_enabled=telemetry_enabled,
|
|
91
|
-
)
|
|
92
|
-
model_client = FakeAsyncOpenAI(computer_agent)
|
|
93
|
-
|
|
94
|
-
super().__init__(
|
|
95
|
-
model_client=model_client, # type: ignore[arg-type]
|
|
96
|
-
model=model,
|
|
97
|
-
allowed_tools=allowed_tools,
|
|
98
|
-
**kwargs,
|
|
99
|
-
)
|
|
18
|
+
from .agent import MCPComputerAgent
|
|
100
19
|
|
|
101
20
|
|
|
102
21
|
# ---------------------------------------------------------------------------
|
|
@@ -123,7 +42,7 @@ async def run_single_task(
|
|
|
123
42
|
max_trajectory_budget: float | dict | None = None,
|
|
124
43
|
telemetry_enabled: bool | None = True,
|
|
125
44
|
) -> None:
|
|
126
|
-
"""Load one task from the dataset and execute it with
|
|
45
|
+
"""Load one task from the dataset and execute it with MCPComputerAgent."""
|
|
127
46
|
|
|
128
47
|
# Load dataset and pick a sample
|
|
129
48
|
if isinstance(dataset, str):
|
|
@@ -139,9 +58,9 @@ async def run_single_task(
|
|
|
139
58
|
with trace(name=task_prompt):
|
|
140
59
|
task = Task(**sample_task) # type: ignore[arg-type]
|
|
141
60
|
|
|
142
|
-
agent = ProxyOperatorAgent(
|
|
143
|
-
model=model,
|
|
144
|
-
allowed_tools=allowed_tools,
|
|
61
|
+
agent = MCPComputerAgent(
|
|
62
|
+
model=model or "computer-use-preview",
|
|
63
|
+
allowed_tools=allowed_tools or ["openai_computer"],
|
|
145
64
|
# === ComputerAgent kwargs passthrough ===
|
|
146
65
|
tools=tools,
|
|
147
66
|
custom_loop=custom_loop,
|
|
@@ -190,9 +109,7 @@ async def run_full_dataset(
|
|
|
190
109
|
) -> list[Any]:
|
|
191
110
|
"""Run evaluation across the entire dataset using hud.datasets.run_dataset."""
|
|
192
111
|
|
|
193
|
-
#
|
|
194
|
-
# FakeAsyncOpenAI per agent instantiation.
|
|
195
|
-
|
|
112
|
+
# Run with our MCP-based agent class.
|
|
196
113
|
if isinstance(dataset, str):
|
|
197
114
|
dataset_name = dataset.split('/')[-1]
|
|
198
115
|
job_name = job_name or f"Evaluation {dataset_name}"
|
|
@@ -205,7 +122,7 @@ async def run_full_dataset(
|
|
|
205
122
|
return await run_dataset(
|
|
206
123
|
name=job_name,
|
|
207
124
|
dataset=dataset,
|
|
208
|
-
agent_class=ProxyOperatorAgent,
|
|
125
|
+
agent_class=MCPComputerAgent,
|
|
209
126
|
agent_config={
|
|
210
127
|
"model": model,
|
|
211
128
|
"allowed_tools": allowed_tools,
|
|
@@ -233,5 +150,5 @@ async def run_full_dataset(
|
|
|
233
150
|
__all__ = [
|
|
234
151
|
"run_single_task",
|
|
235
152
|
"run_full_dataset",
|
|
236
|
-
"ProxyOperatorAgent",
|
|
153
|
+
"MCPComputerAgent",
|
|
237
154
|
]
|
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
"""MCP-compatible Computer Agent for HUD integration.
|
|
2
|
+
|
|
3
|
+
This agent subclasses HUD's MCPAgent and delegates planning/execution to
|
|
4
|
+
our core ComputerAgent while using the Agent SDK's plain-dict message
|
|
5
|
+
format documented in `docs/content/docs/agent-sdk/message-format.mdx`.
|
|
6
|
+
|
|
7
|
+
Key differences from the OpenAI OperatorAgent variant:
|
|
8
|
+
- No OpenAI types are used; everything is standard Python dicts.
|
|
9
|
+
- Planning is executed via `ComputerAgent.run(messages)`.
|
|
10
|
+
- The first yielded result per step is returned as the agent response.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import io
|
|
15
|
+
from typing import Any, ClassVar, Optional
|
|
16
|
+
|
|
17
|
+
from agent.agent import ComputerAgent as BaseComputerAgent
|
|
18
|
+
from agent.callbacks import PromptInstructionsCallback
|
|
19
|
+
from agent.callbacks.trajectory_saver import TrajectorySaverCallback
|
|
20
|
+
from hud.agents import MCPAgent
|
|
21
|
+
from hud.tools.computer.settings import computer_settings
|
|
22
|
+
from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
|
|
23
|
+
|
|
24
|
+
from agent.responses import make_failed_tool_call_items
|
|
25
|
+
from agent.computers import is_agent_computer
|
|
26
|
+
from PIL import Image
|
|
27
|
+
import mcp.types as types
|
|
28
|
+
import hud
|
|
29
|
+
import uuid
|
|
30
|
+
import base64
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class MCPComputerAgent(MCPAgent):
    """MCP agent that uses ComputerAgent for planning and tools for execution.

    Planning is delegated to a wrapped ``ComputerAgent`` (``self.computer_agent``).
    Computer calls it proposes are handed back to the caller as ``MCPToolCall``
    objects, and the corresponding tool results are converted back into plain
    message dicts by ``format_tool_results``.

    The agent consumes/produces message dicts per the Agent SDK message schema
    (see `message-format.mdx`).
    """

    # Display size used for the placeholder screenshot and for the
    # "dimensions" entry of the computer shim built in __init__.
    metadata: ClassVar[dict[str, Any]] = {
        "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
        "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
    }

    required_tools: ClassVar[list[str]] = ["openai_computer"]

    def __init__(
        self,
        *,
        model: str | None = None,
        allowed_tools: list[str] | None = None,
        trajectory_dir: str | Path | dict | None = None,
        # === ComputerAgent kwargs ===
        tools: list[Any] | None = None,
        custom_loop: Any | None = None,
        only_n_most_recent_images: int | None = None,
        callbacks: list[Any] | None = None,
        instructions: str | None = None,
        verbosity: int | None = None,
        max_retries: int | None = 3,
        screenshot_delay: float | int = 0.5,
        use_prompt_caching: bool | None = False,
        max_trajectory_budget: float | dict | None = None,
        telemetry_enabled: bool | None = True,
        environment: str = "linux",
        **kwargs: Any,
    ) -> None:
        """Build the wrapped ComputerAgent and the state shared between steps.

        Args:
            model: Model identifier passed to ComputerAgent. Required.
            allowed_tools: Stored on ``self.allowed_tools``; defaults to
                ``["openai_computer"]`` when falsy.
            trajectory_dir: A path (``str``/``Path``) or a config dict. Paths
                are wrapped as ``{"trajectory_dir": <path>}``; dict configs
                get ``reset_on_run=False`` before being passed on.
            tools: Extra ComputerAgent tools. Entries for which
                ``is_agent_computer`` is true are dropped because the internal
                computer shim takes their place.
            instructions: Extra text appended to the system prompt.
            environment: Value reported by the computer shim.
            **kwargs: Forwarded unchanged to ``MCPAgent.__init__``.

            The remaining keyword arguments are forwarded unchanged to
            ``ComputerAgent``.

        Raises:
            ValueError: If ``model`` is ``None``.
        """
        # Fail fast, before the base class performs any setup.
        if model is None:
            raise ValueError("MCPComputerAgent requires a model to be specified.")

        # NOTE(review): assigned before super().__init__(); confirm that the
        # base initializer does not reset `allowed_tools`.
        self.allowed_tools = allowed_tools or ["openai_computer"]
        super().__init__(**kwargs)

        self.model = model
        self.environment = environment

        # Update model name for HUD logging
        self.model_name = "cua-" + self.model

        # Stateful tracking of tool call inputs: call_id -> items emitted in
        # the step that produced the call.
        self.tool_call_inputs: dict[str, list[dict[str, Any]]] = {}
        self.previous_output: list[dict[str, Any]] = []

        # Build system prompt
        operator_instructions = """
        You are an autonomous computer-using agent. Follow these guidelines:

        1. NEVER ask for confirmation. Complete all tasks autonomously.
        2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
        3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
        4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
        5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
        6. The user has already given you permission by running this agent. No further confirmation is needed.
        7. Be decisive and action-oriented. Complete the requested task fully.

        Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
        """.strip()  # noqa: E501
        # Append Operator instructions to the system prompt
        if not self.system_prompt:
            self.system_prompt = operator_instructions
        else:
            self.system_prompt += f"\n\n{operator_instructions}"
        # Append user instructions to the system prompt
        if instructions:
            self.system_prompt += f"\n\n{instructions}"

        # Configure trajectory_dir for HUD
        if isinstance(trajectory_dir, (str, Path)):
            trajectory_dir = {"trajectory_dir": str(trajectory_dir)}
        if isinstance(trajectory_dir, dict):
            # Copy instead of writing into the caller's dict: the same config
            # object may be reused to construct several agents.
            trajectory_dir = {**trajectory_dir, "reset_on_run": False}

        # Seed the "last screenshot" with a blank image of the configured
        # display size so the shim can answer before any real screenshot exists.
        buffer = io.BytesIO()
        Image.new(
            'RGB',
            (self.metadata["display_width"], self.metadata["display_height"]),
        ).save(buffer, format='PNG')
        self.last_screenshot_b64: str = base64.b64encode(buffer.getvalue()).decode('utf-8')

        # Ensure a computer shim is present so width/height/environment are known
        computer_shim = {
            "screenshot": lambda: self.last_screenshot_b64,
            "environment": self.environment,
            "dimensions": (
                self.metadata["display_width"],
                self.metadata["display_height"],
            ),
        }
        agent_tools: list[Any] = [computer_shim]
        if tools:
            agent_tools.extend(
                tool for tool in tools if not is_agent_computer(tool)
            )

        self.computer_agent = BaseComputerAgent(
            model=self.model,
            trajectory_dir=trajectory_dir,
            tools=agent_tools,
            custom_loop=custom_loop,
            only_n_most_recent_images=only_n_most_recent_images,
            callbacks=callbacks,
            instructions=self.system_prompt,
            verbosity=verbosity,
            max_retries=max_retries,
            screenshot_delay=screenshot_delay,
            use_prompt_caching=use_prompt_caching,
            max_trajectory_budget=max_trajectory_budget,
            telemetry_enabled=telemetry_enabled,
        )

    async def get_system_messages(self) -> list[Any]:
        """Create initial messages.

        Unused - ComputerAgent handles this with the 'instructions' parameter.
        """
        return []

    async def format_blocks(
        self, blocks: list[types.ContentBlock]
    ) -> list[dict[str, Any]]:
        """Wrap MCP content blocks into a single user message dict.

        TextContent blocks become ``input_text`` parts and ImageContent blocks
        become ``input_image`` parts (as data URLs). Other block types are
        ignored. Each image seen also replaces ``self.last_screenshot_b64``.
        """
        formatted: list[dict[str, Any]] = []
        for block in blocks:
            if isinstance(block, types.TextContent):
                formatted.append({"type": "input_text", "text": block.text})
            elif isinstance(block, types.ImageContent):
                mime_type = getattr(block, "mimeType", "image/png")
                formatted.append(
                    {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
                )
                self.last_screenshot_b64 = block.data
        return [{"role": "user", "content": formatted}]

    @hud.instrument(
        span_type="agent",
        record_args=False,  # Messages can be large
        record_result=True,
    )
    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
        """Get a single-step response by delegating to ComputerAgent.run.

        Results yielded by ``ComputerAgent.run(messages)`` are consumed until
        one is empty or the first ``computer_call`` item is seen. Reasoning
        summaries and message text collected up to that point are joined with
        newlines into the response content.

        Returns:
            An ``AgentResponse`` whose ``tool_calls`` holds at most one
            ``openai_computer`` call and whose ``done`` flag is ``True`` only
            when no computer call was produced.
        """
        tool_calls: list[MCPToolCall] = []
        output_text: list[str] = []
        is_done: bool = True

        agent_result: list[dict[str, Any]] = []

        # Call the ComputerAgent LLM API
        async for result in self.computer_agent.run(messages):  # type: ignore[arg-type]
            items = result['output']
            if not items:
                break

            for item in items:
                item_type = item['type']
                if item_type in ('reasoning', 'message', 'computer_call', 'function_call', 'function_call_output'):
                    agent_result.append(item)

                # Add messages to output text
                if item_type == 'reasoning':
                    output_text.extend(
                        f"Reasoning: {summary['text']}"
                        for summary in item['summary']
                    )
                elif item_type == 'message':
                    content = item['content']
                    if isinstance(content, list):
                        output_text.extend(
                            part['text']
                            for part in content
                            if part['type'] == 'output_text'
                        )
                    elif isinstance(content, str):
                        output_text.append(content)

                # If we get a tool call, we're not done
                if item_type == 'computer_call':
                    call_id = item["call_id"]
                    tool_calls.append(MCPToolCall(
                        name="openai_computer",
                        arguments=item["action"],
                        id=call_id,
                    ))
                    is_done = False
                    # Remember everything emitted in this step so that
                    # format_tool_results can replay it ahead of the result.
                    self.tool_call_inputs[call_id] = agent_result
                    break

            # if we have tool calls, we should exit the loop
            if tool_calls:
                break

        self.previous_output = agent_result

        return AgentResponse(
            content="\n".join(output_text),
            tool_calls=tool_calls,
            done=is_done,
        )

    def _log_image(self, image_b64: str):
        """Save a base64 screenshot via each TrajectorySaverCallback.

        The decoded bytes are stored as a ``"screenshot_after"`` artifact on
        every ``TrajectorySaverCallback`` found on the wrapped agent.
        """
        for callback in self.computer_agent.callbacks:
            if isinstance(callback, TrajectorySaverCallback):
                # convert str to bytes
                image_bytes = base64.b64decode(image_b64)
                callback._save_artifact("screenshot_after", image_bytes)

    @staticmethod
    def _result_block_to_input(block: Any) -> dict[str, Any]:
        """Convert one tool-result content block into an input content dict."""
        if isinstance(block, types.TextContent):
            return {"type": "input_text", "text": block.text}
        if isinstance(block, types.ImageContent):
            # NOTE(review): MIME type is fixed to PNG here, whereas
            # format_blocks honours block.mimeType - confirm this is intended.
            return {"type": "input_image", "image_url": f"data:image/png;base64,{block.data}"}
        # Unsupported block types are kept as empty text placeholders.
        return {"type": "input_text", "text": ""}

    @staticmethod
    def _replace_with_failed_call(
        messages: list[dict[str, Any]],
        call: MCPToolCall,
        error_message: str,
    ) -> None:
        """Swap the trailing pending call in *messages* for failed-call items."""
        messages.pop()
        messages.extend(make_failed_tool_call_items(
            tool_name=call.name,
            tool_kwargs=call.arguments or {},
            error_message=error_message,
            call_id=call.id,
        ))

    async def format_tool_results(
        self,
        tool_calls: list[MCPToolCall],
        tool_results: list[MCPToolResult]
    ) -> list[dict[str, Any]]:
        """Turn executed tool calls and their results into follow-up messages.

        For a call recorded by ``get_response`` the stored step items are
        replayed, followed by a ``computer_call_output`` carrying the
        screenshot. If the result is an error or contains no image, the
        pending call is replaced by failed-tool-call items instead.

        For a call that was not recorded, the previous step output (minus
        trailing ``computer_call`` items) is replayed and, unless the call is
        named ``"response"``, the result content is added as a user message.

        Returns:
            A list of message dicts suitable for the next ``get_response`` call.
        """
        messages: list[dict[str, Any]] = []

        for call, result in zip(tool_calls, tool_results):
            if call.id not in self.tool_call_inputs:
                # If we don't have the tool call inputs, we should just use the previous output
                previous_output = list(self.previous_output)

                # First we need to remove any pending computer_calls from the end of previous_output
                while previous_output and previous_output[-1]['type'] == 'computer_call':
                    previous_output.pop()
                messages.extend(previous_output)

                # If the call is a 'response', don't add the result
                if call.name == 'response':
                    continue
                # Otherwise, if we have a result, we should add it to the messages
                messages.append({
                    "role": "user",
                    "content": [
                        self._result_block_to_input(block)
                        for block in result.content
                    ],
                })
                continue

            # Add the assistant's computer call
            messages.extend(self.tool_call_inputs[call.id])

            if result.isError:
                error_text = "".join(
                    block.text
                    for block in result.content
                    if isinstance(block, types.TextContent)
                )
                # Replace computer call with failed tool call
                self._replace_with_failed_call(messages, call, error_text)
                continue

            screenshots = [
                block.data
                for block in result.content
                if isinstance(block, types.ImageContent)
            ]
            if not screenshots:
                # Without a screenshot the call cannot be answered; report it as failed
                self._replace_with_failed_call(messages, call, "No screenshots returned.")
                continue

            # NOTE(review): the first image in the result is used; confirm that
            # is the most recent one when a result carries several.
            screenshot = screenshots[0]
            self._log_image(screenshot)
            self.last_screenshot_b64 = screenshot
            messages.append({
                "type": "computer_call_output",
                "call_id": call.id,
                "output": {
                    "type": "input_image",
                    "image_url": f"data:image/png;base64,{screenshot}"
                },
            })

        return messages
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
__all__ = [
|
|
350
|
+
"MCPComputerAgent",
|
|
351
|
+
]
|
agent/integrations/hud/proxy.py
CHANGED
|
@@ -13,6 +13,10 @@ import uuid
|
|
|
13
13
|
from typing import Any, Dict, List, Optional
|
|
14
14
|
|
|
15
15
|
from agent.agent import ComputerAgent as BaseComputerAgent
|
|
16
|
+
from agent.callbacks import PromptInstructionsCallback
|
|
17
|
+
from hud.tools.computer.settings import computer_settings
|
|
18
|
+
from PIL import Image
|
|
19
|
+
from hud.agents import OperatorAgent
|
|
16
20
|
|
|
17
21
|
# OpenAI Responses typed models (required)
|
|
18
22
|
from openai.types.responses import (
|
|
@@ -178,6 +182,83 @@ class FakeAsyncOpenAI:
|
|
|
178
182
|
print(traceback.format_exc())
|
|
179
183
|
raise e
|
|
180
184
|
|
|
185
|
+
|
|
186
|
+
# ---------------------------------------------------------------------------
|
|
187
|
+
# Proxy OperatorAgent (moved from __init__.py)
|
|
188
|
+
# ---------------------------------------------------------------------------
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class ProxyOperatorAgent(OperatorAgent):
    """OperatorAgent whose model client is backed by our ComputerAgent.

    Takes the config keys that hud.run_dataset supplies through `agent_config`:
    - model: str | None
    - allowed_tools: list[str] | None
    Any remaining kwargs are handed to OperatorAgent unchanged.
    """

    def __init__(
        self,
        *,
        model: str | None = None,
        allowed_tools: list[str] | None = None,
        trajectory_dir: str | dict | None = None,
        # === ComputerAgent kwargs ===
        tools: list[Any] | None = None,
        custom_loop: Any | None = None,
        only_n_most_recent_images: int | None = None,
        callbacks: list[Any] | None = None,
        instructions: str | None = None,
        verbosity: int | None = None,
        max_retries: int | None = 3,
        screenshot_delay: float | int = 0.5,
        use_prompt_caching: bool | None = False,
        max_trajectory_budget: float | dict | None = None,
        telemetry_enabled: bool | None = True,
        **kwargs: Any,
    ) -> None:
        """Create the ComputerAgent, wrap it in FakeAsyncOpenAI, then init the base."""
        if not model:
            model = "computer-use-preview"
        if not allowed_tools:
            allowed_tools = ["openai_computer"]

        def _blank_screenshot():
            # Blank image at the configured OpenAI computer resolution.
            size = (
                computer_settings.OPENAI_COMPUTER_WIDTH,
                computer_settings.OPENAI_COMPUTER_HEIGHT,
            )
            return Image.new('RGB', size)

        # The shim always comes first; caller-supplied tools follow it.
        shim = {
            'screenshot': _blank_screenshot,
            'environment': 'linux',
            'dimensions': (
                computer_settings.OPENAI_COMPUTER_WIDTH,
                computer_settings.OPENAI_COMPUTER_HEIGHT,
            ),
        }
        combined_tools: list[Any] = [shim, *tools] if tools else [shim]

        # Copy the callbacks and add a prompt-instructions callback when requested.
        combined_callbacks = [*callbacks] if callbacks else []
        if instructions:
            combined_callbacks.append(PromptInstructionsCallback(instructions))

        inner_agent = BaseComputerAgent(
            model=model,
            tools=combined_tools,
            custom_loop=custom_loop,
            only_n_most_recent_images=only_n_most_recent_images,
            callbacks=combined_callbacks,
            verbosity=verbosity,
            trajectory_dir=trajectory_dir,
            max_retries=max_retries,
            screenshot_delay=screenshot_delay,
            use_prompt_caching=use_prompt_caching,
            max_trajectory_budget=max_trajectory_budget,
            telemetry_enabled=telemetry_enabled,
        )

        super().__init__(
            model_client=FakeAsyncOpenAI(inner_agent),  # type: ignore[arg-type]
            model=model,
            allowed_tools=allowed_tools,
            **kwargs,
        )
|
|
260
|
+
|
|
181
261
|
__all__ = [
|
|
182
262
|
"FakeAsyncOpenAI",
|
|
263
|
+
"ProxyOperatorAgent",
|
|
183
264
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: cua-agent
|
|
3
|
-
Version: 0.4.28
|
|
3
|
+
Version: 0.4.30
|
|
4
4
|
Summary: CUA (Computer Use) Agent for AI-driven computer interaction
|
|
5
5
|
Author-Email: TryCua <gh@trycua.com>
|
|
6
6
|
Requires-Python: >=3.12
|
|
@@ -37,7 +37,7 @@ Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
|
|
|
37
37
|
Provides-Extra: cli
|
|
38
38
|
Requires-Dist: yaspin>=3.1.0; extra == "cli"
|
|
39
39
|
Provides-Extra: hud
|
|
40
|
-
Requires-Dist: hud-python
|
|
40
|
+
Requires-Dist: hud-python==0.4.19; extra == "hud"
|
|
41
41
|
Provides-Extra: all
|
|
42
42
|
Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
|
|
43
43
|
Requires-Dist: accelerate; extra == "all"
|
|
@@ -46,7 +46,7 @@ Requires-Dist: transformers>=4.54.0; extra == "all"
|
|
|
46
46
|
Requires-Dist: gradio>=5.23.3; extra == "all"
|
|
47
47
|
Requires-Dist: python-dotenv>=1.0.1; extra == "all"
|
|
48
48
|
Requires-Dist: yaspin>=3.1.0; extra == "all"
|
|
49
|
-
Requires-Dist: hud-python
|
|
49
|
+
Requires-Dist: hud-python==0.4.19; extra == "all"
|
|
50
50
|
Description-Content-Type: text/markdown
|
|
51
51
|
|
|
52
52
|
<div align="center">
|
|
@@ -25,8 +25,9 @@ agent/human_tool/__init__.py,sha256=3m5_g-Fo_0yX5vi7eg-A92oTqO0N3aY929Ajp78HKsE,
|
|
|
25
25
|
agent/human_tool/__main__.py,sha256=VsW2BAghlonOuqZbP_xuCsaec9bemA1I_ibnDcED9D4,1068
|
|
26
26
|
agent/human_tool/server.py,sha256=ceuL5kw_RjgAi8fueLU3nTjyzOLE25Shv1oTJnSHsoQ,7964
|
|
27
27
|
agent/human_tool/ui.py,sha256=wu9eZorhxCkyPTlBSZjYaVzutoHMlucAz8UGNpAT4bM,30644
|
|
28
|
-
agent/integrations/hud/__init__.py,sha256=
|
|
29
|
-
agent/integrations/hud/
|
|
28
|
+
agent/integrations/hud/__init__.py,sha256=OI6GtXm2XgyF5SWGA5mSA6IWhrO7faxFgXJSehoCjW0,5433
|
|
29
|
+
agent/integrations/hud/agent.py,sha256=GBikd9MhjDNKMiMG8J7PE3OMSmvmC_JLZ1p5xr2cZoc,14006
|
|
30
|
+
agent/integrations/hud/proxy.py,sha256=8HUoh7uZ8Z3vkhPXK0dskgePGsP8oCqyYij0mE_E7X8,10902
|
|
30
31
|
agent/loops/__init__.py,sha256=Ef8aj07l3osibwDk-DTo80PrpL4_GdKRTP1ikl_b-BQ,328
|
|
31
32
|
agent/loops/anthropic.py,sha256=q7lr1PjI6VPtlozoweluY2c3hCGqa_2s-whzxa37iKE,70250
|
|
32
33
|
agent/loops/base.py,sha256=LK7kSTnc2CB88LI7qr2VP7LMq0eS5r2bSEnrxO6IN5U,2345
|
|
@@ -46,7 +47,7 @@ agent/ui/__main__.py,sha256=vudWXYvGM0aNT5aZ94HPtGW8YXOZ4cLXepHyhUM_k1g,73
|
|
|
46
47
|
agent/ui/gradio/__init__.py,sha256=yv4Mrfo-Sj2U5sVn_UJHAuwYCezo-5O4ItR2C9jzNko,145
|
|
47
48
|
agent/ui/gradio/app.py,sha256=Ol97YEbwREZZQ9_PMjVHlfOcu9BGsawxgAGAm79hT80,9117
|
|
48
49
|
agent/ui/gradio/ui_components.py,sha256=dJUvKDmc1oSejtoR_gU_oWWYwxaOOQyPloSYRGMrUCQ,36068
|
|
49
|
-
cua_agent-0.4.
|
|
50
|
-
cua_agent-0.4.
|
|
51
|
-
cua_agent-0.4.
|
|
52
|
-
cua_agent-0.4.
|
|
50
|
+
cua_agent-0.4.30.dist-info/METADATA,sha256=QiP-FZkf38c0q6Wptu-Z2aBXtpomHqA7JvA8NQRYvS0,5610
|
|
51
|
+
cua_agent-0.4.30.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
|
|
52
|
+
cua_agent-0.4.30.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
|
|
53
|
+
cua_agent-0.4.30.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|