cua-agent 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cua-agent might be problematic.
- agent/adapters/__init__.py +2 -0
- agent/adapters/mlxvlm_adapter.py +359 -0
- agent/agent.py +14 -3
- agent/callbacks/__init__.py +2 -0
- agent/callbacks/operator_validator.py +138 -0
- agent/callbacks/trajectory_saver.py +87 -5
- agent/integrations/hud/__init__.py +223 -72
- agent/integrations/hud/proxy.py +183 -0
- agent/loops/anthropic.py +12 -1
- agent/loops/composed_grounded.py +26 -14
- agent/loops/openai.py +15 -7
- agent/loops/uitars.py +17 -8
- agent/proxy/examples.py +192 -0
- agent/proxy/handlers.py +248 -0
- {cua_agent-0.4.17.dist-info → cua_agent-0.4.19.dist-info}/METADATA +3 -3
- {cua_agent-0.4.17.dist-info → cua_agent-0.4.19.dist-info}/RECORD +18 -16
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/agent.py +0 -373
- agent/integrations/hud/computer_handler.py +0 -187
- {cua_agent-0.4.17.dist-info → cua_agent-0.4.19.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.17.dist-info → cua_agent-0.4.19.dist-info}/entry_points.txt +0 -0

agent/callbacks/trajectory_saver.py
CHANGED

@@ -11,6 +11,8 @@ from pathlib import Path
 from typing import List, Dict, Any, Optional, Union, override
 from PIL import Image, ImageDraw
 import io
+from copy import deepcopy
+
 from .base import AsyncCallbackHandler

 def sanitize_image_urls(data: Any) -> Any:
@@ -43,6 +45,64 @@ def sanitize_image_urls(data: Any) -> Any:
     return data


+def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: Optional[Path]) -> List[Dict[str, Any]]:
+    """
+    Save any base64-encoded screenshots from computer_call_output entries to files and
+    replace their image_url with the saved file path when a call_id is present.
+
+    Only operates if screenshot_dir is provided and exists; otherwise returns items unchanged.
+
+    Args:
+        items: List of message/result dicts potentially containing computer_call_output entries
+        screenshot_dir: Directory to write screenshots into
+
+    Returns:
+        A new list with updated image_url fields when applicable.
+    """
+    if not items:
+        return items
+    if not screenshot_dir or not screenshot_dir.exists():
+        return items
+
+    updated: List[Dict[str, Any]] = []
+    for item in items:
+        # work on a shallow copy; deep copy nested 'output' if we modify it
+        msg = dict(item)
+        try:
+            if msg.get("type") == "computer_call_output":
+                call_id = msg.get("call_id")
+                output = msg.get("output", {})
+                image_url = output.get("image_url")
+                if call_id and isinstance(image_url, str) and image_url.startswith("data:"):
+                    # derive extension from MIME type e.g. data:image/png;base64,
+                    try:
+                        ext = image_url.split(";", 1)[0].split("/")[-1]
+                        if not ext:
+                            ext = "png"
+                    except Exception:
+                        ext = "png"
+                    out_path = screenshot_dir / f"{call_id}.{ext}"
+                    # write file if it doesn't exist
+                    if not out_path.exists():
+                        try:
+                            b64_payload = image_url.split(",", 1)[1]
+                            img_bytes = base64.b64decode(b64_payload)
+                            out_path.parent.mkdir(parents=True, exist_ok=True)
+                            with open(out_path, "wb") as f:
+                                f.write(img_bytes)
+                        except Exception:
+                            # if anything fails, skip modifying this message
+                            pass
+                    # update image_url to file path
+                    new_output = dict(output)
+                    new_output["image_url"] = str(out_path)
+                    msg["output"] = new_output
+        except Exception:
+            # do not block on malformed entries; keep original
+            pass
+        updated.append(msg)
+    return updated
+
 class TrajectorySaverCallback(AsyncCallbackHandler):
     """
     Callback handler that saves agent trajectories to disk.
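
To make the new helper concrete, here is a minimal usage sketch. The call_id, payload, and directory are invented for illustration; with a complete base64 payload the decoded bytes are written to disk and the data: URI is replaced by the file path.

# Hypothetical usage of extract_computer_call_outputs (names invented).
from pathlib import Path
from agent.callbacks.trajectory_saver import extract_computer_call_outputs

items = [{
    "type": "computer_call_output",
    "call_id": "call_demo",  # hypothetical id
    "output": {"type": "input_image", "image_url": "data:image/png;base64,iVBORw0KGgo..."},
}]

shots = Path("screenshots")
shots.mkdir(parents=True, exist_ok=True)  # the helper is a no-op if the dir is missing
saved = extract_computer_call_outputs(items, shots)
print(saved[0]["output"]["image_url"])  # -> "screenshots/call_demo.png"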
@@ -51,7 +111,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
     within the trajectory gets its own folder with screenshots and responses.
     """

-    def __init__(self, trajectory_dir: str, reset_on_run: bool = True):
+    def __init__(self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None):
         """
         Initialize trajectory saver.

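
A sketch of opting into the new keyword (paths illustrative; omitting screenshot_dir preserves the 0.4.17 behavior):

from agent.callbacks.trajectory_saver import TrajectorySaverCallback

callback = TrajectorySaverCallback(
    trajectory_dir="trajectories",
    reset_on_run=True,
    screenshot_dir="trajectories/screenshots",  # new in 0.4.19
)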
@@ -67,10 +127,12 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
         self.model: Optional[str] = None
         self.total_usage: Dict[str, Any] = {}
         self.reset_on_run = reset_on_run
+        # Optional directory to store extracted screenshots from metadata/new_items
+        self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None

         # Ensure trajectory directory exists
         self.trajectory_dir.mkdir(parents=True, exist_ok=True)
-
+
     def _get_turn_dir(self) -> Path:
         """Get the directory for the current turn."""
         if not self.trajectory_id:
@@ -94,6 +156,10 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
         # format: turn_000/0000_name.json
         artifact_filename = f"{self.current_artifact:04d}_{name}"
         artifact_path = turn_dir / f"{artifact_filename}.json"
+        # add created_at
+        if isinstance(artifact, dict):
+            artifact = artifact.copy()
+            artifact["created_at"] = str(uuid.uuid1().time)
         with open(artifact_path, "w") as f:
             json.dump(sanitize_image_urls(artifact), f, indent=2)
         self.current_artifact += 1
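
Note that the stamp is uuid.uuid1().time, the UUIDv1 timestamp (a count of 100-nanosecond ticks since 1582-10-15), not a Unix epoch. A sketch of decoding it when reading artifacts back:

import uuid
from datetime import datetime, timedelta

GREGORIAN_EPOCH = datetime(1582, 10, 15)

def uuid1_time_to_datetime(stamp: str) -> datetime:
    # one tick is 100 ns, i.e. 0.1 microseconds
    return GREGORIAN_EPOCH + timedelta(microseconds=int(stamp) / 10)

print(uuid1_time_to_datetime(str(uuid.uuid1().time)))  # roughly the current UTC time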
@@ -135,12 +201,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
         trajectory_path = self.trajectory_dir / self.trajectory_id
         trajectory_path.mkdir(parents=True, exist_ok=True)

-        # Save trajectory metadata
+        # Save trajectory metadata (optionally extract screenshots to screenshot_dir)
+        kwargs_to_save = kwargs.copy()
+        try:
+            if "messages" in kwargs_to_save:
+                kwargs_to_save["messages"] = extract_computer_call_outputs(
+                    kwargs_to_save["messages"], self.screenshot_dir
+                )
+        except Exception:
+            # If extraction fails, fall back to original messages
+            pass
         metadata = {
             "trajectory_id": self.trajectory_id,
             "created_at": str(uuid.uuid1().time),
             "status": "running",
-            "kwargs":
+            "kwargs": kwargs_to_save,
         }

         with open(trajectory_path / "metadata.json", "w") as f:
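
With a screenshot_dir configured, metadata.json ends up shaped roughly like this (values illustrative):

{
  "trajectory_id": "…",
  "created_at": "139…",
  "status": "running",
  "kwargs": {
    "messages": [
      {
        "type": "computer_call_output",
        "call_id": "call_demo",
        "output": {"image_url": "trajectories/screenshots/call_demo.png"}
      }
    ]
  }
}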
@@ -167,11 +242,18 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
             metadata = {}

         # Update metadata with completion info
+        # Optionally extract screenshots from new_items before persisting
+        new_items_to_save = new_items
+        try:
+            new_items_to_save = extract_computer_call_outputs(new_items, self.screenshot_dir)
+        except Exception:
+            pass
+
         metadata.update({
             "status": "completed",
             "completed_at": str(uuid.uuid1().time),
             "total_usage": self.total_usage,
-            "new_items":
+            "new_items": new_items_to_save,
             "total_turns": self.current_turn
         })


agent/integrations/hud/__init__.py
CHANGED

@@ -1,77 +1,228 @@
-"""HUD integration
-…
-        adapter_cls: Custom adapter class (defaults to ComputerAgentAdapter)
-        adapter_kwargs: Additional kwargs for the adapter
-        max_steps_per_task: Maximum steps per task
-        run_parallel: Whether to run tasks in parallel
-        job_metadata: Additional metadata for the job
-        show_progress: Whether to show progress
-        max_concurrent_env_creations: Max concurrent environment creations
-        max_concurrent_agent_predictions: Max concurrent agent predictions
-        max_concurrent_tasks: Max concurrent tasks
-        **agent_kwargs: Additional kwargs to pass to ComputerAgent
-
-    Returns:
-        Job instance from HUD
+"""HUD integration: Generic HuggingFace dataset evaluation runner (CUA proxy).
+
+This module exposes two helpers to evaluate HUD-compatible datasets using
+HUD's OperatorAgent, while proxying model calls through our ComputerAgent via
+`FakeAsyncOpenAI` (see `agent/integrations/hud/agent.py`).
+
+Exports:
+- run_single_task(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None)
+- run_full_dataset(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None, max_concurrent=30, max_steps=50)
+"""
+import time
+from typing import Any, Optional
+
+from PIL import Image
+from datasets import load_dataset, Dataset
+from hud.agents import OperatorAgent
+from hud.datasets import Task, run_dataset
+from hud.tools.computer.settings import computer_settings
+from hud import trace
+
+from agent.agent import ComputerAgent as BaseComputerAgent
+from .proxy import FakeAsyncOpenAI
+
+
+# ---------------------------------------------------------------------------
+# Proxy OperatorAgent
+# ---------------------------------------------------------------------------
+
+
+class ProxyOperatorAgent(OperatorAgent):
+    """OperatorAgent that proxies model calls through our ComputerAgent.
+
+    Accepts the same config keys we pass via hud.run_dataset `agent_config`:
+    - model: str | None
+    - allowed_tools: list[str] | None
+    Additional kwargs are forwarded to OperatorAgent (if any are supported).
     """
-…
+
+    def __init__(
+        self,
+        *,
+        model: str | None = None,
+        allowed_tools: list[str] | None = None,
+        trajectory_dir: str | dict | None = None,
+        # === ComputerAgent kwargs ===
+        tools: list[Any] | None = None,
+        custom_loop: Any | None = None,
+        only_n_most_recent_images: int | None = None,
+        callbacks: list[Any] | None = None,
+        verbosity: int | None = None,
+        max_retries: int | None = 3,
+        screenshot_delay: float | int = 0.5,
+        use_prompt_caching: bool | None = False,
+        max_trajectory_budget: float | dict | None = None,
+        telemetry_enabled: bool | None = True,
+        **kwargs: Any,
+    ) -> None:
+        model = model or "computer-use-preview"
+        allowed_tools = allowed_tools or ["openai_computer"]
+
+        computer_shim = {
+            'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
+            'environment': 'linux',
+            'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
+        }
+        # Build tools ensuring the computer_shim is included
+        agent_tools: list[Any] = [computer_shim]
+        if tools:
+            agent_tools.extend(tools)
+
+        computer_agent = BaseComputerAgent(
+            model=model,
+            tools=agent_tools,
+            custom_loop=custom_loop,
+            only_n_most_recent_images=only_n_most_recent_images,
+            callbacks=callbacks,
+            verbosity=verbosity,
+            trajectory_dir=trajectory_dir,
+            max_retries=max_retries,
+            screenshot_delay=screenshot_delay,
+            use_prompt_caching=use_prompt_caching,
+            max_trajectory_budget=max_trajectory_budget,
+            telemetry_enabled=telemetry_enabled,
+        )
+        model_client = FakeAsyncOpenAI(computer_agent)
+
+        super().__init__(
+            model_client=model_client,  # type: ignore[arg-type]
+            model=model,
+            allowed_tools=allowed_tools,
+            **kwargs,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Single-task runner
+# ---------------------------------------------------------------------------
+
+
+async def run_single_task(
+    dataset: str | Dataset | list[dict[str, Any]],
+    *,
+    task_id: int = 0,
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    # === ComputerAgent kwargs ===
+    tools: list[Any] | None = None,
+    custom_loop: Any | None = None,
+    only_n_most_recent_images: int | None = None,
+    callbacks: list[Any] | None = None,
+    verbosity: int | None = None,
+    trajectory_dir: str | dict | None = None,
+    max_retries: int | None = 3,
+    screenshot_delay: float | int = 0.5,
+    use_prompt_caching: bool | None = False,
+    max_trajectory_budget: float | dict | None = None,
+    telemetry_enabled: bool | None = True,
+) -> None:
+    """Load one task from the dataset and execute it with Operator+CUA proxy."""
+
+    # Load dataset and pick a sample
+    if isinstance(dataset, str):
+        dataset = load_dataset(dataset, split="train")  # type: ignore[arg-type]
+    elif isinstance(dataset, list):
+        dataset = dataset
+    else:
+        dataset = dataset["train"]

-#
-…
+    sample_task = dataset[task_id]  # type: ignore[index]
+    task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}")  # type: ignore[attr-defined]
+
+    with trace(name=task_prompt):
+        task = Task(**sample_task)  # type: ignore[arg-type]
+
+        agent = ProxyOperatorAgent(
+            model=model,
+            allowed_tools=allowed_tools,
+            # === ComputerAgent kwargs passthrough ===
+            tools=tools,
+            custom_loop=custom_loop,
+            only_n_most_recent_images=only_n_most_recent_images,
+            callbacks=callbacks,
+            verbosity=verbosity,
+            trajectory_dir=trajectory_dir,
+            max_retries=max_retries,
+            screenshot_delay=screenshot_delay,
+            use_prompt_caching=use_prompt_caching,
+            max_trajectory_budget=max_trajectory_budget,
+            telemetry_enabled=telemetry_enabled,
+        )
+        print(f"Running: {task_prompt}")
+        result = await agent.run(task, max_steps=10)
+        print(f"✅ Reward: {getattr(result, 'reward')}")
+
+
+# ---------------------------------------------------------------------------
+# Full-dataset runner
+# ---------------------------------------------------------------------------
+
+
+async def run_full_dataset(
+    dataset: str | Dataset | list[dict[str, Any]],
+    *,
+    job_name: Optional[str] = None,
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    max_concurrent: int = 30,
+    max_steps: int = 50,
+    split: str = "train",
+    trajectory_dir: str | dict | None = None,
+    # === ComputerAgent kwargs ===
+    tools: list[Any] | None = None,
+    custom_loop: Any | None = None,
+    only_n_most_recent_images: int | None = 5,
+    callbacks: list[Any] | None = None,
+    verbosity: int | None = None,
+    max_retries: int | None = 3,
+    screenshot_delay: float | int = 0.5,
+    use_prompt_caching: bool | None = False,
+    max_trajectory_budget: float | dict | None = None,
+    telemetry_enabled: bool | None = True,
+) -> list[Any]:
+    """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
+
+    # We pass OperatorAgent as the class and provide a config that injects our
+    # FakeAsyncOpenAI per agent instantiation.
+
+    if isinstance(dataset, str):
+        dataset_name = dataset.split('/')[-1]
+        job_name = job_name or f"Evaluation {dataset_name}"
+        dataset = load_dataset(dataset, split=split)  # type: ignore[arg-type]
+    else:
+        dataset_name = "custom"
+        job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"
+
+    # Execute evaluation
+    return await run_dataset(
+        name=job_name,
+        dataset=dataset,
+        agent_class=ProxyOperatorAgent,
+        agent_config={
+            "model": model,
+            "allowed_tools": allowed_tools,
+            "trajectory_dir": trajectory_dir,
+            # === ComputerAgent kwargs passthrough ===
+            "tools": tools,
+            "custom_loop": custom_loop,
+            "only_n_most_recent_images": only_n_most_recent_images,
+            "callbacks": callbacks,
+            "verbosity": verbosity,
+            "max_retries": max_retries,
+            "screenshot_delay": screenshot_delay,
+            "use_prompt_caching": use_prompt_caching,
+            "max_trajectory_budget": max_trajectory_budget,
+            "telemetry_enabled": telemetry_enabled,
+        },
+        max_concurrent=max_concurrent,
+        metadata={"dataset": dataset_name},
+        max_steps=max_steps,
+        auto_respond=True,
     )


-__all__ = [
+__all__ = [
+    "run_single_task",
+    "run_full_dataset",
+    "ProxyOperatorAgent",
+]
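
A usage sketch for the two runners; the dataset name is a placeholder for any HUD-compatible dataset whose rows unpack into hud.datasets.Task:

import asyncio
from agent.integrations.hud import run_single_task, run_full_dataset

async def main():
    # Smoke-test a single task first...
    await run_single_task("your-org/your-hud-dataset", task_id=0, model="computer-use-preview")
    # ...then sweep the split with bounded concurrency.
    results = await run_full_dataset(
        "your-org/your-hud-dataset",
        max_concurrent=10,
        max_steps=30,
    )
    print(f"{len(results)} tasks evaluated")

asyncio.run(main())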
agent/integrations/hud/proxy.py
ADDED

@@ -0,0 +1,183 @@
+"""HUD ComputerAgent wrapper and Fake AsyncOpenAI client.
+
+Provides FakeAsyncOpenAI that adapts our ComputerAgent to the OpenAI Responses
+interface needed by HUD's OperatorAgent. It implements only `responses.create`
+and returns an OpenAI Response object with `id` and `output` fields, where `output` is a list of
+OpenAI-like response blocks. We intentionally only support a single-step call
+by consuming the first yielded result from `ComputerAgent.run()`.
+"""
+
+import traceback
+import time
+import uuid
+from typing import Any, Dict, List, Optional
+
+from agent.agent import ComputerAgent as BaseComputerAgent
+
+# OpenAI Responses typed models (required)
+from openai.types.responses import (
+    Response,
+    ResponseInputParam,
+    ResponseOutputItem,
+    ResponseComputerToolCall,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+    ResponseUsage,
+)
+
+def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
+    """Map our agent output items to OpenAI ResponseOutputItem typed models.
+
+    Only a subset is supported: computer_call, assistant message (text), and reasoning.
+    Unknown types are ignored.
+    """
+    blocks: List[ResponseOutputItem] = []
+    for item in output_items or []:
+        t = item.get("type")
+        if t == "computer_call":
+            comp = ResponseComputerToolCall.model_validate({
+                "id": item.get("id") or f"cu_{uuid.uuid4().hex}",
+                "type": "computer_call",
+                "call_id": item["call_id"],
+                "action": item["action"],
+                "pending_safety_checks": item.get("pending_safety_checks", []),
+                "status": "completed",
+            })
+            blocks.append(comp)
+            # we will exit early here as the responses api only supports a single step
+            break
+        elif t == "message" and item.get("role") == "assistant":
+            content_blocks: List[ResponseOutputText] = []
+            for c in item.get("content", []) or []:
+                content_blocks.append(
+                    ResponseOutputText.model_validate({
+                        "type": "output_text",
+                        "text": c["text"],
+                        "annotations": [],
+                    })
+                )
+            if content_blocks:
+                msg = ResponseOutputMessage.model_validate({
+                    "id": item.get("id") or f"msg_{uuid.uuid4()}",
+                    "type": "message",
+                    "role": "assistant",
+                    "status": "completed",
+                    "content": [ct.model_dump() for ct in content_blocks],
+                })
+                blocks.append(msg)
+        elif t == "reasoning":
+            reasoning = ResponseReasoningItem.model_validate({
+                "id": item.get("id") or f"rsn_{uuid.uuid4()}",
+                "type": "reasoning",
+                "summary": item["summary"],
+            })
+            blocks.append(reasoning)
+        # Unhandled types are ignored
+    return blocks
+
+def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
+    out: List[Dict[str, Any]] = []
+    for it in list(items):
+        if hasattr(it, "model_dump"):
+            out.append(it.model_dump())  # type: ignore[attr-defined]
+        elif isinstance(it, dict):
+            out.append(it)
+        else:
+            # Strict: rely on default __dict__ if present
+            out.append(dict(it))  # may raise if not mapping
+    return out
+
+class FakeAsyncOpenAI:
+    """Minimal fake OpenAI client with only `responses.create` implemented.
+
+    It uses a provided `ComputerAgent` instance to produce a single-step
+    response compatible with HUD's OperatorAgent loop.
+    """
+
+    def __init__(self, computer_agent: BaseComputerAgent) -> None:
+        self._agent = computer_agent
+        self.responses = self._Responses(self)
+
+    class _Responses:
+        def __init__(self, parent: "FakeAsyncOpenAI") -> None:
+            # Caches for cross-call context when using previous_response_id
+            self.blocks_cache: Dict[str, ResponseInputParam | ResponseOutputItem] = {}
+            self.context_cache: Dict[str, List[str]] = {}
+            self.agent = parent._agent
+
+        async def create(
+            self,
+            *,
+            model: str,
+            input: ResponseInputParam,
+            tools: Optional[List[Dict[str, Any]]] = None,
+            instructions: Optional[str] = None,
+            previous_response_id: Optional[str] = None,
+            max_retries: int = 5,
+            **_: Any,
+        ) -> Any:
+            for attempt in range(max_retries):
+                # Prepend cached blocks from previous_response_id to input
+                full_input = input
+                if previous_response_id is not None:
+                    prev_block_ids = self.context_cache[previous_response_id]
+                    prev_blocks = [self.blocks_cache[b_id] for b_id in prev_block_ids]
+                    full_input = _to_plain_dict_list(prev_blocks + input)
+
+                # Pre-pend instructions message
+                effective_input = full_input
+                if instructions:
+                    effective_input = [{
+                        "role": "user",
+                        "content": instructions,
+                    }] + full_input
+
+                # Run a single iteration of the ComputerAgent
+                agent_result: Optional[Dict[str, Any]] = None
+                async for result in self.agent.run(effective_input):  # type: ignore[arg-type]
+                    agent_result = result
+                    break
+                assert agent_result is not None, "Agent failed to produce result"
+
+                output = _map_agent_output_to_openai_blocks(agent_result["output"])
+                usage = agent_result["usage"]
+
+                # Cache conversation context using the last response id
+                block_ids: List[str] = []
+                blocks_to_cache = full_input + output
+                for b in blocks_to_cache:
+                    bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
+                    self.blocks_cache[bid] = b  # type: ignore[assignment]
+                    block_ids.append(bid)
+                response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
+                self.context_cache[response_id] = block_ids
+
+                try:
+                    return Response.model_validate({
+                        "id": response_id,
+                        "created_at": time.time(),
+                        "object": "response",
+                        "model": model,
+                        "output": output,
+                        "parallel_tool_calls": False,
+                        "tool_choice": "auto",
+                        "tools": [],
+                        "previous_response_id": previous_response_id,
+                        "usage": ResponseUsage.model_validate({
+                            "input_tokens": usage.get("input_tokens", 0),
+                            "output_tokens": usage.get("output_tokens", 0),
+                            "total_tokens": usage.get("total_tokens", 0),
+                            "input_tokens_details": usage.get("input_tokens_details", { "cached_tokens": 0 }),
+                            "output_tokens_details": usage.get("output_tokens_details", { "reasoning_tokens": 0 }),
+                        }),
+                    })
+                except Exception as e:
+                    print(f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ", e)
+                    if attempt == max_retries - 1:
+                        print(traceback.format_exc())
+                        raise e
+
+__all__ = [
+    "FakeAsyncOpenAI",
+]
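
To illustrate the single-step contract, a sketch of calling the fake client directly (the tool list is elided; in practice ProxyOperatorAgent supplies the computer_shim shown in __init__.py above):

import asyncio
from agent.agent import ComputerAgent
from agent.integrations.hud.proxy import FakeAsyncOpenAI

async def demo():
    agent = ComputerAgent(model="computer-use-preview", tools=[])  # tools elided
    client = FakeAsyncOpenAI(agent)
    resp = await client.responses.create(
        model="computer-use-preview",
        input=[{"role": "user", "content": "Open the settings menu"}],
    )
    # One agent step's worth of blocks: at most one computer_call,
    # plus any assistant text or reasoning emitted before it
    print(resp.id, [block.type for block in resp.output])

asyncio.run(demo())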
agent/loops/anthropic.py
CHANGED

@@ -1530,7 +1530,18 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
                 "content": [
                     {
                         "type": "text",
-                        "text": f"You are a UI grounding expert.
+                        "text": f"""You are a UI grounding expert. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+6. The user has already given you permission by running this agent. No further confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+Task: Click {instruction}. Output ONLY a click action on the target element."""
                     },
                     {
                         "type": "image_url",