cua-agent 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,77 @@
+ """HUD integration for ComputerAgent."""
+
+ import logging
+ from typing import Any, Optional, Dict
+ from hud import run_job as hud_run_job
+
+ from .agent import ComputerAgent
+ from .adapter import ComputerAgentAdapter
+ from .computer_handler import HUDComputerHandler
+
+
+ async def run_job(
+     model: str,
+     task_or_taskset: Any,
+     job_name: str,
+     # Job kwargs
+     auto_reply_question: bool = False,
+     adapter_cls: Any = None,
+     adapter_kwargs: Optional[Dict[str, Any]] = None,
+     max_steps_per_task: int = 20,
+     run_parallel: bool = True,
+     job_metadata: Optional[Dict[str, Any]] = None,
+     show_progress: bool = True,
+     max_concurrent_env_creations: Optional[int] = 30,  # Limits gym.make calls
+     max_concurrent_agent_predictions: Optional[int] = None,  # No limit on LLM calls
+     max_concurrent_tasks: Optional[int] = 30,  # Limits overall task concurrency
+     **agent_kwargs: Any,
+ ) -> Any:
+     """
+     Run a job using ComputerAgent with the specified model.
+
+     Args:
+         model: Model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
+         task_or_taskset: Task or TaskSet to run
+         job_name: Name for the job
+         auto_reply_question: Whether to auto-reply to questions
+         adapter_cls: Custom adapter class (defaults to ComputerAgentAdapter)
+         adapter_kwargs: Additional kwargs for the adapter
+         max_steps_per_task: Maximum steps per task
+         run_parallel: Whether to run tasks in parallel
+         job_metadata: Additional metadata for the job
+         show_progress: Whether to show progress
+         max_concurrent_env_creations: Max concurrent environment creations
+         max_concurrent_agent_predictions: Max concurrent agent predictions
+         max_concurrent_tasks: Max concurrent tasks
+         **agent_kwargs: Additional kwargs to pass to ComputerAgent
+
+     Returns:
+         Job instance from HUD
+     """
+     # Fold the legacy boolean `verbose` kwarg into the numeric `verbosity` level
+     if agent_kwargs.pop("verbose", False):
+         agent_kwargs["verbosity"] = logging.INFO
+     # HUD expects a boolean: verbose when logging at INFO or finer
+     verbose = agent_kwargs.get("verbosity", logging.WARNING) <= logging.INFO
+
+     # run job
+     return await hud_run_job(
+         agent_cls=ComputerAgent,
+         agent_kwargs={"model": model, **agent_kwargs},
+         task_or_taskset=task_or_taskset,
+         job_name=job_name,
+         auto_reply_question=auto_reply_question,
+         adapter_cls=adapter_cls,
+         adapter_kwargs=adapter_kwargs,
+         max_steps_per_task=max_steps_per_task,
+         run_parallel=run_parallel,
+         job_metadata=job_metadata,
+         show_progress=show_progress,
+         verbose=verbose,
+         max_concurrent_env_creations=max_concurrent_env_creations,
+         max_concurrent_agent_predictions=max_concurrent_agent_predictions,
+         max_concurrent_tasks=max_concurrent_tasks,
+     )
+
+
+ __all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler", "run_job"]
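
For context, a minimal sketch of how this entry point might be driven. The import path, the `load_taskset` helper, and the taskset name are illustrative assumptions, not part of the diff:

```python
import asyncio
import logging

from hud import load_taskset  # assumed loader; substitute however you obtain a Task/TaskSet
from agent.integrations.hud import run_job  # assumed import path for the module above


async def main() -> None:
    taskset = await load_taskset("OSWorld-Verified")  # illustrative taskset name
    job = await run_job(
        model="anthropic/claude-3-5-sonnet-20241022",
        task_or_taskset=taskset,
        job_name="osworld-smoke-test",
        max_steps_per_task=20,
        verbosity=logging.INFO,  # a verbosity at INFO or finer also flips hud_run_job's verbose flag
    )
    print(job)


asyncio.run(main())
```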
@@ -0,0 +1,121 @@
+ """HUD Adapter for ComputerAgent integration."""
+
+ from __future__ import annotations
+
+ from typing import Any, ClassVar
+
+ from hud.adapters.common import CLA, Adapter
+ from hud.adapters.common.types import (
+     CLAButton,
+     CLAKey,
+     ClickAction,
+     CustomAction,
+     DragAction,
+     MoveAction,
+     Point,
+     PressAction,
+     ResponseAction,
+     ScreenshotFetch,
+     ScrollAction,
+     TypeAction,
+     WaitAction,
+ )
+
+
+ class ComputerAgentAdapter(Adapter):
+     """Adapter for ComputerAgent to work with HUD."""
+
+     KEY_MAP: ClassVar[dict[str, CLAKey]] = {
+         "return": "enter",
+         "arrowup": "up",
+         "arrowdown": "down",
+         "arrowleft": "left",
+         "arrowright": "right",
+         "cmd": "ctrl",
+         "super": "win",
+         "meta": "win",
+     }
+
+     BUTTON_MAP: ClassVar[dict[str, CLAButton]] = {
+         "wheel": "middle",
+         "middle": "middle",
+     }
+
+     def __init__(self) -> None:
+         super().__init__()
+         # ComputerAgent default dimensions (can be overridden)
+         self.agent_width = 1024
+         self.agent_height = 768
+
+     def _map_key(self, key: str) -> CLAKey:
+         """Map a key to its standardized form."""
+         return self.KEY_MAP.get(key.lower(), key.lower())  # type: ignore
+
+     def convert(self, data: Any) -> CLA:
+         """Convert a ComputerAgent action to a HUD action."""
+         try:
+             action_type = data.get("type")
+
+             if action_type == "click":
+                 x, y = data.get("x", 0), data.get("y", 0)
+                 button = data.get("button", "left")
+                 button = self.BUTTON_MAP.get(button, button)
+                 if button is None:
+                     button = "left"
+                 converted_action = ClickAction(point=Point(x=x, y=y), button=button)
+
+             elif action_type == "double_click":
+                 x, y = data.get("x", 0), data.get("y", 0)
+                 converted_action = ClickAction(point=Point(x=x, y=y), button="left", pattern=[100])
+
+             elif action_type == "scroll":
+                 x, y = int(data.get("x", 0)), int(data.get("y", 0))
+                 scroll_x = int(data.get("scroll_x", 0))
+                 scroll_y = int(data.get("scroll_y", 0))
+                 converted_action = ScrollAction(
+                     point=Point(x=x, y=y), scroll=Point(x=scroll_x, y=scroll_y)
+                 )
+
+             elif action_type == "type":
+                 text = data.get("text", "")
+                 converted_action = TypeAction(text=text, enter_after=False)
+
+             elif action_type == "wait":
+                 ms = data.get("ms", 1000)
+                 converted_action = WaitAction(time=ms)
+
+             elif action_type == "move":
+                 x, y = data.get("x", 0), data.get("y", 0)
+                 converted_action = MoveAction(point=Point(x=x, y=y))
+
+             elif action_type == "keypress":
+                 keys = data.get("keys", [])
+                 if isinstance(keys, str):
+                     keys = [keys]
+                 converted_action = PressAction(keys=[self._map_key(k) for k in keys])
+
+             elif action_type == "drag":
+                 path = data.get("path", [])
+                 points = [Point(x=p.get("x", 0), y=p.get("y", 0)) for p in path]
+                 converted_action = DragAction(path=points)
+
+             elif action_type == "screenshot":
+                 converted_action = ScreenshotFetch()
+
+             elif action_type == "response":
+                 converted_action = ResponseAction(text=data.get("text", ""))
+
+             elif action_type == "custom":
+                 converted_action = CustomAction(action=data.get("action", ""))
+
+             else:
+                 raise ValueError(f"Unsupported action type: {action_type}")
+
+             # Add reasoning and logs if available
+             converted_action.reasoning = data.get("reasoning", "")
+             converted_action.logs = data.get("logs", "")
+
+             return converted_action
+
+         except Exception as e:
+             raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e
@@ -0,0 +1,373 @@
+ """HUD ComputerAgent wrapper for OSWorld benchmarking."""
+
+ import logging
+ from typing import Any, Literal, Optional, Union, List, Dict
+ import asyncio
+
+ from agent import ComputerAgent as BaseComputerAgent
+ from agent.responses import make_failed_tool_call_items
+ from hud.adapters import Adapter
+ from hud.agent.base import Agent
+ from hud.utils.common import Observation
+ from hud.adapters.common.types import LogType
+ from hud.types import Gym
+
+ from .adapter import ComputerAgentAdapter
+ from .computer_handler import HUDComputerHandler
+
+ logger = logging.getLogger(__name__)
+
+ BASE_SYSTEM_PROMPT = """
+ You are an autonomous computer-using agent. Follow these guidelines:
+
+ 1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
+ 2. Use the computer tools to complete the task and do not stop until the task is complete.
+ 3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
+ 4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
+ 5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
+ 6. Trust that the user wants you to complete the entire task they've requested.
+ 7. You must say "Task completed" when the task is complete.
+
+ Remember: You have been given permission to complete the requested task autonomously.
+ """.strip()
+
+
+ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
+     """
+     A ComputerAgent wrapper for HUD integration.
+
+     This agent wraps the base ComputerAgent to work with HUD environments,
+     providing the same interface as OperatorAgent but using ComputerAgent internally.
+     """
+
+     transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}
+
+     def __init__(
+         self,
+         model: str = "anthropic/claude-3-5-sonnet-20241022",
+         environment: Literal["windows", "mac", "linux", "browser"] = "linux",
+         adapter: Optional[Adapter] = None,
+         name: Optional[str] = None,
+         **kwargs: Any,
+     ):
+         """
+         Initialize the ComputerAgent for HUD.
+
+         Args:
+             model: The model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
+             environment: The environment type (windows, mac, linux, browser)
+             adapter: The adapter to use for preprocessing and postprocessing
+             name: The name of the agent
+             **kwargs: Additional arguments passed to ComputerAgent
+         """
+         # Create adapter if not provided
+         adapter = adapter or ComputerAgentAdapter()
+
+         if name is None:
+             name = f"computeragent-{model.split('/')[-1]}"
+
+         # Initialize the base Agent class without client (we'll create it later)
+         super().__init__(client=None, adapter=adapter, name=name)
+
+         self.model = model
+         self.environment = environment
+         self.kwargs = kwargs
+
+         # Default dimensions
+         self.width = 1024
+         self.height = 768
+
+         # Update dimensions if adapter is provided
+         if self.adapter:
+             self.width = self.adapter.agent_width
+             self.height = self.adapter.agent_height
+
+         # Create HUD computer handler
+         self.hud_computer = HUDComputerHandler(
+             environment=environment,
+             dimensions=(self.width, self.height)
+         )
+
+         # Handle trajectory_dir by adding TrajectorySaverCallback
+         trajectory_dir = kwargs.pop("trajectory_dir", None)
+         callbacks = kwargs.get("callbacks", [])
+
+         if trajectory_dir:
+             from agent.callbacks.trajectory_saver import TrajectorySaverCallback
+             trajectory_callback = TrajectorySaverCallback(trajectory_dir, reset_on_run=False)
+             callbacks = callbacks + [trajectory_callback]
+             kwargs["callbacks"] = callbacks
+
+         # Initialize ComputerAgent with HUD computer handler
+         self.computer_agent = BaseComputerAgent(
+             model=model,
+             tools=[self.hud_computer],
+             **kwargs
+         )
+
+         # Set the client to the computer_agent for compatibility
+         self.client = self.computer_agent
+
+         # State tracking
+         self.conversation_history: List[Dict[str, Any]] = []
+         self.initial_prompt: Optional[str] = None
+
+         # System prompt for computer use tasks
+         self.base_system_prompt = BASE_SYSTEM_PROMPT
+
+     async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
+         """
+         Fetch a response from ComputerAgent based on the observation.
+
+         Args:
+             observation: The preprocessed observation, with attributes:
+                 screenshot: Base64-encoded PNG string of the screen
+                 text: Text observation, if available
+
+         Returns:
+             tuple[list[dict[str, Any]], bool]: A tuple containing the list of raw
+             actions and a boolean indicating whether the agent believes the task
+             is complete.
+         """
+         try:
+             # Update the computer handler with the current screenshot
+             if observation.screenshot:
+                 self.hud_computer.update_screenshot(observation.screenshot)
+
+             # Set up a callback to capture actions emitted by ComputerAgent
+             captured_actions: List[Dict[str, Any]] = []
+
+             async def action_callback(action: Dict[str, Any]) -> None:
+                 """Capture each action ComputerAgent issues to the computer tool."""
+                 captured_actions.append(action)
+
+             # Register the action callback
+             self.hud_computer.set_action_callback(action_callback)
+
+             # Prepare the message for ComputerAgent
+             if not self.conversation_history:
+                 # First interaction - use the observation text as initial prompt
+                 if observation.text:
+                     self.initial_prompt = observation.text
+                     message = f"{self.base_system_prompt}\n\nTask: {observation.text}"
+                 else:
+                     message = f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take."
+
+                 input_content = [
+                     {"type": "input_text", "text": message}
+                 ]
+
+                 # Add screenshot if present
+                 if observation.screenshot:
+                     input_content.append(
+                         {
+                             "type": "input_image",
+                             "image_url": f"data:image/png;base64,{observation.screenshot}",
+                         }
+                     )
+
+                 self.conversation_history.append({"role": "user", "content": input_content})
+             else:
+                 # Subsequent interactions - check if last action was computer_call
+                 # If so, add computer_call_output with screenshot instead of user message
+                 last_computer_calls = []
+                 for msg in reversed(self.conversation_history):
+                     if msg.get("type") == "computer_call":
+                         call_id = msg.get("call_id")
+                         if call_id:
+                             # Check if this call_id already has a computer_call_output
+                             has_output = any(
+                                 m.get("type") == "computer_call_output" and m.get("call_id") == call_id
+                                 for m in self.conversation_history
+                             )
+                             if not has_output:
+                                 last_computer_calls.append(call_id)
+
+                 if last_computer_calls:
+                     # Reuse the observation screenshot, or capture one if missing
+                     if observation.screenshot:
+                         screenshot_b64 = observation.screenshot
+                     else:
+                         logger.warning("No screenshot in observation, taking screenshot")
+                         screenshot_b64 = await self.hud_computer.screenshot()
+                     # Add computer_call_output for each unresponded computer_call
+                     for call_id in reversed(last_computer_calls):  # Maintain order
+                         self.conversation_history.append({
+                             "type": "computer_call_output",
+                             "call_id": call_id,
+                             "output": {
+                                 "type": "input_image",
+                                 "image_url": f"data:image/png;base64,{screenshot_b64}"
+                             }
+                         })
+                 else:
+                     # No computer_call found, add regular user message
+                     message = "Continue with the task based on the current screen state."
+                     input_content = [
+                         {"type": "input_text", "text": message}
+                     ]
+
+                     # Add screenshot if present
+                     if observation.screenshot:
+                         input_content.append(
+                             {
+                                 "type": "input_image",
+                                 "image_url": f"data:image/png;base64,{observation.screenshot}",
+                             }
+                         )
+
+                     self.conversation_history.append({"role": "user", "content": input_content})
+
+             # If the last message is a reasoning message, change it to output_text
+             if (self.conversation_history and
+                     self.conversation_history[-1].get("type") == "reasoning" and
+                     self.conversation_history[-1].get("summary")):
+
+                 reasoning_msg = self.conversation_history[-1]
+                 summary_texts = []
+
+                 # Extract all summary_text entries
+                 for summary_item in reasoning_msg["summary"]:
+                     if summary_item.get("type") == "summary_text":
+                         summary_texts.append(summary_item.get("text", ""))
+
+                 # Convert to message format with output_text
+                 if summary_texts:
+                     converted_message = {
+                         "type": "message",
+                         "role": "assistant",
+                         "content": [
+                             {
+                                 "text": " ".join(summary_texts),
+                                 "type": "output_text"
+                             }
+                         ]
+                     }
+
+                     # Replace the reasoning message with the converted message
+                     self.conversation_history[-1] = converted_message
+
+             # Run ComputerAgent
+             try:
+                 new_items = []
+
+                 # ComputerAgent.run returns an async generator
+                 try:
+                     async for result in self.computer_agent.run(self.conversation_history, stream=False):
+                         output = result.get("output", [])
+                         # if the result has computer_call_output, immediately exit
+                         if output and output[-1].get("type") == "computer_call_output":
+                             break
+                         # otherwise add agent output to conversation history
+                         new_items += output
+                 except Exception as e:
+                     # if the last message is reasoning, change it to output_text
+                     if new_items and new_items[-1].get("type") == "reasoning":
+                         summary = new_items[-1].get("summary") or [{}]
+                         new_items[-1] = {
+                             "type": "message",
+                             "role": "assistant",
+                             "content": [
+                                 {
+                                     "text": summary[0].get("text", ""),
+                                     "type": "output_text"
+                                 }
+                             ]
+                         }
+                     # Check if there are any computer_call items in new_items
+                     computer_calls = [item for item in new_items if item.get("type") == "computer_call"]
+                     if computer_calls:
+                         # Remove computer_call items from new_items
+                         new_items = [item for item in new_items if item.get("type") != "computer_call"]
+
+                         # Add failed tool call items for each computer call
+                         for computer_call in computer_calls:
+                             tool_input = computer_call.get("action", {})
+                             call_id = computer_call.get("call_id")
+                             new_items.extend(make_failed_tool_call_items(
+                                 tool_name="computer",
+                                 tool_kwargs=tool_input,
+                                 error_message=repr(e),
+                                 call_id=call_id
+                             ))
+                     else:
+                         # add error message to conversation history (fallback for non-computer-call errors)
+                         new_items.append({
+                             "role": "user",
+                             "content": [
+                                 {
+                                     "type": "input_text",
+                                     "text": f"Error during previous attempted action: {repr(e)}"
+                                 }
+                             ]
+                         })
+
+                 # Check if we captured any actions
+                 if captured_actions:
+                     # Extract reasoning from the conversation history
+                     reasoning = ""
+                     # Look for the latest reasoning message
+                     for msg in reversed(new_items):
+                         if msg.get("type") == "reasoning" and msg.get("summary"):
+                             reasoning = " ".join([s.get("text", "") for s in msg["summary"] if s.get("type") == "summary_text"])
+                             break
+                         elif msg.get("type") == "message" and msg.get("role") == "assistant":
+                             content = msg.get("content", [])
+                             if isinstance(content, list):
+                                 reasoning = " ".join([c.get("text", "") for c in content if c.get("type") == "output_text"])
+                                 break
+
+                     # update conversation history
+                     self.conversation_history += new_items
+
+                     # Add reasoning and logs to each action
+                     for action in captured_actions:
+                         action["reasoning"] = reasoning
+                         action["logs"] = {"conversation_length": len(self.conversation_history)}
+
+                     return captured_actions, False
+
+                 # Check if the last message is "Task completed"
+                 response_text = ""
+                 for msg in reversed(new_items):
+                     if msg.get("type") == "message" and msg.get("role") == "assistant":
+                         content = msg.get("content", [])
+                         for c in content:
+                             if c.get("type") == "output_text":
+                                 response_text = c.get("text", response_text)
+                                 break
+                         break
+
+                 done = "task completed" in response_text.lower()
+
+                 # update conversation history
+                 self.conversation_history += new_items
+
+                 response_action = {
+                     "type": "response",
+                     "text": response_text,
+                     "reasoning": response_text,
+                     "logs": {"conversation_length": len(self.conversation_history)}
+                 }
+
+                 # Check if this indicates task completion or failure
+                 if "task is infeasible" in response_text.lower():
+                     response_action = {"type": "custom", "action": "FAIL"}
+                     done = True
+
+                 return [response_action], done
+             except Exception as e:
+                 logger.error(f"Error running ComputerAgent: {e}")
+                 # Return an error response
+                 error_action = {
+                     "type": "response",
+                     "text": f"Error occurred: {str(e)}",
+                     "reasoning": f"ComputerAgent encountered an error: {str(e)}",
+                     "logs": {"error": str(e)}
+                 }
+                 return [error_action], True
+
+         except Exception as e:
+             logger.error(f"Error in fetch_response: {e}")
+             error_action = {
+                 "type": "response",
+                 "text": f"Error in agent processing: {str(e)}",
+                 "reasoning": f"Agent processing error: {str(e)}",
+                 "logs": {"error": str(e)}
+             }
+             return [error_action], True
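
A minimal sketch of driving one step of this wrapper directly. It assumes the import path shown, that `Observation` accepts these keyword fields, and that model credentials are configured; in a real run HUD's environment constructs the observation and drives the loop itself:

```python
import asyncio

from hud.utils.common import Observation  # same import the wrapper uses
from agent.integrations.hud.agent import ComputerAgent  # assumed import path


async def main() -> None:
    agent = ComputerAgent(
        model="anthropic/claude-3-5-sonnet-20241022",
        environment="linux",
        trajectory_dir="./trajectories",  # installs TrajectorySaverCallback as above
    )
    # Assumes Observation can be built from keyword fields (see fetch_response docstring)
    obs = Observation(text="Open the settings app", screenshot=None)
    raw_actions, done = await agent.fetch_response(obs)
    # Each raw action would then pass through ComputerAgentAdapter.convert
    print(raw_actions, done)


asyncio.run(main())
```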