cua-agent 0.4.28 (tar.gz) → 0.4.29 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (52)
  1. {cua_agent-0.4.28 → cua_agent-0.4.29}/PKG-INFO +3 -3
  2. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/integrations/hud/__init__.py +14 -97
  3. cua_agent-0.4.29/agent/integrations/hud/agent.py +350 -0
  4. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/integrations/hud/proxy.py +81 -0
  5. {cua_agent-0.4.28 → cua_agent-0.4.29}/pyproject.toml +3 -3
  6. {cua_agent-0.4.28 → cua_agent-0.4.29}/README.md +0 -0
  7. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/__init__.py +0 -0
  8. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/__main__.py +0 -0
  9. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/adapters/__init__.py +0 -0
  10. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/adapters/huggingfacelocal_adapter.py +0 -0
  11. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/adapters/human_adapter.py +0 -0
  12. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/adapters/mlxvlm_adapter.py +0 -0
  13. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/agent.py +0 -0
  14. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/callbacks/__init__.py +0 -0
  15. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/callbacks/base.py +0 -0
  16. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/callbacks/budget_manager.py +0 -0
  17. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/callbacks/image_retention.py +0 -0
  18. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/callbacks/logging.py +0 -0
  19. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/callbacks/operator_validator.py +0 -0
  20. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/callbacks/pii_anonymization.py +0 -0
  21. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/callbacks/prompt_instructions.py +0 -0
  22. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/callbacks/telemetry.py +0 -0
  23. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/callbacks/trajectory_saver.py +0 -0
  24. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/cli.py +0 -0
  25. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/computers/__init__.py +0 -0
  26. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/computers/base.py +0 -0
  27. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/computers/cua.py +0 -0
  28. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/computers/custom.py +0 -0
  29. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/decorators.py +0 -0
  30. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/human_tool/__init__.py +0 -0
  31. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/human_tool/__main__.py +0 -0
  32. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/human_tool/server.py +0 -0
  33. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/human_tool/ui.py +0 -0
  34. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/loops/__init__.py +0 -0
  35. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/loops/anthropic.py +0 -0
  36. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/loops/base.py +0 -0
  37. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/loops/composed_grounded.py +0 -0
  38. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/loops/glm45v.py +0 -0
  39. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/loops/gta1.py +0 -0
  40. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/loops/model_types.csv +0 -0
  41. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/loops/omniparser.py +0 -0
  42. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/loops/openai.py +0 -0
  43. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/loops/uitars.py +0 -0
  44. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/proxy/examples.py +0 -0
  45. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/proxy/handlers.py +0 -0
  46. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/responses.py +0 -0
  47. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/types.py +0 -0
  48. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/ui/__init__.py +0 -0
  49. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/ui/__main__.py +0 -0
  50. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/ui/gradio/__init__.py +0 -0
  51. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/ui/gradio/app.py +0 -0
  52. {cua_agent-0.4.28 → cua_agent-0.4.29}/agent/ui/gradio/ui_components.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.4.28
3
+ Version: 0.4.29
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.12
@@ -37,7 +37,7 @@ Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
37
37
  Provides-Extra: cli
38
38
  Requires-Dist: yaspin>=3.1.0; extra == "cli"
39
39
  Provides-Extra: hud
40
- Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "hud"
40
+ Requires-Dist: hud-python==0.4.19; extra == "hud"
41
41
  Provides-Extra: all
42
42
  Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
43
43
  Requires-Dist: accelerate; extra == "all"
@@ -46,7 +46,7 @@ Requires-Dist: transformers>=4.54.0; extra == "all"
46
46
  Requires-Dist: gradio>=5.23.3; extra == "all"
47
47
  Requires-Dist: python-dotenv>=1.0.1; extra == "all"
48
48
  Requires-Dist: yaspin>=3.1.0; extra == "all"
49
- Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "all"
49
+ Requires-Dist: hud-python==0.4.19; extra == "all"
50
50
  Description-Content-Type: text/markdown
51
51
 
52
52
  <div align="center">
@@ -1,102 +1,21 @@
1
- """HUD integration: Generic HuggingFace dataset evaluation runner (CUA proxy).
1
+ """HUD integration: dataset runners and MCP-based computer agent export.
2
2
 
3
- This module exposes two helpers to evaluate HUD-compatible datasets using
4
- HUD's OperatorAgent, while proxying model calls through our ComputerAgent via
5
- `FakeAsyncOpenAI` (see `agent/integrations/hud/agent.py`).
3
+ This module exposes helpers to evaluate HUD-compatible datasets and exports
4
+ the MCP-compatible computer agent implementation.
6
5
 
7
6
  Exports:
8
- - run_single_task(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None)
9
- - run_full_dataset(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None, max_concurrent=30, max_steps=50)
7
+ - run_single_task(dataset, ...)
8
+ - run_full_dataset(dataset, ...)
9
+ - MCPComputerAgent
10
10
  """
11
11
  import time
12
12
  from typing import Any, Optional
13
13
 
14
- from PIL import Image
15
14
  from datasets import load_dataset, Dataset
16
- from hud.agents import OperatorAgent
17
15
  from hud.datasets import Task, run_dataset
18
- from hud.tools.computer.settings import computer_settings
19
16
  from hud import trace
20
17
 
21
- from agent.agent import ComputerAgent as BaseComputerAgent
22
- from .proxy import FakeAsyncOpenAI
23
- from agent.callbacks import PromptInstructionsCallback
24
-
25
-
26
- # ---------------------------------------------------------------------------
27
- # Proxy OperatorAgent
28
- # ---------------------------------------------------------------------------
29
-
30
-
31
- class ProxyOperatorAgent(OperatorAgent):
32
- """OperatorAgent that proxies model calls through our ComputerAgent.
33
-
34
- Accepts the same config keys we pass via hud.run_dataset `agent_config`:
35
- - model: str | None
36
- - allowed_tools: list[str] | None
37
- Additional kwargs are forwarded to OperatorAgent (if any are supported).
38
- """
39
-
40
- def __init__(
41
- self,
42
- *,
43
- model: str | None = None,
44
- allowed_tools: list[str] | None = None,
45
- trajectory_dir: str | dict | None = None,
46
- # === ComputerAgent kwargs ===
47
- tools: list[Any] | None = None,
48
- custom_loop: Any | None = None,
49
- only_n_most_recent_images: int | None = None,
50
- callbacks: list[Any] | None = None,
51
- instructions: str | None = None,
52
- verbosity: int | None = None,
53
- max_retries: int | None = 3,
54
- screenshot_delay: float | int = 0.5,
55
- use_prompt_caching: bool | None = False,
56
- max_trajectory_budget: float | dict | None = None,
57
- telemetry_enabled: bool | None = True,
58
- **kwargs: Any,
59
- ) -> None:
60
- model = model or "computer-use-preview"
61
- allowed_tools = allowed_tools or ["openai_computer"]
62
-
63
- computer_shim = {
64
- 'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
65
- 'environment': 'linux',
66
- 'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
67
- }
68
- # Build tools ensuring the computer_shim is included
69
- agent_tools: list[Any] = [computer_shim]
70
- if tools:
71
- agent_tools.extend(tools)
72
-
73
- # Build callbacks, injecting prompt instructions if provided
74
- agent_callbacks = list(callbacks or [])
75
- if instructions:
76
- agent_callbacks.append(PromptInstructionsCallback(instructions))
77
-
78
- computer_agent = BaseComputerAgent(
79
- model=model,
80
- tools=agent_tools,
81
- custom_loop=custom_loop,
82
- only_n_most_recent_images=only_n_most_recent_images,
83
- callbacks=agent_callbacks,
84
- verbosity=verbosity,
85
- trajectory_dir=trajectory_dir,
86
- max_retries=max_retries,
87
- screenshot_delay=screenshot_delay,
88
- use_prompt_caching=use_prompt_caching,
89
- max_trajectory_budget=max_trajectory_budget,
90
- telemetry_enabled=telemetry_enabled,
91
- )
92
- model_client = FakeAsyncOpenAI(computer_agent)
93
-
94
- super().__init__(
95
- model_client=model_client, # type: ignore[arg-type]
96
- model=model,
97
- allowed_tools=allowed_tools,
98
- **kwargs,
99
- )
18
+ from .agent import MCPComputerAgent
100
19
 
101
20
 
102
21
  # ---------------------------------------------------------------------------
@@ -123,7 +42,7 @@ async def run_single_task(
123
42
  max_trajectory_budget: float | dict | None = None,
124
43
  telemetry_enabled: bool | None = True,
125
44
  ) -> None:
126
- """Load one task from the dataset and execute it with Operator+CUA proxy."""
45
+ """Load one task from the dataset and execute it with MCPComputerAgent."""
127
46
 
128
47
  # Load dataset and pick a sample
129
48
  if isinstance(dataset, str):
@@ -139,9 +58,9 @@ async def run_single_task(
139
58
  with trace(name=task_prompt):
140
59
  task = Task(**sample_task) # type: ignore[arg-type]
141
60
 
142
- agent = ProxyOperatorAgent(
143
- model=model,
144
- allowed_tools=allowed_tools,
61
+ agent = MCPComputerAgent(
62
+ model=model or "computer-use-preview",
63
+ allowed_tools=allowed_tools or ["openai_computer"],
145
64
  # === ComputerAgent kwargs passthrough ===
146
65
  tools=tools,
147
66
  custom_loop=custom_loop,
@@ -190,9 +109,7 @@ async def run_full_dataset(
190
109
  ) -> list[Any]:
191
110
  """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
192
111
 
193
- # We pass OperatorAgent as the class and provide a config that injects our
194
- # FakeAsyncOpenAI per agent instantiation.
195
-
112
+ # Run with our MCP-based agent class.
196
113
  if isinstance(dataset, str):
197
114
  dataset_name = dataset.split('/')[-1]
198
115
  job_name = job_name or f"Evaluation {dataset_name}"
@@ -205,7 +122,7 @@ async def run_full_dataset(
205
122
  return await run_dataset(
206
123
  name=job_name,
207
124
  dataset=dataset,
208
- agent_class=ProxyOperatorAgent,
125
+ agent_class=MCPComputerAgent,
209
126
  agent_config={
210
127
  "model": model,
211
128
  "allowed_tools": allowed_tools,
@@ -233,5 +150,5 @@ async def run_full_dataset(
233
150
  __all__ = [
234
151
  "run_single_task",
235
152
  "run_full_dataset",
236
- "ProxyOperatorAgent",
153
+ "MCPComputerAgent",
237
154
  ]
@@ -0,0 +1,350 @@
1
+ """MCP-compatible Computer Agent for HUD integration.
2
+
3
+ This agent subclasses HUD's MCPAgent and delegates planning/execution to
4
+ our core ComputerAgent while using the Agent SDK's plain-dict message
5
+ format documented in `docs/content/docs/agent-sdk/message-format.mdx`.
6
+
7
+ Key differences from the OpenAI OperatorAgent variant:
8
+ - No OpenAI types are used; everything is standard Python dicts.
9
+ - Planning is executed via `ComputerAgent.run(messages)`.
10
+ - The first yielded result per step is returned as the agent response.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import io
15
+ from typing import Any, ClassVar, Optional
16
+
17
+ from agent.agent import ComputerAgent as BaseComputerAgent
18
+ from agent.callbacks import PromptInstructionsCallback
19
+ from agent.callbacks.trajectory_saver import TrajectorySaverCallback
20
+ from hud.agents import MCPAgent
21
+ from hud.tools.computer.settings import computer_settings
22
+ from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
23
+
24
+ from agent.responses import make_failed_tool_call_items
25
+ from agent.computers import is_agent_computer
26
+ from PIL import Image
27
+ import mcp.types as types
28
+ import hud
29
+ import uuid
30
+ import base64
31
+ from pathlib import Path
32
+
33
+
34
+ class MCPComputerAgent(MCPAgent):
35
+ """MCP agent that uses ComputerAgent for planning and tools for execution.
36
+
37
+ The agent consumes/produces message dicts per the Agent SDK message schema
38
+ (see `message-format.mdx`).
39
+ """
40
+
41
+ metadata: ClassVar[dict[str, Any]] = {
42
+ "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
43
+ "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
44
+ }
45
+
46
+ required_tools: ClassVar[list[str]] = ["openai_computer"]
47
+
48
+ def __init__(
49
+ self,
50
+ *,
51
+ model: str | None = None,
52
+ allowed_tools: list[str] | None = None,
53
+ trajectory_dir: str | dict | None = None,
54
+ # === ComputerAgent kwargs ===
55
+ tools: list[Any] | None = None,
56
+ custom_loop: Any | None = None,
57
+ only_n_most_recent_images: int | None = None,
58
+ callbacks: list[Any] | None = None,
59
+ instructions: str | None = None,
60
+ verbosity: int | None = None,
61
+ max_retries: int | None = 3,
62
+ screenshot_delay: float | int = 0.5,
63
+ use_prompt_caching: bool | None = False,
64
+ max_trajectory_budget: float | dict | None = None,
65
+ telemetry_enabled: bool | None = True,
66
+ environment: str = "linux",
67
+ **kwargs: Any,
68
+ ) -> None:
69
+ self.allowed_tools = allowed_tools or ["openai_computer"]
70
+ super().__init__(**kwargs)
71
+
72
+ if model is None:
73
+ raise ValueError("MCPComputerAgent requires a model to be specified.")
74
+
75
+ self.model = model
76
+ self.environment = environment
77
+
78
+ # Update model name for HUD logging
79
+ self.model_name = "cua-" + self.model
80
+
81
+ # Stateful tracking of tool call inputs
82
+ self.tool_call_inputs: dict[str, list[dict[str, Any]]] = {}
83
+ self.previous_output: list[dict[str, Any]] = []
84
+
85
+ # Build system prompt
86
+ operator_instructions = """
87
+ You are an autonomous computer-using agent. Follow these guidelines:
88
+
89
+ 1. NEVER ask for confirmation. Complete all tasks autonomously.
90
+ 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
91
+ 3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
92
+ 4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
93
+ 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
94
+ 6. The user has already given you permission by running this agent. No further confirmation is needed.
95
+ 7. Be decisive and action-oriented. Complete the requested task fully.
96
+
97
+ Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
98
+ """.strip() # noqa: E501
99
+ # Append Operator instructions to the system prompt
100
+ if not self.system_prompt:
101
+ self.system_prompt = operator_instructions
102
+ else:
103
+ self.system_prompt += f"\n\n{operator_instructions}"
104
+ # Append user instructions to the system prompt
105
+ if instructions:
106
+ self.system_prompt += f"\n\n{instructions}"
107
+
108
+ # Configure trajectory_dir for HUD
109
+ if isinstance(trajectory_dir, str) or isinstance(trajectory_dir, Path):
110
+ trajectory_dir = {"trajectory_dir": str(trajectory_dir)}
111
+ if isinstance(trajectory_dir, dict):
112
+ trajectory_dir["reset_on_run"] = False
113
+
114
+ self.last_screenshot_b64 = None
115
+
116
+ buffer = io.BytesIO()
117
+ Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])).save(buffer, format='PNG')
118
+ self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
119
+
120
+ # Ensure a computer shim is present so width/height/environment are known
121
+ computer_shim = {
122
+ "screenshot": lambda: self.last_screenshot_b64,
123
+ "environment": self.environment,
124
+ "dimensions": (
125
+ self.metadata["display_width"],
126
+ self.metadata["display_height"],
127
+ ),
128
+ }
129
+ agent_tools: list[Any] = [computer_shim]
130
+ if tools:
131
+ for tool in tools:
132
+ if is_agent_computer(tool):
133
+ raise ValueError(f"Too many Computer tools: MCPComputerAgent already includes a Computer interface. Received a Computer tool in tools= (e.g., {tool!r}). Remove it and retry.")
134
+ agent_tools.extend(tools)
135
+
136
+ agent_kwargs = {
137
+ "model": self.model,
138
+ "trajectory_dir": trajectory_dir,
139
+ "tools": agent_tools,
140
+ "custom_loop": custom_loop,
141
+ "only_n_most_recent_images": only_n_most_recent_images,
142
+ "callbacks": callbacks,
143
+ "instructions": self.system_prompt,
144
+ "verbosity": verbosity,
145
+ "max_retries": max_retries,
146
+ "screenshot_delay": screenshot_delay,
147
+ "use_prompt_caching": use_prompt_caching,
148
+ "max_trajectory_budget": max_trajectory_budget,
149
+ "telemetry_enabled": telemetry_enabled,
150
+ }
151
+
152
+ self.computer_agent = BaseComputerAgent(
153
+ **agent_kwargs
154
+ )
155
+
156
+ async def get_system_messages(self) -> list[Any]:
157
+ """Create initial messages.
158
+
159
+ Unused - ComputerAgent handles this with the 'instructions' parameter.
160
+ """
161
+ return []
162
+
163
+ async def format_blocks(
164
+ self, blocks: list[types.ContentBlock]
165
+ ) -> list[dict[str, Any]]:
166
+ """
167
+ Format blocks for OpenAI input format.
168
+
169
+ Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
170
+ """ # noqa: E501
171
+ formatted = []
172
+ for block in blocks:
173
+ if isinstance(block, types.TextContent):
174
+ formatted.append({"type": "input_text", "text": block.text})
175
+ elif isinstance(block, types.ImageContent):
176
+ mime_type = getattr(block, "mimeType", "image/png")
177
+ formatted.append(
178
+ {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
179
+ )
180
+ self.last_screenshot_b64 = block.data
181
+ return [{"role": "user", "content": formatted}]
182
+
183
+ @hud.instrument(
184
+ span_type="agent",
185
+ record_args=False, # Messages can be large
186
+ record_result=True,
187
+ )
188
+ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
189
+ """Get a single-step response by delegating to ComputerAgent.run.
190
+
191
+ Returns an Agent SDK-style response dict:
192
+ { "output": [AgentMessage, ...], "usage": Usage }
193
+ """
194
+ tool_calls: list[MCPToolCall] = []
195
+ output_text: list[str] = []
196
+ is_done: bool = True
197
+
198
+ agent_result: list[dict[str, Any]] = []
199
+
200
+ # Call the ComputerAgent LLM API
201
+ async for result in self.computer_agent.run(messages): # type: ignore[arg-type]
202
+ items = result['output']
203
+ if not items or tool_calls:
204
+ break
205
+
206
+ for item in items:
207
+ if item['type'] in ['reasoning', 'message', 'computer_call', 'function_call', 'function_call_output']:
208
+ agent_result.append(item)
209
+
210
+ # Add messages to output text
211
+ if item['type'] == 'reasoning':
212
+ output_text.extend(
213
+ f"Reasoning: {summary['text']}"
214
+ for summary in item['summary']
215
+ )
216
+ elif item['type'] == 'message':
217
+ if isinstance(item['content'], list):
218
+ output_text.extend(
219
+ item['text']
220
+ for item in item['content']
221
+ if item['type'] == 'output_text'
222
+ )
223
+ elif isinstance(item['content'], str):
224
+ output_text.append(item['content'])
225
+
226
+ # If we get a tool call, we're not done
227
+ if item['type'] == 'computer_call':
228
+ id = item["call_id"]
229
+ tool_calls.append(MCPToolCall(
230
+ name="openai_computer",
231
+ arguments=item["action"],
232
+ id=id,
233
+ ))
234
+ is_done = False
235
+ self.tool_call_inputs[id] = agent_result
236
+ break
237
+
238
+ # if we have tool calls, we should exit the loop
239
+ if tool_calls:
240
+ break
241
+
242
+ self.previous_output = agent_result
243
+
244
+ return AgentResponse(
245
+ content="\n".join(output_text),
246
+ tool_calls=tool_calls,
247
+ done=is_done,
248
+ )
249
+
250
+ def _log_image(self, image_b64: str):
251
+ callbacks = self.computer_agent.callbacks
252
+ for callback in callbacks:
253
+ if isinstance(callback, TrajectorySaverCallback):
254
+ # convert str to bytes
255
+ image_bytes = base64.b64decode(image_b64)
256
+ callback._save_artifact("screenshot_after", image_bytes)
257
+
258
+ async def format_tool_results(
259
+ self,
260
+ tool_calls: list[MCPToolCall],
261
+ tool_results: list[MCPToolResult]
262
+ ) -> list[dict[str, Any]]:
263
+ """Extract latest screenshot from tool results in dict form.
264
+
265
+ Expects results to already be in the message-format content dicts.
266
+ Returns a list of input content dicts suitable for follow-up calls.
267
+ """
268
+ messages = []
269
+
270
+ for call, result in zip(tool_calls, tool_results):
271
+ if call.id not in self.tool_call_inputs:
272
+ # If we don't have the tool call inputs, we should just use the previous output
273
+ previous_output = self.previous_output.copy() or []
274
+
275
+ # First we need to remove any pending computer_calls from the end of previous_output
276
+ while previous_output and previous_output[-1]['type'] == 'computer_call':
277
+ previous_output.pop()
278
+ messages.extend(previous_output)
279
+
280
+ # If the call is a 'response', don't add the result
281
+ if call.name == 'response':
282
+ continue
283
+ # Otherwise, if we have a result, we should add it to the messages
284
+ content = [
285
+ { "type": "input_text", "text": content.text } if isinstance(content, types.TextContent)
286
+ else { "type": "input_image", "image_url": f"data:image/png;base64,{content.data}" } if isinstance(content, types.ImageContent)
287
+ else { "type": "input_text", "text": "" }
288
+ for content in result.content
289
+ ]
290
+ messages.append({
291
+ "role": "user",
292
+ "content": content,
293
+ })
294
+
295
+ continue
296
+
297
+ # Add the assistant's computer call
298
+ messages.extend(self.tool_call_inputs[call.id])
299
+
300
+ if result.isError:
301
+ error_text = "".join([
302
+ content.text
303
+ for content in result.content
304
+ if isinstance(content, types.TextContent)
305
+ ])
306
+
307
+ # Replace computer call with failed tool call
308
+ messages.pop()
309
+ messages.extend(make_failed_tool_call_items(
310
+ tool_name=call.name,
311
+ tool_kwargs=call.arguments or {},
312
+ error_message=error_text,
313
+ call_id=call.id,
314
+ ))
315
+ else:
316
+ # Get the latest screenshot
317
+ screenshots = [
318
+ content.data
319
+ for content in result.content
320
+ if isinstance(content, types.ImageContent)
321
+ ]
322
+
323
+ # Add the resulting screenshot
324
+ if screenshots:
325
+ self._log_image(screenshots[0])
326
+ self.last_screenshot_b64 = screenshots[0]
327
+ messages.append({
328
+ "type": "computer_call_output",
329
+ "call_id": call.id,
330
+ "output": {
331
+ "type": "input_image",
332
+ "image_url": f"data:image/png;base64,{screenshots[0]}"
333
+ },
334
+ })
335
+ else:
336
+ # Otherwise, replace computer call with failed tool call
337
+ messages.pop()
338
+ messages.extend(make_failed_tool_call_items(
339
+ tool_name=call.name,
340
+ tool_kwargs=call.arguments or {},
341
+ error_message="No screenshots returned.",
342
+ call_id=call.id,
343
+ ))
344
+
345
+ return messages
346
+
347
+
348
+ __all__ = [
349
+ "MCPComputerAgent",
350
+ ]
@@ -13,6 +13,10 @@ import uuid
13
13
  from typing import Any, Dict, List, Optional
14
14
 
15
15
  from agent.agent import ComputerAgent as BaseComputerAgent
16
+ from agent.callbacks import PromptInstructionsCallback
17
+ from hud.tools.computer.settings import computer_settings
18
+ from PIL import Image
19
+ from hud.agents import OperatorAgent
16
20
 
17
21
  # OpenAI Responses typed models (required)
18
22
  from openai.types.responses import (
@@ -178,6 +182,83 @@ class FakeAsyncOpenAI:
178
182
  print(traceback.format_exc())
179
183
  raise e
180
184
 
185
+
186
+ # ---------------------------------------------------------------------------
187
+ # Proxy OperatorAgent (moved from __init__.py)
188
+ # ---------------------------------------------------------------------------
189
+
190
+
191
+ class ProxyOperatorAgent(OperatorAgent):
192
+ """OperatorAgent that proxies model calls through our ComputerAgent.
193
+
194
+ Accepts the same config keys we pass via hud.run_dataset `agent_config`:
195
+ - model: str | None
196
+ - allowed_tools: list[str] | None
197
+ Additional kwargs are forwarded to OperatorAgent (if any are supported).
198
+ """
199
+
200
+ def __init__(
201
+ self,
202
+ *,
203
+ model: str | None = None,
204
+ allowed_tools: list[str] | None = None,
205
+ trajectory_dir: str | dict | None = None,
206
+ # === ComputerAgent kwargs ===
207
+ tools: list[Any] | None = None,
208
+ custom_loop: Any | None = None,
209
+ only_n_most_recent_images: int | None = None,
210
+ callbacks: list[Any] | None = None,
211
+ instructions: str | None = None,
212
+ verbosity: int | None = None,
213
+ max_retries: int | None = 3,
214
+ screenshot_delay: float | int = 0.5,
215
+ use_prompt_caching: bool | None = False,
216
+ max_trajectory_budget: float | dict | None = None,
217
+ telemetry_enabled: bool | None = True,
218
+ **kwargs: Any,
219
+ ) -> None:
220
+ model = model or "computer-use-preview"
221
+ allowed_tools = allowed_tools or ["openai_computer"]
222
+
223
+ computer_shim = {
224
+ 'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
225
+ 'environment': 'linux',
226
+ 'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
227
+ }
228
+ # Build tools ensuring the computer_shim is included
229
+ agent_tools: list[Any] = [computer_shim]
230
+ if tools:
231
+ agent_tools.extend(tools)
232
+
233
+ # Build callbacks, injecting prompt instructions if provided
234
+ agent_callbacks = list(callbacks or [])
235
+ if instructions:
236
+ agent_callbacks.append(PromptInstructionsCallback(instructions))
237
+
238
+ computer_agent = BaseComputerAgent(
239
+ model=model,
240
+ tools=agent_tools,
241
+ custom_loop=custom_loop,
242
+ only_n_most_recent_images=only_n_most_recent_images,
243
+ callbacks=agent_callbacks,
244
+ verbosity=verbosity,
245
+ trajectory_dir=trajectory_dir,
246
+ max_retries=max_retries,
247
+ screenshot_delay=screenshot_delay,
248
+ use_prompt_caching=use_prompt_caching,
249
+ max_trajectory_budget=max_trajectory_budget,
250
+ telemetry_enabled=telemetry_enabled,
251
+ )
252
+ model_client = FakeAsyncOpenAI(computer_agent)
253
+
254
+ super().__init__(
255
+ model_client=model_client, # type: ignore[arg-type]
256
+ model=model,
257
+ allowed_tools=allowed_tools,
258
+ **kwargs,
259
+ )
260
+
181
261
  __all__ = [
182
262
  "FakeAsyncOpenAI",
263
+ "ProxyOperatorAgent",
183
264
  ]
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
6
6
 
7
7
  [project]
8
8
  name = "cua-agent"
9
- version = "0.4.28"
9
+ version = "0.4.29"
10
10
  description = "CUA (Computer Use) Agent for AI-driven computer interaction"
11
11
  readme = "README.md"
12
12
  authors = [
@@ -56,7 +56,7 @@ cli = [
56
56
  "yaspin>=3.1.0",
57
57
  ]
58
58
  hud = [
59
- "hud-python>=0.4.12,<0.5.0",
59
+ "hud-python==0.4.19",
60
60
  ]
61
61
  all = [
62
62
  "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
@@ -66,7 +66,7 @@ all = [
66
66
  "gradio>=5.23.3",
67
67
  "python-dotenv>=1.0.1",
68
68
  "yaspin>=3.1.0",
69
- "hud-python>=0.4.12,<0.5.0",
69
+ "hud-python==0.4.19",
70
70
  ]
71
71
 
72
72
  [tool.uv]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes