cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/integrations/hud/proxy.py ADDED
@@ -0,0 +1,297 @@
1
+ """HUD ComputerAgent wrapper and Fake AsyncOpenAI client.
2
+
3
+ Provides FakeAsyncOpenAI that adapts our ComputerAgent to the OpenAI Responses
4
+ interface needed by HUD's OperatorAgent. It implements only `responses.create`
5
+ and returns an OpenAI Response object with `id` and `output` fields, where `output` is a list of
6
+ OpenAI-like response blocks. We intentionally only support a single-step call
7
+ by consuming the first yielded result from `ComputerAgent.run()`.
8
+ """
9
+
10
+ import time
11
+ import traceback
12
+ import uuid
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ from agent.agent import ComputerAgent as BaseComputerAgent
16
+ from agent.callbacks import PromptInstructionsCallback
17
+ from hud.agents import OperatorAgent
18
+ from hud.tools.computer.settings import computer_settings
19
+
20
+ # OpenAI Responses typed models (required)
21
+ from openai.types.responses import (
22
+ Response,
23
+ ResponseComputerToolCall,
24
+ ResponseInputParam,
25
+ ResponseOutputItem,
26
+ ResponseOutputMessage,
27
+ ResponseOutputText,
28
+ ResponseReasoningItem,
29
+ ResponseUsage,
30
+ )
31
+ from PIL import Image
32
+
33
+
34
def _map_agent_output_to_openai_blocks(
    output_items: List[Dict[str, Any]],
) -> List[ResponseOutputItem]:
    """Map our agent output items to OpenAI ResponseOutputItem typed models.

    Only a subset is supported: computer_call, assistant message (text), and
    reasoning. Unknown item types are ignored, and so are message content
    parts that carry no ``text`` (previously such a part raised ``KeyError``).

    Args:
        output_items: Agent output items as plain dicts. ``None`` or an empty
            list yields an empty result.

    Returns:
        The typed blocks, in input order. Mapping stops right after the first
        ``computer_call`` because the Responses API only supports a single
        step per response.

    Raises:
        KeyError: If a ``computer_call`` item lacks ``call_id`` or ``action``,
            or a ``reasoning`` item lacks ``summary``.
        Exception: Whatever ``model_validate`` raises when an item does not
            match the target model's schema.
    """
    blocks: List[ResponseOutputItem] = []
    for item in output_items or []:
        item_type = item.get("type")

        if item_type == "computer_call":
            blocks.append(
                ResponseComputerToolCall.model_validate(
                    {
                        "id": item.get("id") or f"cu_{uuid.uuid4().hex}",
                        "type": "computer_call",
                        "call_id": item["call_id"],
                        "action": item["action"],
                        "pending_safety_checks": item.get("pending_safety_checks", []),
                        "status": "completed",
                    }
                )
            )
            # we will exit early here as the responses api only supports a single step
            break

        if item_type == "message" and item.get("role") == "assistant":
            content = item.get("content") or []
            if isinstance(content, str):
                # A bare string is treated as one text part instead of being
                # iterated character by character.
                content = [{"text": content}]

            # Keep only parts that actually carry text; anything else
            # (images, refusals, ...) is outside the supported subset.
            content_blocks: List[ResponseOutputText] = [
                ResponseOutputText.model_validate(
                    {
                        "type": "output_text",
                        "text": part["text"],
                        "annotations": [],
                    }
                )
                for part in content
                if isinstance(part, dict) and "text" in part
            ]
            if content_blocks:
                blocks.append(
                    ResponseOutputMessage.model_validate(
                        {
                            "id": item.get("id") or f"msg_{uuid.uuid4()}",
                            "type": "message",
                            "role": "assistant",
                            "status": "completed",
                            "content": [ct.model_dump() for ct in content_blocks],
                        }
                    )
                )
            continue

        if item_type == "reasoning":
            blocks.append(
                ResponseReasoningItem.model_validate(
                    {
                        "id": item.get("id") or f"rsn_{uuid.uuid4()}",
                        "type": "reasoning",
                        "summary": item["summary"],
                    }
                )
            )
        # Unhandled types are ignored
    return blocks
93
+
94
+
95
def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
    """Convert an iterable of blocks into a list of plain dictionaries.

    Each element is converted by the first rule that applies:

    * objects exposing ``model_dump`` are dumped through it;
    * ``dict`` instances are passed through unchanged (same object);
    * anything else is handed to ``dict(...)``, which raises if the element
      is neither a mapping nor an iterable of key/value pairs.
    """

    def _as_plain_dict(entry: Any) -> Dict[str, Any]:
        if hasattr(entry, "model_dump"):
            return entry.model_dump()  # type: ignore[attr-defined]
        if isinstance(entry, dict):
            return entry
        # Strict: no fallback beyond dict(); may raise if not a mapping
        return dict(entry)

    return [_as_plain_dict(entry) for entry in items]
106
+
107
+
108
class FakeAsyncOpenAI:
    """Minimal fake OpenAI client with only `responses.create` implemented.

    It uses a provided `ComputerAgent` instance to produce a single-step
    response compatible with HUD's OperatorAgent loop.
    """

    def __init__(self, computer_agent: BaseComputerAgent) -> None:
        self._agent = computer_agent
        self.responses = self._Responses(self)

    class _Responses:
        """Stand-in for the client's `responses` namespace, backed by the agent."""

        def __init__(self, parent: "FakeAsyncOpenAI") -> None:
            # Caches for cross-call context when using previous_response_id:
            # blocks_cache maps a block id to the block itself, and
            # context_cache maps a response id to the ordered block ids that
            # made up its conversation (input followed by output).
            self.blocks_cache: Dict[str, ResponseInputParam | ResponseOutputItem] = {}
            self.context_cache: Dict[str, List[str]] = {}
            self.agent = parent._agent

        async def create(
            self,
            *,
            model: str,
            input: ResponseInputParam,
            tools: Optional[List[Dict[str, Any]]] = None,
            instructions: Optional[str] = None,
            previous_response_id: Optional[str] = None,
            max_retries: int = 5,
            **_: Any,
        ) -> Any:
            """Run one agent step and wrap its result as an OpenAI `Response`.

            Args:
                model: Model name echoed back in the returned response.
                input: Input items for this call.
                tools: Accepted for interface compatibility; not used here.
                instructions: Optional text prepended to the agent input as a
                    user message. It is not stored in the context cache.
                previous_response_id: Id of an earlier response from this
                    object; its cached blocks are prepended to ``input``.
                max_retries: Number of attempts made when the agent result
                    fails `Response` validation. Values below 1 are treated
                    as 1 so the call never silently returns ``None``.
                **_: Extra keyword arguments are accepted and ignored.

            Returns:
                The validated `Response` built from the first result yielded
                by the agent.

            Raises:
                KeyError: If ``previous_response_id`` is not in the cache.
                RuntimeError: If the agent yields no result.
                Exception: The validation error of the final attempt.
            """
            # A non-positive max_retries would skip the loop entirely and
            # fall through to an implicit `return None`.
            attempts = max(1, max_retries)

            # The effective input does not depend on the attempt number, so it
            # is prepared once up front.
            full_input = input
            if previous_response_id is not None:
                try:
                    prev_block_ids = self.context_cache[previous_response_id]
                except KeyError:
                    raise KeyError(
                        f"Unknown previous_response_id: {previous_response_id!r}"
                    ) from None
                prev_blocks = [self.blocks_cache[b_id] for b_id in prev_block_ids]
                full_input = _to_plain_dict_list(prev_blocks + input)

            # Pre-pend instructions message
            effective_input = full_input
            if instructions:
                effective_input = [
                    {
                        "role": "user",
                        "content": instructions,
                    }
                ] + full_input

            for attempt in range(attempts):
                # Run a single iteration of the ComputerAgent
                agent_result: Optional[Dict[str, Any]] = None
                async for result in self.agent.run(effective_input):  # type: ignore[arg-type]
                    agent_result = result
                    break
                # Explicit check instead of `assert`, which is stripped under -O.
                if agent_result is None:
                    raise RuntimeError("Agent failed to produce result")

                output = _map_agent_output_to_openai_blocks(agent_result["output"])
                # A missing or empty usage record is reported as all zeros.
                usage = agent_result.get("usage") or {}

                # Cache conversation context using the last response id
                block_ids: List[str] = []
                blocks_to_cache = full_input + output
                for b in blocks_to_cache:
                    # Plain dicts have no `id` attribute, so they get a
                    # content-derived temporary id.
                    bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
                    self.blocks_cache[bid] = b  # type: ignore[assignment]
                    block_ids.append(bid)
                response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
                self.context_cache[response_id] = block_ids

                try:
                    return Response.model_validate(
                        {
                            "id": response_id,
                            "created_at": time.time(),
                            "object": "response",
                            "model": model,
                            "output": output,
                            "parallel_tool_calls": False,
                            "tool_choice": "auto",
                            "tools": [],
                            "previous_response_id": previous_response_id,
                            "usage": ResponseUsage.model_validate(
                                {
                                    "input_tokens": usage.get("input_tokens", 0),
                                    "output_tokens": usage.get("output_tokens", 0),
                                    "total_tokens": usage.get("total_tokens", 0),
                                    "input_tokens_details": usage.get(
                                        "input_tokens_details", {"cached_tokens": 0}
                                    ),
                                    "output_tokens_details": usage.get(
                                        "output_tokens_details", {"reasoning_tokens": 0}
                                    ),
                                }
                            ),
                        }
                    )
                except Exception as e:
                    print(
                        f"Error while validating agent response (attempt {attempt + 1}/{attempts}): ",
                        e,
                    )
                    if attempt == attempts - 1:
                        print(traceback.format_exc())
                        # Bare raise keeps the original traceback intact.
                        raise
210
+
211
+
212
+ # ---------------------------------------------------------------------------
213
+ # Proxy OperatorAgent (moved from __init__.py)
214
+ # ---------------------------------------------------------------------------
215
+
216
+
217
class ProxyOperatorAgent(OperatorAgent):
    """OperatorAgent whose model calls are routed through our ComputerAgent.

    The constructor builds a ComputerAgent, wraps it in `FakeAsyncOpenAI`, and
    hands that wrapper to `OperatorAgent` as its ``model_client``.

    Config keys accepted (the same ones passed via hud.run_dataset
    `agent_config`):
    - model: str | None (falls back to "computer-use-preview")
    - allowed_tools: list[str] | None (falls back to ["openai_computer"])
    The remaining named parameters configure the ComputerAgent; any other
    keyword arguments are forwarded to OperatorAgent unchanged.
    """

    def __init__(
        self,
        *,
        model: str | None = None,
        allowed_tools: list[str] | None = None,
        trajectory_dir: str | dict | None = None,
        # === ComputerAgent kwargs ===
        tools: list[Any] | None = None,
        custom_loop: Any | None = None,
        only_n_most_recent_images: int | None = None,
        callbacks: list[Any] | None = None,
        instructions: str | None = None,
        verbosity: int | None = None,
        max_retries: int | None = 3,
        screenshot_delay: float | int = 0.5,
        use_prompt_caching: bool | None = False,
        max_trajectory_budget: float | dict | None = None,
        telemetry_enabled: bool | None = True,
        **kwargs: Any,
    ) -> None:
        resolved_model = model or "computer-use-preview"
        resolved_allowed_tools = allowed_tools or ["openai_computer"]

        def _placeholder_screenshot() -> Any:
            # Size is read from the HUD settings at call time, not captured
            # once at construction.
            size = (
                computer_settings.OPENAI_COMPUTER_WIDTH,
                computer_settings.OPENAI_COMPUTER_HEIGHT,
            )
            return Image.new("RGB", size)

        # Dict-based computer tool handed to the ComputerAgent; its screenshot
        # is a freshly created image rather than a real capture.
        computer_shim = {
            "screenshot": _placeholder_screenshot,
            "environment": "linux",
            "dimensions": (
                computer_settings.OPENAI_COMPUTER_WIDTH,
                computer_settings.OPENAI_COMPUTER_HEIGHT,
            ),
        }

        # The shim always comes first, followed by any caller-supplied tools.
        agent_tools: list[Any] = [computer_shim, *(tools or [])]

        # Copy the caller's callbacks so the list passed in is never mutated,
        # then inject prompt instructions if provided.
        agent_callbacks = [*(callbacks or [])]
        if instructions:
            agent_callbacks.append(PromptInstructionsCallback(instructions))

        model_client = FakeAsyncOpenAI(
            BaseComputerAgent(
                model=resolved_model,
                tools=agent_tools,
                custom_loop=custom_loop,
                only_n_most_recent_images=only_n_most_recent_images,
                callbacks=agent_callbacks,
                verbosity=verbosity,
                trajectory_dir=trajectory_dir,
                max_retries=max_retries,
                screenshot_delay=screenshot_delay,
                use_prompt_caching=use_prompt_caching,
                max_trajectory_budget=max_trajectory_budget,
                telemetry_enabled=telemetry_enabled,
            )
        )

        super().__init__(
            model_client=model_client,  # type: ignore[arg-type]
            model=resolved_model,
            allowed_tools=resolved_allowed_tools,
            **kwargs,
        )
292
+
293
+
294
# Public API of this module.
__all__ = ["FakeAsyncOpenAI", "ProxyOperatorAgent"]
agent/loops/__init__.py CHANGED
@@ -1,14 +1,44 @@
1
- """
2
- Agent loops for agent
3
- """
4
-
5
- # Import the loops to register them
6
- from . import anthropic
7
- from . import openai
8
- from . import uitars
9
- from . import omniparser
10
- from . import gta1
11
- from . import composed_grounded
12
- from . import glm45v
13
-
14
- __all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded", "glm45v"]
1
+ """
2
+ Agent loops for agent
3
+ """
4
+
5
+ # Import the loops to register them
6
+ from . import (
7
+ anthropic,
8
+ composed_grounded,
9
+ fara,
10
+ gelato,
11
+ gemini,
12
+ generic_vlm,
13
+ glm45v,
14
+ gta1,
15
+ holo,
16
+ internvl,
17
+ moondream3,
18
+ omniparser,
19
+ openai,
20
+ opencua,
21
+ uiins,
22
+ uitars,
23
+ uitars2,
24
+ )
25
+
26
+ __all__ = [
27
+ "anthropic",
28
+ "composed_grounded",
29
+ "gelato",
30
+ "gemini",
31
+ "generic_vlm",
32
+ "fara",
33
+ "glm45v",
34
+ "gta1",
35
+ "holo",
36
+ "internvl",
37
+ "moondream3",
38
+ "omniparser",
39
+ "openai",
40
+ "opencua",
41
+ "uiins",
42
+ "uitars",
43
+ "uitars2",
44
+ ]