cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
@@ -7,26 +7,33 @@ OpenAI-like response blocks. We intentionally only support a single-step call
7
7
  by consuming the first yielded result from `ComputerAgent.run()`.
8
8
  """
9
9
 
10
- import traceback
11
10
  import time
11
+ import traceback
12
12
  import uuid
13
13
  from typing import Any, Dict, List, Optional
14
14
 
15
15
  from agent.agent import ComputerAgent as BaseComputerAgent
16
+ from agent.callbacks import PromptInstructionsCallback
17
+ from hud.agents import OperatorAgent
18
+ from hud.tools.computer.settings import computer_settings
16
19
 
17
20
  # OpenAI Responses typed models (required)
18
21
  from openai.types.responses import (
19
22
  Response,
23
+ ResponseComputerToolCall,
20
24
  ResponseInputParam,
21
25
  ResponseOutputItem,
22
- ResponseComputerToolCall,
23
26
  ResponseOutputMessage,
24
27
  ResponseOutputText,
25
28
  ResponseReasoningItem,
26
29
  ResponseUsage,
27
30
  )
31
+ from PIL import Image
32
+
28
33
 
29
- def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
34
+ def _map_agent_output_to_openai_blocks(
35
+ output_items: List[Dict[str, Any]],
36
+ ) -> List[ResponseOutputItem]:
30
37
  """Map our agent output items to OpenAI ResponseOutputItem typed models.
31
38
 
32
39
  Only a subset is supported: computer_call, assistant message (text), and reasoning.
@@ -36,14 +43,16 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
36
43
  for item in output_items or []:
37
44
  t = item.get("type")
38
45
  if t == "computer_call":
39
- comp = ResponseComputerToolCall.model_validate({
40
- "id": item.get("id") or f"cu_{uuid.uuid4().hex}",
41
- "type": "computer_call",
42
- "call_id": item["call_id"],
43
- "action": item["action"],
44
- "pending_safety_checks": item.get("pending_safety_checks", []),
45
- "status": "completed",
46
- })
46
+ comp = ResponseComputerToolCall.model_validate(
47
+ {
48
+ "id": item.get("id") or f"cu_{uuid.uuid4().hex}",
49
+ "type": "computer_call",
50
+ "call_id": item["call_id"],
51
+ "action": item["action"],
52
+ "pending_safety_checks": item.get("pending_safety_checks", []),
53
+ "status": "completed",
54
+ }
55
+ )
47
56
  blocks.append(comp)
48
57
  # we will exit early here as the responses api only supports a single step
49
58
  break
@@ -51,31 +60,38 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
51
60
  content_blocks: List[ResponseOutputText] = []
52
61
  for c in item.get("content", []) or []:
53
62
  content_blocks.append(
54
- ResponseOutputText.model_validate({
55
- "type": "output_text",
56
- "text": c["text"],
57
- "annotations": [],
58
- })
63
+ ResponseOutputText.model_validate(
64
+ {
65
+ "type": "output_text",
66
+ "text": c["text"],
67
+ "annotations": [],
68
+ }
69
+ )
59
70
  )
60
71
  if content_blocks:
61
- msg = ResponseOutputMessage.model_validate({
62
- "id": item.get("id") or f"msg_{uuid.uuid4()}",
63
- "type": "message",
64
- "role": "assistant",
65
- "status": "completed",
66
- "content": [ct.model_dump() for ct in content_blocks],
67
- })
72
+ msg = ResponseOutputMessage.model_validate(
73
+ {
74
+ "id": item.get("id") or f"msg_{uuid.uuid4()}",
75
+ "type": "message",
76
+ "role": "assistant",
77
+ "status": "completed",
78
+ "content": [ct.model_dump() for ct in content_blocks],
79
+ }
80
+ )
68
81
  blocks.append(msg)
69
82
  elif t == "reasoning":
70
- reasoning = ResponseReasoningItem.model_validate({
71
- "id": item.get("id") or f"rsn_{uuid.uuid4()}",
72
- "type": "reasoning",
73
- "summary": item["summary"],
74
- })
83
+ reasoning = ResponseReasoningItem.model_validate(
84
+ {
85
+ "id": item.get("id") or f"rsn_{uuid.uuid4()}",
86
+ "type": "reasoning",
87
+ "summary": item["summary"],
88
+ }
89
+ )
75
90
  blocks.append(reasoning)
76
91
  # Unhandled types are ignored
77
92
  return blocks
78
93
 
94
+
79
95
  def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
80
96
  out: List[Dict[str, Any]] = []
81
97
  for it in list(items):
@@ -88,6 +104,7 @@ def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
88
104
  out.append(dict(it)) # may raise if not mapping
89
105
  return out
90
106
 
107
+
91
108
  class FakeAsyncOpenAI:
92
109
  """Minimal fake OpenAI client with only `responses.create` implemented.
93
110
 
@@ -128,10 +145,12 @@ class FakeAsyncOpenAI:
128
145
  # Pre-pend instructions message
129
146
  effective_input = full_input
130
147
  if instructions:
131
- effective_input = [{
132
- "role": "user",
133
- "content": instructions,
134
- }] + full_input
148
+ effective_input = [
149
+ {
150
+ "role": "user",
151
+ "content": instructions,
152
+ }
153
+ ] + full_input
135
154
 
136
155
  # Run a single iteration of the ComputerAgent
137
156
  agent_result: Optional[Dict[str, Any]] = None
@@ -148,36 +167,131 @@ class FakeAsyncOpenAI:
148
167
  blocks_to_cache = full_input + output
149
168
  for b in blocks_to_cache:
150
169
  bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
151
- self.blocks_cache[bid] = b # type: ignore[assignment]
170
+ self.blocks_cache[bid] = b # type: ignore[assignment]
152
171
  block_ids.append(bid)
153
172
  response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
154
173
  self.context_cache[response_id] = block_ids
155
174
 
156
175
  try:
157
- return Response.model_validate({
158
- "id": response_id,
159
- "created_at": time.time(),
160
- "object": "response",
161
- "model": model,
162
- "output": output,
163
- "parallel_tool_calls": False,
164
- "tool_choice": "auto",
165
- "tools": [],
166
- "previous_response_id": previous_response_id,
167
- "usage": ResponseUsage.model_validate({
168
- "input_tokens": usage.get("input_tokens", 0),
169
- "output_tokens": usage.get("output_tokens", 0),
170
- "total_tokens": usage.get("total_tokens", 0),
171
- "input_tokens_details": usage.get("input_tokens_details", { "cached_tokens": 0 }),
172
- "output_tokens_details": usage.get("output_tokens_details", { "reasoning_tokens": 0 }),
173
- }),
174
- })
176
+ return Response.model_validate(
177
+ {
178
+ "id": response_id,
179
+ "created_at": time.time(),
180
+ "object": "response",
181
+ "model": model,
182
+ "output": output,
183
+ "parallel_tool_calls": False,
184
+ "tool_choice": "auto",
185
+ "tools": [],
186
+ "previous_response_id": previous_response_id,
187
+ "usage": ResponseUsage.model_validate(
188
+ {
189
+ "input_tokens": usage.get("input_tokens", 0),
190
+ "output_tokens": usage.get("output_tokens", 0),
191
+ "total_tokens": usage.get("total_tokens", 0),
192
+ "input_tokens_details": usage.get(
193
+ "input_tokens_details", {"cached_tokens": 0}
194
+ ),
195
+ "output_tokens_details": usage.get(
196
+ "output_tokens_details", {"reasoning_tokens": 0}
197
+ ),
198
+ }
199
+ ),
200
+ }
201
+ )
175
202
  except Exception as e:
176
- print(f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ", e)
203
+ print(
204
+ f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ",
205
+ e,
206
+ )
177
207
  if attempt == max_retries - 1:
178
208
  print(traceback.format_exc())
179
209
  raise e
180
210
 
211
+
212
+ # ---------------------------------------------------------------------------
213
+ # Proxy OperatorAgent (moved from __init__.py)
214
+ # ---------------------------------------------------------------------------
215
+
216
+
217
+ class ProxyOperatorAgent(OperatorAgent):
218
+ """OperatorAgent that proxies model calls through our ComputerAgent.
219
+
220
+ Accepts the same config keys we pass via hud.run_dataset `agent_config`:
221
+ - model: str | None
222
+ - allowed_tools: list[str] | None
223
+ Additional kwargs are forwarded to OperatorAgent (if any are supported).
224
+ """
225
+
226
+ def __init__(
227
+ self,
228
+ *,
229
+ model: str | None = None,
230
+ allowed_tools: list[str] | None = None,
231
+ trajectory_dir: str | dict | None = None,
232
+ # === ComputerAgent kwargs ===
233
+ tools: list[Any] | None = None,
234
+ custom_loop: Any | None = None,
235
+ only_n_most_recent_images: int | None = None,
236
+ callbacks: list[Any] | None = None,
237
+ instructions: str | None = None,
238
+ verbosity: int | None = None,
239
+ max_retries: int | None = 3,
240
+ screenshot_delay: float | int = 0.5,
241
+ use_prompt_caching: bool | None = False,
242
+ max_trajectory_budget: float | dict | None = None,
243
+ telemetry_enabled: bool | None = True,
244
+ **kwargs: Any,
245
+ ) -> None:
246
+ model = model or "computer-use-preview"
247
+ allowed_tools = allowed_tools or ["openai_computer"]
248
+
249
+ computer_shim = {
250
+ "screenshot": lambda: Image.new(
251
+ "RGB",
252
+ (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT),
253
+ ),
254
+ "environment": "linux",
255
+ "dimensions": (
256
+ computer_settings.OPENAI_COMPUTER_WIDTH,
257
+ computer_settings.OPENAI_COMPUTER_HEIGHT,
258
+ ),
259
+ }
260
+ # Build tools ensuring the computer_shim is included
261
+ agent_tools: list[Any] = [computer_shim]
262
+ if tools:
263
+ agent_tools.extend(tools)
264
+
265
+ # Build callbacks, injecting prompt instructions if provided
266
+ agent_callbacks = list(callbacks or [])
267
+ if instructions:
268
+ agent_callbacks.append(PromptInstructionsCallback(instructions))
269
+
270
+ computer_agent = BaseComputerAgent(
271
+ model=model,
272
+ tools=agent_tools,
273
+ custom_loop=custom_loop,
274
+ only_n_most_recent_images=only_n_most_recent_images,
275
+ callbacks=agent_callbacks,
276
+ verbosity=verbosity,
277
+ trajectory_dir=trajectory_dir,
278
+ max_retries=max_retries,
279
+ screenshot_delay=screenshot_delay,
280
+ use_prompt_caching=use_prompt_caching,
281
+ max_trajectory_budget=max_trajectory_budget,
282
+ telemetry_enabled=telemetry_enabled,
283
+ )
284
+ model_client = FakeAsyncOpenAI(computer_agent)
285
+
286
+ super().__init__(
287
+ model_client=model_client, # type: ignore[arg-type]
288
+ model=model,
289
+ allowed_tools=allowed_tools,
290
+ **kwargs,
291
+ )
292
+
293
+
181
294
  __all__ = [
182
295
  "FakeAsyncOpenAI",
296
+ "ProxyOperatorAgent",
183
297
  ]
agent/loops/__init__.py CHANGED
@@ -1,14 +1,44 @@
1
- """
2
- Agent loops for agent
3
- """
4
-
5
- # Import the loops to register them
6
- from . import anthropic
7
- from . import openai
8
- from . import uitars
9
- from . import omniparser
10
- from . import gta1
11
- from . import composed_grounded
12
- from . import glm45v
13
-
14
- __all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded", "glm45v"]
1
+ """
2
+ Agent loops for agent
3
+ """
4
+
5
+ # Import the loops to register them
6
+ from . import (
7
+ anthropic,
8
+ composed_grounded,
9
+ fara,
10
+ gelato,
11
+ gemini,
12
+ generic_vlm,
13
+ glm45v,
14
+ gta1,
15
+ holo,
16
+ internvl,
17
+ moondream3,
18
+ omniparser,
19
+ openai,
20
+ opencua,
21
+ uiins,
22
+ uitars,
23
+ uitars2,
24
+ )
25
+
26
+ __all__ = [
27
+ "anthropic",
28
+ "composed_grounded",
29
+ "gelato",
30
+ "gemini",
31
+ "generic_vlm",
32
+ "fara",
33
+ "glm45v",
34
+ "gta1",
35
+ "holo",
36
+ "internvl",
37
+ "moondream3",
38
+ "omniparser",
39
+ "openai",
40
+ "opencua",
41
+ "uiins",
42
+ "uitars",
43
+ "uitars2",
44
+ ]