cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cua-agent might be problematic.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/loops/openai.py CHANGED
@@ -6,12 +6,14 @@ import asyncio
 import base64
 import json
 from io import BytesIO
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
 from PIL import Image
 
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+
 
 async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     """Map a computer tool to OpenAI's computer-use-preview tool schema"""
@@ -21,26 +23,26 @@ async def _map_computer_tool_to_openai(computer_handler: Any) -> Dict[str, Any]:
     except Exception:
         # Fallback to default dimensions if method fails
         width, height = 1024, 768
-
+
     # Get environment from the computer handler
     try:
         environment = await computer_handler.get_environment()
     except Exception:
         # Fallback to default environment if method fails
         environment = "linux"
-
+
     return {
         "type": "computer_use_preview",
        "display_width": width,
        "display_height": height,
-        "environment": environment # mac, windows, linux, browser
+        "environment": environment,  # mac, windows, linux, browser
     }
 
 
 async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
     """Prepare tools for OpenAI API format"""
     openai_tools = []
-
+
     for schema in tool_schemas:
         if schema["type"] == "computer":
             # Map computer tool to OpenAI format
@@ -49,18 +51,19 @@ async def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools
         elif schema["type"] == "function":
             # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
             # Schema should be: {type, name, description, parameters}
-            openai_tools.append({ "type": "function", **schema["function"] })
-
+            openai_tools.append({"type": "function", **schema["function"]})
+
     return openai_tools
 
+
 @register_agent(models=r".*(^|/)computer-use-preview")
 class OpenAIComputerUseConfig:
     """
     OpenAI computer-use-preview agent configuration using liteLLM responses.
-
+
     Supports OpenAI's computer use preview models.
     """
-
+
     async def predict_step(
         self,
         messages: List[Dict[str, Any]],
@@ -74,11 +77,11 @@ class OpenAIComputerUseConfig:
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """
         Predict the next step based on input items.
-
+
         Args:
             messages: Input items following Responses format
             model: Model name to use
@@ -91,12 +94,12 @@ class OpenAIComputerUseConfig:
             _on_usage: Callback for usage tracking
             _on_screenshot: Callback for screenshot events
             **kwargs: Additional arguments
-
+
         Returns:
             Dictionary with "output" (output items) and "usage" array
         """
         tools = tools or []
-
+
         # Prepare tools for OpenAI API
         openai_tools = await _prepare_tools_for_openai(tools)
 
@@ -109,16 +112,16 @@ class OpenAIComputerUseConfig:
             "reasoning": {"summary": "concise"},
             "truncation": "auto",
             "num_retries": max_retries,
-            **kwargs
+            **kwargs,
         }
-
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
-
+
         # Use liteLLM responses
         response = await litellm.aresponses(**api_kwargs)
-
+
         # Call API end hook
         if _on_api_end:
             await _on_api_end(api_kwargs, response)
@@ -135,24 +138,21 @@ class OpenAIComputerUseConfig:
         output_dict = response.model_dump()
         output_dict["usage"] = usage
         return output_dict
-
+
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str
+        self, model: str, image_b64: str, instruction: str
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates based on image and instruction.
-
+
         Uses OpenAI computer-use-preview with manually constructed input items
         and a prompt that instructs the agent to only output clicks.
-
+
         Args:
             model: Model name to use
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """
@@ -160,7 +160,7 @@ class OpenAIComputerUseConfig:
         # Manually construct input items with image and click instruction
         input_items = [
             {
-                "role": "user", 
+                "role": "user",
                 "content": f"""You are a UI grounding expert. Follow these guidelines:
 
 1. NEVER ask for confirmation. Complete all tasks autonomously.
@@ -172,19 +172,16 @@ class OpenAIComputerUseConfig:
 7. Be decisive and action-oriented. Complete the requested task fully.
 
 Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
-Task: Click {instruction}. Output ONLY a click action on the target element."""
+Task: Click {instruction}. Output ONLY a click action on the target element.""",
             },
             {
                 "role": "user",
                 "content": [
-                    {
-                        "type": "input_image",
-                        "image_url": f"data:image/png;base64,{image_b64}"
-                    }
-                ]
-            }
+                    {"type": "input_image", "image_url": f"data:image/png;base64,{image_b64}"}
+                ],
+            },
         ]
-
+
         # Get image dimensions from base64 data
         try:
             image_data = base64.b64decode(image_b64)
@@ -193,15 +190,15 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
         except Exception:
             # Fallback to default dimensions if image parsing fails
             display_width, display_height = 1024, 768
-
+
         # Prepare computer tool for click actions
         computer_tool = {
             "type": "computer_use_preview",
             "display_width": display_width,
             "display_height": display_height,
-            "environment": "windows"
+            "environment": "windows",
         }
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
@@ -210,32 +207,34 @@ Task: Click {instruction}. Output ONLY a click action on the target element."""
             "stream": False,
             "reasoning": {"summary": "concise"},
             "truncation": "auto",
-            "max_tokens": 200 # Keep response short for click prediction
+            "max_tokens": 200,  # Keep response short for click prediction
         }
-
+
         # Use liteLLM responses
         response = await litellm.aresponses(**api_kwargs)
-
+
         # Extract click coordinates from response output
         output_dict = response.model_dump()
-        output_items = output_dict.get("output", [])
-
+        output_items = output_dict.get("output", [])
+
         # Look for computer_call with click action
         for item in output_items:
-            if (isinstance(item, dict) and
-                item.get("type") == "computer_call" and
-                isinstance(item.get("action"), dict)):
-
+            if (
+                isinstance(item, dict)
+                and item.get("type") == "computer_call"
+                and isinstance(item.get("action"), dict)
+            ):
+
                 action = item["action"]
                 if action.get("x") is not None and action.get("y") is not None:
                     return (int(action.get("x")), int(action.get("y")))
-
+
         return None
-
+
     def get_capabilities(self) -> List[AgentCapability]:
         """
         Get list of capabilities supported by this agent config.
-
+
         Returns:
             List of capability strings
         """
agent/loops/opencua.py CHANGED
@@ -4,20 +4,22 @@ Based on OpenCUA model for GUI grounding tasks.
 """
 
 import asyncio
+import base64
 import json
+import math
 import re
-import base64
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
-from io import BytesIO
 import uuid
-from PIL import Image
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
 import litellm
-import math
+from PIL import Image
 
-from .composed_grounded import ComposedGroundedConfig
 from ..decorators import register_agent
-from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+from .composed_grounded import ComposedGroundedConfig
+
 
 def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
     """Extract coordinates from pyautogui.click(x=..., y=...) format."""
@@ -32,10 +34,11 @@ def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
     except Exception:
         return None
 
+
 @register_agent(models=r"(?i).*OpenCUA.*")
 class OpenCUAConfig(ComposedGroundedConfig):
     """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction."""
-
+
     def __init__(self):
         super().__init__()
         self.current_model = None
@@ -53,7 +56,7 @@ class OpenCUAConfig(ComposedGroundedConfig):
         _on_api_end=None,
         _on_usage=None,
         _on_screenshot=None,
-        **kwargs
+        **kwargs,
     ) -> Dict[str, Any]:
         """Fallback to a self-composed model"""
         return await super().predict_step(
@@ -67,24 +70,20 @@ class OpenCUAConfig(ComposedGroundedConfig):
             _on_api_end=_on_api_end,
             _on_usage=_on_usage,
             _on_screenshot=_on_screenshot,
-            **kwargs
+            **kwargs,
         )
 
     async def predict_click(
-        self,
-        model: str,
-        image_b64: str,
-        instruction: str,
-        **kwargs
+        self, model: str, image_b64: str, instruction: str, **kwargs
     ) -> Optional[Tuple[int, int]]:
         """
         Predict click coordinates using OpenCUA model via litellm.acompletion.
-
+
         Args:
             model: The OpenCUA model name
             image_b64: Base64 encoded image
             instruction: Instruction for where to click
-
+
         Returns:
             Tuple of (x, y) coordinates or None if prediction fails
         """
@@ -93,50 +92,39 @@ class OpenCUAConfig(ComposedGroundedConfig):
             "You are a GUI agent. You are given a task and a screenshot of the screen. "
             "You need to perform a series of pyautogui actions to complete the task."
         )
-
-        system_message = {
-            "role": "system",
-            "content": system_prompt
-        }
-
+
+        system_message = {"role": "system", "content": system_prompt}
+
         # Prepare user message with image and instruction
         user_message = {
             "role": "user",
             "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/png;base64,{image_b64}"
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": f"Click on {instruction}"
-                }
-            ]
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
+                {"type": "text", "text": f"Click on {instruction}"},
+            ],
         }
-
+
         # Prepare API call kwargs
         api_kwargs = {
             "model": model,
             "messages": [system_message, user_message],
             "max_new_tokens": 2056,
            "temperature": 0,
-            **kwargs
+            **kwargs,
         }
-
+
         # Use liteLLM acompletion
         response = await litellm.acompletion(**api_kwargs)
-
+
         # Extract response text
        output_text = response.choices[0].message.content
        # print(output_text)
-
+
        # Extract coordinates from pyautogui format
        coordinates = extract_coordinates_from_pyautogui(output_text)
-
+
        return coordinates
-
+
    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]
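
The OpenCUA loop above relies on extract_coordinates_from_pyautogui to turn the model's pyautogui-style response into a click point. Only the tail of that helper appears in the hunks, so the following is a minimal stand-in written for illustration, assuming a regex over a pyautogui.click(x=..., y=...) string; it is not the package's actual implementation.

# Minimal stand-in (assumption, not cua-agent's code) for a pyautogui
# coordinate extractor: parse x/y out of a model response such as
# "pyautogui.click(x=412, y=303)".
import re
from typing import Optional, Tuple


def extract_click_coordinates(text: str) -> Optional[Tuple[int, int]]:
    match = re.search(
        r"pyautogui\.click\(\s*x=(\d+(?:\.\d+)?)\s*,\s*y=(\d+(?:\.\d+)?)\s*\)", text
    )
    if match is None:
        return None
    return int(float(match.group(1))), int(float(match.group(2)))


print(extract_click_coordinates("pyautogui.click(x=412, y=303)"))  # -> (412, 303)
print(extract_click_coordinates("no click here"))  # -> None
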