cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/computers/cua.py CHANGED
@@ -3,24 +3,27 @@ Computer handler implementation for OpenAI computer-use-preview protocol.
3
3
  """
4
4
 
5
5
  import base64
6
- from typing import Dict, List, Any, Literal, Union, Optional
7
- from .base import AsyncComputerHandler
6
+ from typing import Any, Dict, List, Literal, Optional, Union
7
+
8
8
  from computer import Computer
9
9
 
10
+ from .base import AsyncComputerHandler
11
+
12
+
10
13
  class cuaComputerHandler(AsyncComputerHandler):
11
14
  """Computer handler that implements the Computer protocol using the computer interface."""
12
-
15
+
13
16
  def __init__(self, cua_computer: Computer):
14
17
  """Initialize with a computer interface (from tool schema)."""
15
18
  self.cua_computer = cua_computer
16
19
  self.interface = None
17
20
 
18
21
  async def _initialize(self):
19
- if hasattr(self.cua_computer, '_initialized') and not self.cua_computer._initialized:
22
+ if hasattr(self.cua_computer, "_initialized") and not self.cua_computer._initialized:
20
23
  await self.cua_computer.run()
21
24
  self.interface = self.cua_computer.interface
22
-
23
- # ==== Computer-Use-Preview Action Space ====
25
+
26
+ # ==== Computer-Use-Preview Action Space ====
24
27
 
25
28
  async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
26
29
  """Get the current environment type."""
@@ -32,13 +35,13 @@ class cuaComputerHandler(AsyncComputerHandler):
32
35
  assert self.interface is not None
33
36
  screen_size = await self.interface.get_screen_size()
34
37
  return screen_size["width"], screen_size["height"]
35
-
38
+
36
39
  async def screenshot(self) -> str:
37
40
  """Take a screenshot and return as base64 string."""
38
41
  assert self.interface is not None
39
42
  screenshot_bytes = await self.interface.screenshot()
40
- return base64.b64encode(screenshot_bytes).decode('utf-8')
41
-
43
+ return base64.b64encode(screenshot_bytes).decode("utf-8")
44
+
42
45
  async def click(self, x: int, y: int, button: str = "left") -> None:
43
46
  """Click at coordinates with specified button."""
44
47
  assert self.interface is not None
@@ -49,34 +52,35 @@ class cuaComputerHandler(AsyncComputerHandler):
49
52
  else:
50
53
  # Default to left click for unknown buttons
51
54
  await self.interface.left_click(x, y)
52
-
55
+
53
56
  async def double_click(self, x: int, y: int) -> None:
54
57
  """Double click at coordinates."""
55
58
  assert self.interface is not None
56
59
  await self.interface.double_click(x, y)
57
-
60
+
58
61
  async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
59
62
  """Scroll at coordinates with specified scroll amounts."""
60
63
  assert self.interface is not None
61
64
  await self.interface.move_cursor(x, y)
62
65
  await self.interface.scroll(scroll_x, scroll_y)
63
-
66
+
64
67
  async def type(self, text: str) -> None:
65
68
  """Type text."""
66
69
  assert self.interface is not None
67
70
  await self.interface.type_text(text)
68
-
71
+
69
72
  async def wait(self, ms: int = 1000) -> None:
70
73
  """Wait for specified milliseconds."""
71
74
  assert self.interface is not None
72
75
  import asyncio
76
+
73
77
  await asyncio.sleep(ms / 1000.0)
74
-
78
+
75
79
  async def move(self, x: int, y: int) -> None:
76
80
  """Move cursor to coordinates."""
77
81
  assert self.interface is not None
78
82
  await self.interface.move_cursor(x, y)
79
-
83
+
80
84
  async def keypress(self, keys: Union[List[str], str]) -> None:
81
85
  """Press key combination."""
82
86
  assert self.interface is not None
@@ -87,38 +91,38 @@ class cuaComputerHandler(AsyncComputerHandler):
87
91
  else:
88
92
  # Handle key combinations
89
93
  await self.interface.hotkey(*keys)
90
-
94
+
91
95
  async def drag(self, path: List[Dict[str, int]]) -> None:
92
96
  """Drag along specified path."""
93
97
  assert self.interface is not None
94
98
  if not path:
95
99
  return
96
-
100
+
97
101
  # Start drag from first point
98
102
  start = path[0]
99
103
  await self.interface.mouse_down(start["x"], start["y"])
100
-
104
+
101
105
  # Move through path
102
106
  for point in path[1:]:
103
107
  await self.interface.move_cursor(point["x"], point["y"])
104
-
108
+
105
109
  # End drag at last point
106
110
  end = path[-1]
107
111
  await self.interface.mouse_up(end["x"], end["y"])
108
-
112
+
109
113
  async def get_current_url(self) -> str:
110
114
  """Get current URL (for browser environments)."""
111
115
  # This would need to be implemented based on the specific browser interface
112
116
  # For now, return empty string
113
117
  return ""
114
118
 
115
- # ==== Anthropic Computer Action Space ====
119
+ # ==== Anthropic Computer Action Space ====
116
120
  async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
117
121
  """Left mouse down at coordinates."""
118
122
  assert self.interface is not None
119
123
  await self.interface.mouse_down(x, y, button="left")
120
-
124
+
121
125
  async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
122
126
  """Left mouse up at coordinates."""
123
127
  assert self.interface is not None
124
- await self.interface.mouse_up(x, y, button="left")
128
+ await self.interface.mouse_up(x, y, button="left")
agent/computers/custom.py CHANGED
@@ -3,47 +3,49 @@ Custom computer handler implementation that accepts a dictionary of functions.
3
3
  """
4
4
 
5
5
  import base64
6
- from typing import Dict, List, Any, Literal, Union, Optional, Callable
7
- from PIL import Image
8
6
  import io
7
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union
8
+
9
+ from PIL import Image
10
+
9
11
  from .base import AsyncComputerHandler
10
12
 
11
13
 
12
14
  class CustomComputerHandler(AsyncComputerHandler):
13
15
  """Computer handler that implements the Computer protocol using a dictionary of custom functions."""
14
-
16
+
15
17
  def __init__(self, functions: Dict[str, Callable]):
16
18
  """
17
19
  Initialize with a dictionary of functions.
18
-
20
+
19
21
  Args:
20
22
  functions: Dictionary where keys are method names and values are callable functions.
21
23
  Only 'screenshot' is required, all others are optional.
22
-
24
+
23
25
  Raises:
24
26
  ValueError: If required 'screenshot' function is not provided.
25
27
  """
26
- if 'screenshot' not in functions:
28
+ if "screenshot" not in functions:
27
29
  raise ValueError("'screenshot' function is required in functions dictionary")
28
-
30
+
29
31
  self.functions = functions
30
32
  self._last_screenshot_size: Optional[tuple[int, int]] = None
31
-
33
+
32
34
  async def _call_function(self, func, *args, **kwargs):
33
35
  """
34
36
  Call a function, handling both async and sync functions.
35
-
37
+
36
38
  Args:
37
39
  func: The function to call
38
40
  *args: Positional arguments to pass to the function
39
41
  **kwargs: Keyword arguments to pass to the function
40
-
42
+
41
43
  Returns:
42
44
  The result of the function call
43
45
  """
44
46
  import asyncio
45
47
  import inspect
46
-
48
+
47
49
  if callable(func):
48
50
  if inspect.iscoroutinefunction(func):
49
51
  return await func(*args, **kwargs)
@@ -51,14 +53,14 @@ class CustomComputerHandler(AsyncComputerHandler):
51
53
  return func(*args, **kwargs)
52
54
  else:
53
55
  return func
54
-
56
+
55
57
  async def _get_value(self, attribute: str):
56
58
  """
57
59
  Get value for an attribute, checking both 'get_{attribute}' and '{attribute}' keys.
58
-
60
+
59
61
  Args:
60
62
  attribute: The attribute name to look for
61
-
63
+
62
64
  Returns:
63
65
  The value from the functions dict, called if callable, returned directly if not
64
66
  """
@@ -66,20 +68,20 @@ class CustomComputerHandler(AsyncComputerHandler):
66
68
  get_key = f"get_{attribute}"
67
69
  if get_key in self.functions:
68
70
  return await self._call_function(self.functions[get_key])
69
-
70
- # Check for '{attribute}'
71
+
72
+ # Check for '{attribute}'
71
73
  if attribute in self.functions:
72
74
  return await self._call_function(self.functions[attribute])
73
-
75
+
74
76
  return None
75
-
77
+
76
78
  def _to_b64_str(self, img: Union[bytes, Image.Image, str]) -> str:
77
79
  """
78
80
  Convert image to base64 string.
79
-
81
+
80
82
  Args:
81
83
  img: Image as bytes, PIL Image, or base64 string
82
-
84
+
83
85
  Returns:
84
86
  str: Base64 encoded image string
85
87
  """
@@ -88,43 +90,43 @@ class CustomComputerHandler(AsyncComputerHandler):
88
90
  return img
89
91
  elif isinstance(img, bytes):
90
92
  # Raw bytes
91
- return base64.b64encode(img).decode('utf-8')
93
+ return base64.b64encode(img).decode("utf-8")
92
94
  elif isinstance(img, Image.Image):
93
95
  # PIL Image
94
96
  buffer = io.BytesIO()
95
- img.save(buffer, format='PNG')
96
- return base64.b64encode(buffer.getvalue()).decode('utf-8')
97
+ img.save(buffer, format="PNG")
98
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
97
99
  else:
98
100
  raise ValueError(f"Unsupported image type: {type(img)}")
99
-
100
- # ==== Computer-Use-Preview Action Space ====
101
+
102
+ # ==== Computer-Use-Preview Action Space ====
101
103
 
102
104
  async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
103
105
  """Get the current environment type."""
104
- result = await self._get_value('environment')
106
+ result = await self._get_value("environment")
105
107
  if result is None:
106
108
  return "linux"
107
109
  assert result in ["windows", "mac", "linux", "browser"]
108
- return result # type: ignore
110
+ return result # type: ignore
109
111
 
110
112
  async def get_dimensions(self) -> tuple[int, int]:
111
113
  """Get screen dimensions as (width, height)."""
112
- result = await self._get_value('dimensions')
114
+ result = await self._get_value("dimensions")
113
115
  if result is not None:
114
- return result # type: ignore
115
-
116
+ return result # type: ignore
117
+
116
118
  # Fallback: use last screenshot size if available
117
119
  if not self._last_screenshot_size:
118
120
  await self.screenshot()
119
121
  assert self._last_screenshot_size is not None, "Failed to get screenshot size"
120
-
122
+
121
123
  return self._last_screenshot_size
122
-
124
+
123
125
  async def screenshot(self) -> str:
124
126
  """Take a screenshot and return as base64 string."""
125
- result = await self._call_function(self.functions['screenshot'])
126
- b64_str = self._to_b64_str(result) # type: ignore
127
-
127
+ result = await self._call_function(self.functions["screenshot"])
128
+ b64_str = self._to_b64_str(result) # type: ignore
129
+
128
130
  # Try to extract dimensions for fallback use
129
131
  try:
130
132
  if isinstance(result, Image.Image):
@@ -136,74 +138,75 @@ class CustomComputerHandler(AsyncComputerHandler):
136
138
  except Exception:
137
139
  # If we can't get dimensions, that's okay
138
140
  pass
139
-
141
+
140
142
  return b64_str
141
-
143
+
142
144
  async def click(self, x: int, y: int, button: str = "left") -> None:
143
145
  """Click at coordinates with specified button."""
144
- if 'click' in self.functions:
145
- await self._call_function(self.functions['click'], x, y, button)
146
+ if "click" in self.functions:
147
+ await self._call_function(self.functions["click"], x, y, button)
146
148
  # No-op if not implemented
147
-
149
+
148
150
  async def double_click(self, x: int, y: int) -> None:
149
151
  """Double click at coordinates."""
150
- if 'double_click' in self.functions:
151
- await self._call_function(self.functions['double_click'], x, y)
152
+ if "double_click" in self.functions:
153
+ await self._call_function(self.functions["double_click"], x, y)
152
154
  # No-op if not implemented
153
-
155
+
154
156
  async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
155
157
  """Scroll at coordinates with specified scroll amounts."""
156
- if 'scroll' in self.functions:
157
- await self._call_function(self.functions['scroll'], x, y, scroll_x, scroll_y)
158
+ if "scroll" in self.functions:
159
+ await self._call_function(self.functions["scroll"], x, y, scroll_x, scroll_y)
158
160
  # No-op if not implemented
159
-
161
+
160
162
  async def type(self, text: str) -> None:
161
163
  """Type text."""
162
- if 'type' in self.functions:
163
- await self._call_function(self.functions['type'], text)
164
+ if "type" in self.functions:
165
+ await self._call_function(self.functions["type"], text)
164
166
  # No-op if not implemented
165
-
167
+
166
168
  async def wait(self, ms: int = 1000) -> None:
167
169
  """Wait for specified milliseconds."""
168
- if 'wait' in self.functions:
169
- await self._call_function(self.functions['wait'], ms)
170
+ if "wait" in self.functions:
171
+ await self._call_function(self.functions["wait"], ms)
170
172
  else:
171
173
  # Default implementation
172
174
  import asyncio
175
+
173
176
  await asyncio.sleep(ms / 1000.0)
174
-
177
+
175
178
  async def move(self, x: int, y: int) -> None:
176
179
  """Move cursor to coordinates."""
177
- if 'move' in self.functions:
178
- await self._call_function(self.functions['move'], x, y)
180
+ if "move" in self.functions:
181
+ await self._call_function(self.functions["move"], x, y)
179
182
  # No-op if not implemented
180
-
183
+
181
184
  async def keypress(self, keys: Union[List[str], str]) -> None:
182
185
  """Press key combination."""
183
- if 'keypress' in self.functions:
184
- await self._call_function(self.functions['keypress'], keys)
186
+ if "keypress" in self.functions:
187
+ await self._call_function(self.functions["keypress"], keys)
185
188
  # No-op if not implemented
186
-
189
+
187
190
  async def drag(self, path: List[Dict[str, int]]) -> None:
188
191
  """Drag along specified path."""
189
- if 'drag' in self.functions:
190
- await self._call_function(self.functions['drag'], path)
192
+ if "drag" in self.functions:
193
+ await self._call_function(self.functions["drag"], path)
191
194
  # No-op if not implemented
192
-
195
+
193
196
  async def get_current_url(self) -> str:
194
197
  """Get current URL (for browser environments)."""
195
- if 'get_current_url' in self.functions:
196
- return await self._get_value('current_url') # type: ignore
198
+ if "get_current_url" in self.functions:
199
+ return await self._get_value("current_url") # type: ignore
197
200
  return "" # Default fallback
198
-
201
+
199
202
  async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
200
203
  """Left mouse down at coordinates."""
201
- if 'left_mouse_down' in self.functions:
202
- await self._call_function(self.functions['left_mouse_down'], x, y)
204
+ if "left_mouse_down" in self.functions:
205
+ await self._call_function(self.functions["left_mouse_down"], x, y)
203
206
  # No-op if not implemented
204
-
207
+
205
208
  async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
206
209
  """Left mouse up at coordinates."""
207
- if 'left_mouse_up' in self.functions:
208
- await self._call_function(self.functions['left_mouse_up'], x, y)
210
+ if "left_mouse_up" in self.functions:
211
+ await self._call_function(self.functions["left_mouse_up"], x, y)
209
212
  # No-op if not implemented
agent/decorators.py CHANGED
@@ -3,47 +3,56 @@ Decorators for agent - agent_loop decorator
3
3
  """
4
4
 
5
5
  from typing import List, Optional
6
+
6
7
  from .types import AgentConfigInfo
7
8
 
8
9
  # Global registry
9
10
  _agent_configs: List[AgentConfigInfo] = []
10
11
 
12
+
11
13
  def register_agent(models: str, priority: int = 0):
12
14
  """
13
15
  Decorator to register an AsyncAgentConfig class.
14
-
16
+
15
17
  Args:
16
18
  models: Regex pattern to match supported models
17
19
  priority: Priority for agent selection (higher = more priority)
18
20
  """
21
+
19
22
  def decorator(agent_class: type):
20
23
  # Validate that the class implements AsyncAgentConfig protocol
21
- if not hasattr(agent_class, 'predict_step'):
22
- raise ValueError(f"Agent class {agent_class.__name__} must implement predict_step method")
23
- if not hasattr(agent_class, 'predict_click'):
24
- raise ValueError(f"Agent class {agent_class.__name__} must implement predict_click method")
25
- if not hasattr(agent_class, 'get_capabilities'):
26
- raise ValueError(f"Agent class {agent_class.__name__} must implement get_capabilities method")
27
-
24
+ if not hasattr(agent_class, "predict_step"):
25
+ raise ValueError(
26
+ f"Agent class {agent_class.__name__} must implement predict_step method"
27
+ )
28
+ if not hasattr(agent_class, "predict_click"):
29
+ raise ValueError(
30
+ f"Agent class {agent_class.__name__} must implement predict_click method"
31
+ )
32
+ if not hasattr(agent_class, "get_capabilities"):
33
+ raise ValueError(
34
+ f"Agent class {agent_class.__name__} must implement get_capabilities method"
35
+ )
36
+
28
37
  # Register the agent config
29
38
  config_info = AgentConfigInfo(
30
- agent_class=agent_class,
31
- models_regex=models,
32
- priority=priority
39
+ agent_class=agent_class, models_regex=models, priority=priority
33
40
  )
34
41
  _agent_configs.append(config_info)
35
-
42
+
36
43
  # Sort by priority (highest first)
37
44
  _agent_configs.sort(key=lambda x: x.priority, reverse=True)
38
-
45
+
39
46
  return agent_class
40
-
47
+
41
48
  return decorator
42
49
 
50
+
43
51
  def get_agent_configs() -> List[AgentConfigInfo]:
44
52
  """Get all registered agent configs"""
45
53
  return _agent_configs.copy()
46
54
 
55
+
47
56
  def find_agent_config(model: str) -> Optional[AgentConfigInfo]:
48
57
  """Find the best matching agent config for a model"""
49
58
  for config_info in _agent_configs:
agent/human_tool/__init__.py CHANGED
@@ -12,7 +12,7 @@ Components:
12
12
  Usage:
13
13
  # Run the server and UI
14
14
  python -m agent.human_tool
15
-
15
+
16
16
  # Or run components separately
17
17
  python -m agent.human_tool.server # API server only
18
18
  python -m agent.human_tool.ui # UI only
@@ -21,9 +21,4 @@ Usage:
21
21
  from .server import CompletionQueue, completion_queue
22
22
  from .ui import HumanCompletionUI, create_ui
23
23
 
24
- __all__ = [
25
- "CompletionQueue",
26
- "completion_queue",
27
- "HumanCompletionUI",
28
- "create_ui"
29
- ]
24
+ __all__ = ["CompletionQueue", "completion_queue", "HumanCompletionUI", "create_ui"]
agent/human_tool/__main__.py CHANGED
@@ -8,6 +8,7 @@ with a Gradio UI for human interaction.
8
8
 
9
9
  import gradio as gr
10
10
  from fastapi import FastAPI
11
+
11
12
  from .server import app as fastapi_app
12
13
  from .ui import create_ui
13
14
 
@@ -18,6 +19,7 @@ gradio_demo = create_ui()
18
19
  CUSTOM_PATH = "/gradio"
19
20
  app = gr.mount_gradio_app(fastapi_app, gradio_demo, path=CUSTOM_PATH)
20
21
 
22
+
21
23
  # Add a redirect from root to Gradio UI
22
24
  @fastapi_app.get("/")
23
25
  async def redirect_to_ui():
@@ -25,14 +27,16 @@ async def redirect_to_ui():
25
27
  return {
26
28
  "message": "Human Completion Server is running",
27
29
  "ui_url": "/gradio",
28
- "api_docs": "/docs"
30
+ "api_docs": "/docs",
29
31
  }
30
32
 
33
+
31
34
  if __name__ == "__main__":
32
35
  import uvicorn
36
+
33
37
  print("🚀 Starting Human-in-the-Loop Completion Server...")
34
38
  print("📊 API Server: http://localhost:8002")
35
39
  print("🎨 Gradio UI: http://localhost:8002/gradio")
36
40
  print("📚 API Docs: http://localhost:8002/docs")
37
-
41
+
38
42
  uvicorn.run(app, host="0.0.0.0", port=8002)