cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (111)
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +216 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b1.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b1.dist-info/RECORD +30 -0
  29. agent/core/__init__.py +0 -27
  30. agent/core/agent.py +0 -210
  31. agent/core/base.py +0 -217
  32. agent/core/callbacks.py +0 -200
  33. agent/core/experiment.py +0 -249
  34. agent/core/factory.py +0 -122
  35. agent/core/messages.py +0 -332
  36. agent/core/provider_config.py +0 -21
  37. agent/core/telemetry.py +0 -142
  38. agent/core/tools/__init__.py +0 -21
  39. agent/core/tools/base.py +0 -74
  40. agent/core/tools/bash.py +0 -52
  41. agent/core/tools/collection.py +0 -46
  42. agent/core/tools/computer.py +0 -113
  43. agent/core/tools/edit.py +0 -67
  44. agent/core/tools/manager.py +0 -56
  45. agent/core/tools.py +0 -32
  46. agent/core/types.py +0 -88
  47. agent/core/visualization.py +0 -197
  48. agent/providers/__init__.py +0 -4
  49. agent/providers/anthropic/__init__.py +0 -6
  50. agent/providers/anthropic/api/client.py +0 -360
  51. agent/providers/anthropic/api/logging.py +0 -150
  52. agent/providers/anthropic/api_handler.py +0 -140
  53. agent/providers/anthropic/callbacks/__init__.py +0 -5
  54. agent/providers/anthropic/callbacks/manager.py +0 -65
  55. agent/providers/anthropic/loop.py +0 -568
  56. agent/providers/anthropic/prompts.py +0 -23
  57. agent/providers/anthropic/response_handler.py +0 -226
  58. agent/providers/anthropic/tools/__init__.py +0 -33
  59. agent/providers/anthropic/tools/base.py +0 -88
  60. agent/providers/anthropic/tools/bash.py +0 -66
  61. agent/providers/anthropic/tools/collection.py +0 -34
  62. agent/providers/anthropic/tools/computer.py +0 -396
  63. agent/providers/anthropic/tools/edit.py +0 -326
  64. agent/providers/anthropic/tools/manager.py +0 -54
  65. agent/providers/anthropic/tools/run.py +0 -42
  66. agent/providers/anthropic/types.py +0 -16
  67. agent/providers/anthropic/utils.py +0 -381
  68. agent/providers/omni/__init__.py +0 -8
  69. agent/providers/omni/api_handler.py +0 -42
  70. agent/providers/omni/clients/anthropic.py +0 -103
  71. agent/providers/omni/clients/base.py +0 -35
  72. agent/providers/omni/clients/oaicompat.py +0 -195
  73. agent/providers/omni/clients/ollama.py +0 -122
  74. agent/providers/omni/clients/openai.py +0 -155
  75. agent/providers/omni/clients/utils.py +0 -25
  76. agent/providers/omni/image_utils.py +0 -34
  77. agent/providers/omni/loop.py +0 -990
  78. agent/providers/omni/parser.py +0 -307
  79. agent/providers/omni/prompts.py +0 -64
  80. agent/providers/omni/tools/__init__.py +0 -30
  81. agent/providers/omni/tools/base.py +0 -29
  82. agent/providers/omni/tools/bash.py +0 -74
  83. agent/providers/omni/tools/computer.py +0 -179
  84. agent/providers/omni/tools/manager.py +0 -61
  85. agent/providers/omni/utils.py +0 -236
  86. agent/providers/openai/__init__.py +0 -6
  87. agent/providers/openai/api_handler.py +0 -456
  88. agent/providers/openai/loop.py +0 -472
  89. agent/providers/openai/response_handler.py +0 -205
  90. agent/providers/openai/tools/__init__.py +0 -15
  91. agent/providers/openai/tools/base.py +0 -79
  92. agent/providers/openai/tools/computer.py +0 -326
  93. agent/providers/openai/tools/manager.py +0 -106
  94. agent/providers/openai/types.py +0 -36
  95. agent/providers/openai/utils.py +0 -98
  96. agent/providers/uitars/__init__.py +0 -1
  97. agent/providers/uitars/clients/base.py +0 -35
  98. agent/providers/uitars/clients/mlxvlm.py +0 -263
  99. agent/providers/uitars/clients/oaicompat.py +0 -214
  100. agent/providers/uitars/loop.py +0 -660
  101. agent/providers/uitars/prompts.py +0 -63
  102. agent/providers/uitars/tools/__init__.py +0 -1
  103. agent/providers/uitars/tools/computer.py +0 -283
  104. agent/providers/uitars/tools/manager.py +0 -60
  105. agent/providers/uitars/utils.py +0 -264
  106. agent/telemetry.py +0 -21
  107. agent/ui/__main__.py +0 -15
  108. cua_agent-0.3.2.dist-info/METADATA +0 -295
  109. cua_agent-0.3.2.dist-info/RECORD +0 -87
  110. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +0 -0
  111. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
agent/cli.py ADDED
@@ -0,0 +1,290 @@
1
+ """
2
+ CLI chat interface for agent - Computer Use Agent
3
+
4
+ Usage:
5
+ python -m agent.cli <model_string>
6
+
7
+ Examples:
8
+ python -m agent.cli openai/computer-use-preview
9
+ python -m agent.cli anthropic/claude-3-5-sonnet-20241022
10
+ python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
11
+ """
12
+
13
+ import asyncio
14
+ import argparse
15
+ import os
16
+ import sys
17
+ import json
18
+ from typing import List, Dict, Any
19
+ import dotenv
20
+ from yaspin import yaspin
21
+
22
+ # Load environment variables
23
+ dotenv.load_dotenv()
24
+
25
+ # Color codes for terminal output
26
class Colors:
    """ANSI escape sequences used to style terminal output."""

    # Text attributes
    RESET = "\033[0m"
    BOLD = "\033[1m"
    DIM = "\033[2m"

    # Foreground colors
    RED = "\033[31m"
    GREEN = "\033[32m"
    YELLOW = "\033[33m"
    BLUE = "\033[34m"
    MAGENTA = "\033[35m"
    CYAN = "\033[36m"
    WHITE = "\033[37m"
    GRAY = "\033[90m"

    # Background colors
    BG_RED = "\033[41m"
    BG_GREEN = "\033[42m"
    BG_YELLOW = "\033[43m"
    BG_BLUE = "\033[44m"
46
+
47
+
48
def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n"):
    """Write ``text`` to stdout wrapped in ANSI styling codes.

    Args:
        text: The text to print.
        color: An ANSI color sequence (e.g. ``Colors.RED``); empty for none.
        bold: Emit the bold attribute before the text.
        dim: Emit the dim attribute before the text.
        end: Terminator passed through to ``print``.
    """
    codes = []
    if bold:
        codes.append(Colors.BOLD)
    if dim:
        codes.append(Colors.DIM)
    if color:
        codes.append(color)

    # The reset sequence is always appended, even when no styling was applied.
    styled = "".join(codes) + f"{text}" + Colors.RESET
    print(styled, end=end)
59
+
60
+
61
def print_action(action_type: str, details: Dict[str, Any]):
    """Print a one-line, dimmed summary of a computer action.

    Args:
        action_type: The action name (e.g. ``"click"``, ``"type"``).
        details: The action payload; only the keys relevant to the given
            action type are read.
    """
    summary = ""

    if action_type in ("click", "scroll"):
        # Both actions are summarised by their (x, y) position.
        if "x" in details and "y" in details:
            summary = f"({details['x']}, {details['y']})"
    elif action_type == "type":
        if "text" in details:
            typed = details["text"]
            # Keep long text readable: 47 characters plus an ellipsis.
            if len(typed) > 50:
                typed = typed[:47] + "..."
            summary = f'"{typed}"'
    elif action_type == "key":
        if "key" in details:
            summary = f"'{details['key']}'"

    print_colored(f"🛠️ {action_type}{summary}", dim=True)
78
+
79
+
80
def print_welcome(model: str, agent_loop: str, container_name: str):
    """Print the connection banner followed by the exit hint."""
    banner = f"Connected to {container_name} ({model}, {agent_loop})"
    print_colored(banner)
    print_colored("Type 'exit' to quit.", dim=True)
84
+
85
async def ainput(prompt: str = ""):
    """Read a line from stdin without blocking the event loop.

    The blocking built-in ``input`` is run in a worker thread via
    ``asyncio.to_thread`` and its result is returned.
    """
    line = await asyncio.to_thread(input, prompt)
    return line
87
+
88
def _display_output_item(item: Dict[str, Any], spinner) -> None:
    """Render one agent output item, hiding the spinner while text is printed."""
    item_type = item.get("type")

    if item_type == "message":
        # Display agent text response
        for content_part in item.get("content", []):
            text = content_part.get("text")
            if text:
                text = text.strip()
                if text:
                    spinner.hide()
                    print_colored(text)

    elif item_type == "computer_call":
        # Display computer action, then resume the spinner while it executes
        action = item.get("action", {})
        action_type = action.get("type", "")
        if action_type:
            spinner.hide()
            print_action(action_type, action)
            spinner.text = f"Performing {action_type}..."
            spinner.show()

    elif item_type == "function_call":
        function_name = item.get("name", "")
        spinner.hide()
        print_colored(f"🔧 Calling function: {function_name}", dim=True)
        spinner.text = f"Calling {function_name}..."
        spinner.show()

    elif item_type == "function_call_output":
        # Display function output (dimmed). Coerce to str before stripping so
        # a non-string payload cannot raise AttributeError.
        output = item.get("output", "")
        if output and str(output).strip():
            spinner.hide()
            print_colored(f"📤 {output}", dim=True)


async def chat_loop(agent, model: str, container_name: str):
    """Run the interactive chat session until the user asks to exit.

    Each non-empty line is appended to the running history as a user message
    and the history is passed to ``agent.run``; every yielded result's
    ``"output"`` items are appended to the history and rendered.

    Args:
        agent: Object exposing ``agent_loop`` (its ``__name__`` is shown in the
            banner) and an async-iterable ``run(history)``.
        model: Model string, shown in the banner.
        container_name: Container name, shown in the banner.
    """
    print_welcome(model, agent.agent_loop.__name__, container_name)

    history = []

    while True:
        # Get user input with prompt
        print_colored("> ", end="")
        # Strip so that padded commands (" exit ") are recognised and
        # whitespace-only lines are not sent to the agent as messages.
        user_input = (await ainput()).strip()

        if not user_input:
            continue

        if user_input.lower() in ("exit", "quit", "q"):
            print_colored("\n👋 Goodbye!")
            break

        # Add user message to history
        history.append({"role": "user", "content": user_input})

        # Stream responses from the agent with spinner
        with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
            # The spinner starts hidden and is only shown while an action or
            # function call is in flight.
            spinner.hide()

            async for result in agent.run(history):
                output_items = result.get("output", [])

                # Add agent responses to history
                history.extend(output_items)

                # Process and display the output
                for item in output_items:
                    _display_output_item(item, spinner)

            spinner.hide()
155
+
156
+
157
def _required_provider_env_var(model: str) -> str:
    """Return the env var holding the API key for ``model``'s provider.

    Returns an empty string when the model's provider has no known API key
    variable (e.g. locally hosted models).
    """
    provider_env_vars = {
        "openai/": "OPENAI_API_KEY",
        "anthropic/": "ANTHROPIC_API_KEY",
    }

    # "omniparser+<provider>/<model>" wraps an underlying provider model, so
    # the key that matters is the underlying provider's. (A dict literal with
    # two "omniparser+" entries silently keeps only the last one.)
    composed_prefix = "omniparser+"
    underlying = model[len(composed_prefix):] if model.startswith(composed_prefix) else model

    for prefix, env_var in provider_env_vars.items():
        if underlying.startswith(prefix):
            return env_var
    return ""


async def main():
    """Parse CLI arguments, collect credentials, and start the chat session.

    Missing ``CUA_CONTAINER_NAME``, ``CUA_API_KEY`` and provider API keys are
    prompted for interactively; the process exits with status 1 if a required
    value is left empty or if the ``agent``/``computer`` packages cannot be
    imported.
    """
    parser = argparse.ArgumentParser(
        description="CUA Agent CLI - Interactive computer use assistant",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python -m agent.cli openai/computer-use-preview
  python -m agent.cli anthropic/claude-3-5-sonnet-20241022
  python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
  python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
        """
    )

    parser.add_argument(
        "model",
        help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
    )

    parser.add_argument(
        "--images",
        type=int,
        default=3,
        help="Number of recent images to keep in context (default: 3)"
    )

    parser.add_argument(
        "--trajectory",
        action="store_true",
        help="Save trajectory for debugging"
    )

    parser.add_argument(
        "--budget",
        type=float,
        help="Maximum budget for the session (in dollars)"
    )

    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Check for required environment variables
    container_name = os.getenv("CUA_CONTAINER_NAME")
    cua_api_key = os.getenv("CUA_API_KEY")

    # Prompt for missing environment variables
    if not container_name:
        print_colored("CUA_CONTAINER_NAME not set.", dim=True)
        print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
        container_name = input("Enter your CUA container name: ").strip()
        if not container_name:
            print_colored("❌ Container name is required.")
            sys.exit(1)

    if not cua_api_key:
        print_colored("CUA_API_KEY not set.", dim=True)
        cua_api_key = input("Enter your CUA API key: ").strip()
        if not cua_api_key:
            print_colored("❌ API key is required.")
            sys.exit(1)

    # Check for the provider-specific API key implied by the model string
    env_var = _required_provider_env_var(args.model)
    if env_var and not os.getenv(env_var):
        label = env_var.replace('_', ' ').title()
        print_colored(f"{env_var} not set.", dim=True)
        api_key = input(f"Enter your {label}: ").strip()
        if not api_key:
            print_colored(f"❌ {label} is required.")
            sys.exit(1)
        # Set the environment variable for the session
        os.environ[env_var] = api_key

    # Import here to avoid import errors if dependencies are missing
    try:
        from agent import ComputerAgent
        from computer import Computer
    except ImportError as e:
        print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
        print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
        sys.exit(1)

    # Create computer instance
    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=container_name,
        api_key=cua_api_key
    ) as computer:

        # Create agent
        agent_kwargs = {
            "model": args.model,
            "tools": [computer],
            "only_n_most_recent_images": args.images,
            "verbosity": 20 if args.verbose else 30,  # logging.INFO vs logging.WARNING
        }

        if args.trajectory:
            agent_kwargs["trajectory_dir"] = "trajectories"

        # Compare against None so an explicit "--budget 0" is not ignored.
        if args.budget is not None:
            agent_kwargs["max_trajectory_budget"] = {
                "max_budget": args.budget,
                "raise_error": True,
                "reset_after_each_run": False
            }

        agent = ComputerAgent(**agent_kwargs)

        # Start chat loop
        await chat_loop(agent, args.model, container_name)
283
+
284
+
285
+
286
if __name__ == "__main__":
    # Ctrl+C or end-of-input ends the session with a farewell, not a traceback.
    try:
        asyncio.run(main())
    except (KeyboardInterrupt, EOFError):
        print_colored("\n\n👋 Goodbye!")
agent/computer_handler.py ADDED
@@ -0,0 +1,107 @@
1
+ """
2
+ Computer handler implementation for OpenAI computer-use-preview protocol.
3
+ """
4
+
5
+ import base64
6
+ from typing import Dict, List, Any, Literal
7
+ from .types import Computer
8
+
9
+
10
class OpenAIComputerHandler:
    """Computer handler that implements the Computer protocol using the computer interface.

    Every action is forwarded to the wrapped ``computer_interface`` object,
    whose async methods (``left_click``, ``type_text``, ``hotkey``, ...) do the
    actual work.
    """

    def __init__(self, computer_interface):
        """Initialize with a computer interface (from tool schema)."""
        self.interface = computer_interface

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Get the current environment type.

        NOTE(review): this is a hard-coded default, not derived from the
        interface; it may not match the machine actually being driven.
        """
        return "windows"

    async def get_dimensions(self) -> tuple[int, int]:
        """Get screen dimensions as (width, height)."""
        screen_size = await self.interface.get_screen_size()
        return screen_size["width"], screen_size["height"]

    async def screenshot(self) -> str:
        """Take a screenshot and return it as a base64-encoded string."""
        screenshot_bytes = await self.interface.screenshot()
        return base64.b64encode(screenshot_bytes).decode('utf-8')

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at coordinates with the specified button.

        Only ``"right"`` maps to a right click; ``"left"`` and any
        unrecognised button name fall back to a left click.
        """
        if button == "right":
            await self.interface.right_click(x, y)
        else:
            await self.interface.left_click(x, y)

    async def double_click(self, x: int, y: int) -> None:
        """Double click at coordinates."""
        await self.interface.double_click(x, y)

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Move the cursor to (x, y), then scroll by the given amounts."""
        await self.interface.move_cursor(x, y)
        await self.interface.scroll(scroll_x, scroll_y)

    async def type(self, text: str) -> None:
        """Type text."""
        await self.interface.type_text(text)

    async def wait(self, ms: int = 1000) -> None:
        """Wait for the specified number of milliseconds."""
        import asyncio
        await asyncio.sleep(ms / 1000.0)

    async def move(self, x: int, y: int) -> None:
        """Move cursor to coordinates."""
        await self.interface.move_cursor(x, y)

    async def keypress(self, keys: List[str]) -> None:
        """Press a single key or a key combination.

        An empty ``keys`` list is a no-op rather than a hotkey call with no
        keys.
        """
        if not keys:
            return
        if len(keys) == 1:
            await self.interface.press_key(keys[0])
        else:
            # Handle key combinations
            await self.interface.hotkey(*keys)

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along the specified path of ``{"x": ..., "y": ...}`` points.

        Presses the mouse at the first point, moves through each subsequent
        point, and releases at the last one. An empty path does nothing.
        """
        if not path:
            return

        # Start drag from first point
        start = path[0]
        await self.interface.mouse_down(start["x"], start["y"])

        # Move through path
        for point in path[1:]:
            await self.interface.move_cursor(point["x"], point["y"])

        # End drag at last point
        end = path[-1]
        await self.interface.mouse_up(end["x"], end["y"])

    async def get_current_url(self) -> str:
        """Get current URL (for browser environments).

        Not implemented for this interface; always returns an empty string.
        """
        return ""
94
+
95
+
96
def acknowledge_safety_check_callback(message: str) -> bool:
    """Ask the user on stdin whether to proceed past a safety warning.

    Returns True only when the answer, lower-cased and stripped, is ``"y"``.
    """
    prompt = f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): "
    answer = input(prompt)
    return answer.lower().strip() == "y"
102
+
103
+
104
def check_blocklisted_url(url: str) -> None:
    """Check if URL is blocklisted (placeholder implementation).

    No blocklist logic exists yet: every URL is accepted and nothing is raised.
    """
    return None
agent/decorators.py ADDED
@@ -0,0 +1,90 @@
1
+ """
2
+ Decorators for agent - agent_loop decorator
3
+ """
4
+
5
+ import asyncio
6
+ import inspect
7
+ from typing import Dict, List, Any, Callable, Optional
8
+ from functools import wraps
9
+
10
+ from .types import AgentLoopInfo
11
+
12
+ # Global registry
13
+ _agent_loops: List[AgentLoopInfo] = []
14
+
15
def agent_loop(models: str, priority: int = 0):
    """
    Decorator to register an agent loop function.

    The undecorated function is recorded in the module-level registry, which
    is kept sorted by descending priority; the decorated name is bound to an
    async wrapper that runs the function as a cancellable task.

    Args:
        models: Regex pattern to match supported models
        priority: Priority for loop selection (higher = more priority)

    Raises:
        ValueError: If the decorated function lacks a ``messages`` or
            ``model`` parameter.
    """
    def decorator(func: Callable):
        # Reject functions that cannot accept the required arguments.
        declared = set(inspect.signature(func).parameters.keys())
        required_params = {'messages', 'model'}
        missing = required_params - declared
        if missing:
            raise ValueError(f"Agent loop function must have parameters: {missing}")

        # Register the loop, keeping the registry ordered highest priority first.
        _agent_loops.append(
            AgentLoopInfo(func=func, models_regex=models, priority=priority)
        )
        _agent_loops.sort(key=lambda info: info.priority, reverse=True)

        @wraps(func)
        async def wrapper(*args, **kwargs):
            # The task reports its outcome through a queue so the wrapper can
            # be cancelled while waiting and cancel the task in turn.
            outcome = asyncio.Queue()
            task = None

            async def run_loop():
                try:
                    value = await func(*args, **kwargs)
                    await outcome.put(('result', value))
                except Exception as exc:
                    await outcome.put(('error', exc))

            try:
                task = asyncio.create_task(run_loop())

                # Wait for result or cancellation
                kind, payload = await outcome.get()

                if kind == 'error':
                    raise payload
                return payload

            except asyncio.CancelledError:
                # Propagate the cancellation to the running loop and wait for
                # it to unwind before re-raising.
                if task is not None:
                    task.cancel()
                    try:
                        await task
                    except asyncio.CancelledError:
                        pass
                raise

        return wrapper

    return decorator
80
+
81
def get_agent_loops() -> List[AgentLoopInfo]:
    """Return a shallow copy of the registered agent loops, in registry order."""
    return list(_agent_loops)
84
+
85
def find_agent_loop(model: str) -> Optional[AgentLoopInfo]:
    """Return the first registered loop whose ``matches_model`` accepts ``model``, or None."""
    matching = (info for info in _agent_loops if info.matches_model(model))
    return next(matching, None)
agent/loops/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """
2
+ Agent loops for agent
3
+ """
4
+
5
+ # Import the loops to register them
6
+ from . import anthropic
7
+ from . import openai
8
+ from . import uitars
9
+ from . import omniparser
10
+
11
+ __all__ = ["anthropic", "openai", "uitars", "omniparser"]