cua-agent 0.1.6 (py3-none-any.whl) → 0.1.18 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (57)
  1. agent/__init__.py +3 -2
  2. agent/core/__init__.py +1 -6
  3. agent/core/{computer_agent.py → agent.py} +31 -76
  4. agent/core/{loop.py → base.py} +68 -127
  5. agent/core/factory.py +104 -0
  6. agent/core/messages.py +279 -125
  7. agent/core/provider_config.py +15 -0
  8. agent/core/types.py +45 -0
  9. agent/core/visualization.py +197 -0
  10. agent/providers/anthropic/api/client.py +142 -1
  11. agent/providers/anthropic/api_handler.py +140 -0
  12. agent/providers/anthropic/callbacks/__init__.py +5 -0
  13. agent/providers/anthropic/loop.py +207 -221
  14. agent/providers/anthropic/response_handler.py +226 -0
  15. agent/providers/anthropic/tools/bash.py +0 -97
  16. agent/providers/anthropic/utils.py +368 -0
  17. agent/providers/omni/__init__.py +1 -20
  18. agent/providers/omni/api_handler.py +42 -0
  19. agent/providers/omni/clients/anthropic.py +4 -0
  20. agent/providers/omni/image_utils.py +0 -72
  21. agent/providers/omni/loop.py +491 -607
  22. agent/providers/omni/parser.py +58 -4
  23. agent/providers/omni/tools/__init__.py +25 -7
  24. agent/providers/omni/tools/base.py +29 -0
  25. agent/providers/omni/tools/bash.py +43 -38
  26. agent/providers/omni/tools/computer.py +144 -182
  27. agent/providers/omni/tools/manager.py +25 -45
  28. agent/providers/omni/types.py +1 -3
  29. agent/providers/omni/utils.py +224 -145
  30. agent/providers/openai/__init__.py +6 -0
  31. agent/providers/openai/api_handler.py +453 -0
  32. agent/providers/openai/loop.py +440 -0
  33. agent/providers/openai/response_handler.py +205 -0
  34. agent/providers/openai/tools/__init__.py +15 -0
  35. agent/providers/openai/tools/base.py +79 -0
  36. agent/providers/openai/tools/computer.py +319 -0
  37. agent/providers/openai/tools/manager.py +106 -0
  38. agent/providers/openai/types.py +36 -0
  39. agent/providers/openai/utils.py +98 -0
  40. cua_agent-0.1.18.dist-info/METADATA +165 -0
  41. cua_agent-0.1.18.dist-info/RECORD +73 -0
  42. agent/README.md +0 -63
  43. agent/providers/anthropic/messages/manager.py +0 -112
  44. agent/providers/omni/callbacks.py +0 -78
  45. agent/providers/omni/clients/groq.py +0 -101
  46. agent/providers/omni/experiment.py +0 -276
  47. agent/providers/omni/messages.py +0 -171
  48. agent/providers/omni/tool_manager.py +0 -91
  49. agent/providers/omni/visualization.py +0 -130
  50. agent/types/__init__.py +0 -23
  51. agent/types/base.py +0 -41
  52. agent/types/messages.py +0 -36
  53. cua_agent-0.1.6.dist-info/METADATA +0 -120
  54. cua_agent-0.1.6.dist-info/RECORD +0 -64
  55. agent/{types → core}/tools.py +0 -0
  56. {cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/WHEEL +0 -0
  57. {cua_agent-0.1.6.dist-info → cua_agent-0.1.18.dist-info}/entry_points.txt +0 -0
agent/providers/omni/tools/manager.py +25 -45
@@ -1,81 +1,61 @@
1
- """Omni tool manager implementation."""
2
-
3
- from typing import Dict, List, Any
4
- from enum import Enum
1
+ """Tool manager for the Omni provider."""
5
2
 
3
+ from typing import Any, Dict, List
6
4
  from computer.computer import Computer
7
5
 
8
- from ....core.tools import BaseToolManager
6
+ from ....core.tools import BaseToolManager, ToolResult
9
7
  from ....core.tools.collection import ToolCollection
8
+ from .computer import ComputerTool
9
+ from .bash import BashTool
10
+ from ..types import LLMProvider
10
11
 
11
- from .bash import OmniBashTool
12
- from .computer import OmniComputerTool
13
-
14
-
15
- class ProviderType(Enum):
16
- """Supported provider types."""
17
-
18
- ANTHROPIC = "anthropic"
19
- OPENAI = "openai"
20
- CLAUDE = "claude" # Alias for Anthropic
21
- GPT = "gpt" # Alias for OpenAI
22
12
 
13
+ class ToolManager(BaseToolManager):
14
+ """Manages Omni provider tool initialization and execution."""
23
15
 
24
- class OmniToolManager(BaseToolManager):
25
- """Tool manager for multi-provider support."""
26
-
27
- def __init__(self, computer: Computer):
28
- """Initialize Omni tool manager.
16
+ def __init__(self, computer: Computer, provider: LLMProvider):
17
+ """Initialize the tool manager.
29
18
 
30
19
  Args:
31
- computer: Computer instance for tools
20
+ computer: Computer instance for computer-related tools
21
+ provider: The LLM provider being used
32
22
  """
33
23
  super().__init__(computer)
34
- # Initialize tools
35
- self.computer_tool = OmniComputerTool(self.computer)
36
- self.bash_tool = OmniBashTool(self.computer)
24
+ self.provider = provider
25
+ # Initialize Omni-specific tools
26
+ self.computer_tool = ComputerTool(self.computer)
27
+ self.bash_tool = BashTool(self.computer)
37
28
 
38
29
  def _initialize_tools(self) -> ToolCollection:
39
30
  """Initialize all available tools."""
40
31
  return ToolCollection(self.computer_tool, self.bash_tool)
41
32
 
42
33
  async def _initialize_tools_specific(self) -> None:
43
- """Initialize provider-specific tool requirements."""
34
+ """Initialize Omni provider-specific tool requirements."""
44
35
  await self.computer_tool.initialize_dimensions()
45
36
 
46
37
  def get_tool_params(self) -> List[Dict[str, Any]]:
47
38
  """Get tool parameters for API calls.
48
39
 
49
40
  Returns:
50
- List of tool parameters in default format
41
+ List of tool parameters for the current provider's API
51
42
  """
52
43
  if self.tools is None:
53
44
  raise RuntimeError("Tools not initialized. Call initialize() first.")
45
+
54
46
  return self.tools.to_params()
55
47
 
56
- def get_provider_tools(self, provider: ProviderType) -> List[Dict[str, Any]]:
57
- """Get tools formatted for a specific provider.
48
+ async def execute_tool(self, name: str, tool_input: dict[str, Any]) -> ToolResult:
49
+ """Execute a tool with the given input.
58
50
 
59
51
  Args:
60
- provider: Provider type to format tools for
52
+ name: Name of the tool to execute
53
+ tool_input: Input parameters for the tool
61
54
 
62
55
  Returns:
63
- List of tool parameters in provider-specific format
56
+ Result of the tool execution
64
57
  """
65
58
  if self.tools is None:
66
59
  raise RuntimeError("Tools not initialized. Call initialize() first.")
67
60
 
68
- # Default is the base implementation
69
- tools = self.tools.to_params()
70
-
71
- # Customize for each provider if needed
72
- if provider in [ProviderType.ANTHROPIC, ProviderType.CLAUDE]:
73
- # Format for Anthropic API
74
- # Additional adjustments can be made here
75
- pass
76
- elif provider in [ProviderType.OPENAI, ProviderType.GPT]:
77
- # Format for OpenAI API
78
- # Future implementation
79
- pass
80
-
81
- return tools
61
+ return await self.tools.run(name=name, tool_input=tool_input)
agent/providers/omni/types.py +1 -3
@@ -9,12 +9,10 @@ class LLMProvider(StrEnum):
9
9
  """Supported LLM providers."""
10
10
 
11
11
  ANTHROPIC = "anthropic"
12
+ OMNI = "omni"
12
13
  OPENAI = "openai"
13
14
 
14
15
 
15
- LLMProvider
16
-
17
-
18
16
  @dataclass
19
17
  class LLM:
20
18
  """Configuration for LLM model and provider."""
agent/providers/omni/utils.py +224 -145
@@ -1,157 +1,236 @@
1
- """Utility functions for Omni provider."""
1
+ """Main entry point for computer agents."""
2
2
 
3
- import base64
4
- import io
3
+ import asyncio
4
+ import json
5
5
  import logging
6
- from typing import Tuple
7
- from PIL import Image
6
+ import os
7
+ from typing import Any, Dict, List, Optional
8
+ from som.models import ParseResult
9
+ from ...core.types import AgentResponse
8
10
 
9
11
  logger = logging.getLogger(__name__)
10
12
 
11
13
 
12
- def compress_image_base64(
13
- base64_str: str, max_size_bytes: int = 5 * 1024 * 1024, quality: int = 90
14
- ) -> tuple[str, str]:
15
- """Compress a base64 encoded image to ensure it's below a certain size.
14
+ async def to_openai_agent_response_format(
15
+ response: Any,
16
+ messages: List[Dict[str, Any]],
17
+ parsed_screen: Optional[ParseResult] = None,
18
+ parser: Optional[Any] = None,
19
+ model: Optional[str] = None,
20
+ ) -> AgentResponse:
21
+ """Create an OpenAI computer use agent compatible response format.
16
22
 
17
23
  Args:
18
- base64_str: Base64 encoded image string (with or without data URL prefix)
19
- max_size_bytes: Maximum size in bytes (default: 5MB)
20
- quality: Initial JPEG quality (0-100)
24
+ response: The original API response
25
+ messages: List of messages in standard OpenAI format
26
+ parsed_screen: Optional pre-parsed screen information
27
+ parser: Optional parser instance for coordinate calculation
28
+ model: Optional model name
21
29
 
22
30
  Returns:
23
- tuple[str, str]: (Compressed base64 encoded image, media_type)
31
+ A response formatted according to OpenAI's computer use agent standard, including:
32
+ - All standard OpenAI computer use agent fields
33
+ - Original response in response.choices[0].message
34
+ - Full message history in messages field
24
35
  """
25
- # Handle data URL prefix if present (e.g., "data:image/png;base64,...")
26
- original_prefix = ""
27
- media_type = "image/png" # Default media type
28
-
29
- if base64_str.startswith("data:"):
30
- parts = base64_str.split(",", 1)
31
- if len(parts) == 2:
32
- original_prefix = parts[0] + ","
33
- base64_str = parts[1]
34
- # Try to extract media type from the prefix
35
- if "image/jpeg" in original_prefix.lower():
36
- media_type = "image/jpeg"
37
- elif "image/png" in original_prefix.lower():
38
- media_type = "image/png"
39
-
40
- # Check if the base64 string is small enough already
41
- if len(base64_str) <= max_size_bytes:
42
- logger.info(f"Image already within size limit: {len(base64_str)} bytes")
43
- return original_prefix + base64_str, media_type
44
-
45
- try:
46
- # Decode base64
47
- img_data = base64.b64decode(base64_str)
48
- img_size = len(img_data)
49
- logger.info(f"Original image size: {img_size} bytes")
50
-
51
- # Open image
52
- img = Image.open(io.BytesIO(img_data))
53
-
54
- # First, try to compress as PNG (maintains transparency if present)
55
- buffer = io.BytesIO()
56
- img.save(buffer, format="PNG", optimize=True)
57
- buffer.seek(0)
58
- compressed_data = buffer.getvalue()
59
- compressed_b64 = base64.b64encode(compressed_data).decode("utf-8")
60
-
61
- if len(compressed_b64) <= max_size_bytes:
62
- logger.info(f"Compressed to {len(compressed_data)} bytes as PNG")
63
- return compressed_b64, "image/png"
64
-
65
- # Strategy 1: Try reducing quality with JPEG format
66
- current_quality = quality
67
- while current_quality > 20:
68
- buffer = io.BytesIO()
69
- # Convert to RGB if image has alpha channel (JPEG doesn't support transparency)
70
- if img.mode in ("RGBA", "LA") or (img.mode == "P" and "transparency" in img.info):
71
- logger.info("Converting transparent image to RGB for JPEG compression")
72
- rgb_img = Image.new("RGB", img.size, (255, 255, 255))
73
- rgb_img.paste(img, mask=img.split()[3] if img.mode == "RGBA" else None)
74
- rgb_img.save(buffer, format="JPEG", quality=current_quality, optimize=True)
75
- else:
76
- img.save(buffer, format="JPEG", quality=current_quality, optimize=True)
77
-
78
- buffer.seek(0)
79
- compressed_data = buffer.getvalue()
80
- compressed_b64 = base64.b64encode(compressed_data).decode("utf-8")
81
-
82
- if len(compressed_b64) <= max_size_bytes:
83
- logger.info(
84
- f"Compressed to {len(compressed_data)} bytes with JPEG quality {current_quality}"
85
- )
86
- return compressed_b64, "image/jpeg"
87
-
88
- # Reduce quality and try again
89
- current_quality -= 10
90
-
91
- # Strategy 2: If quality reduction isn't enough, reduce dimensions
92
- scale_factor = 0.8
93
- current_img = img
94
-
95
- while scale_factor > 0.3:
96
- # Resize image
97
- new_width = int(img.width * scale_factor)
98
- new_height = int(img.height * scale_factor)
99
- current_img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
100
-
101
- # Try with reduced size and quality
102
- buffer = io.BytesIO()
103
- # Convert to RGB if necessary for JPEG
104
- if current_img.mode in ("RGBA", "LA") or (
105
- current_img.mode == "P" and "transparency" in current_img.info
106
- ):
107
- rgb_img = Image.new("RGB", current_img.size, (255, 255, 255))
108
- rgb_img.paste(
109
- current_img, mask=current_img.split()[3] if current_img.mode == "RGBA" else None
110
- )
111
- rgb_img.save(buffer, format="JPEG", quality=70, optimize=True)
112
- else:
113
- current_img.save(buffer, format="JPEG", quality=70, optimize=True)
114
-
115
- buffer.seek(0)
116
- compressed_data = buffer.getvalue()
117
- compressed_b64 = base64.b64encode(compressed_data).decode("utf-8")
118
-
119
- if len(compressed_b64) <= max_size_bytes:
120
- logger.info(
121
- f"Compressed to {len(compressed_data)} bytes with scale {scale_factor} and JPEG quality 70"
122
- )
123
- return compressed_b64, "image/jpeg"
124
-
125
- # Reduce scale factor and try again
126
- scale_factor -= 0.1
127
-
128
- # If we get here, we couldn't compress enough
129
- logger.warning("Could not compress image below required size with quality preservation")
130
-
131
- # Last resort: Use minimum quality and size
132
- buffer = io.BytesIO()
133
- smallest_img = img.resize(
134
- (int(img.width * 0.5), int(img.height * 0.5)), Image.Resampling.LANCZOS
36
+ from datetime import datetime
37
+ import time
38
+
39
+ # Create a unique ID for this response
40
+ response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
41
+ reasoning_id = f"rs_{response_id}"
42
+ action_id = f"cu_{response_id}"
43
+ call_id = f"call_{response_id}"
44
+
45
+ # Extract the last assistant message
46
+ assistant_msg = None
47
+ for msg in reversed(messages):
48
+ if msg["role"] == "assistant":
49
+ assistant_msg = msg
50
+ break
51
+
52
+ if not assistant_msg:
53
+ # If no assistant message found, create a default one
54
+ assistant_msg = {"role": "assistant", "content": "No response available"}
55
+
56
+ # Initialize output array
57
+ output_items = []
58
+
59
+ # Extract reasoning and action details from the response
60
+ content = assistant_msg["content"]
61
+ reasoning_text = None
62
+ action_details = None
63
+
64
+ for item in content:
65
+ if isinstance(item, dict) and item.get("type") == "text":
66
+ try:
67
+ # Try to parse JSON from text block
68
+ text_content = item.get("text", "")
69
+ parsed_json = json.loads(text_content)
70
+
71
+ # Get reasoning text
72
+ if reasoning_text is None:
73
+ reasoning_text = parsed_json.get("Explanation", "")
74
+
75
+ # Extract action details
76
+ action = parsed_json.get("Action", "").lower()
77
+ text_input = parsed_json.get("Text", "")
78
+ value = parsed_json.get("Value", "") # Also handle Value field
79
+ box_id = parsed_json.get("Box ID") # Extract Box ID
80
+
81
+ if action in ["click", "left_click"]:
82
+ # Always calculate coordinates from Box ID for click actions
83
+ x, y = 100, 100 # Default fallback values
84
+
85
+ if parsed_screen and box_id is not None and parser is not None:
86
+ try:
87
+ box_id_int = (
88
+ box_id
89
+ if isinstance(box_id, int)
90
+ else int(str(box_id)) if str(box_id).isdigit() else None
91
+ )
92
+ if box_id_int is not None:
93
+ # Use the parser's method to calculate coordinates
94
+ x, y = await parser.calculate_click_coordinates(
95
+ box_id_int, parsed_screen
96
+ )
97
+ except Exception as e:
98
+ logger.error(
99
+ f"Error extracting coordinates for Box ID {box_id}: {str(e)}"
100
+ )
101
+
102
+ action_details = {
103
+ "type": "click",
104
+ "button": "left",
105
+ "box_id": (
106
+ (
107
+ box_id
108
+ if isinstance(box_id, int)
109
+ else int(box_id) if str(box_id).isdigit() else None
110
+ )
111
+ if box_id is not None
112
+ else None
113
+ ),
114
+ "x": x,
115
+ "y": y,
116
+ }
117
+ elif action in ["type", "type_text"] and (text_input or value):
118
+ action_details = {
119
+ "type": "type",
120
+ "text": text_input or value,
121
+ }
122
+ elif action == "hotkey" and value:
123
+ action_details = {
124
+ "type": "hotkey",
125
+ "keys": value,
126
+ }
127
+ elif action == "scroll":
128
+ # Use default coordinates for scrolling
129
+ delta_x = 0
130
+ delta_y = 0
131
+ # Try to extract scroll delta values from content if available
132
+ scroll_data = parsed_json.get("Scroll", {})
133
+ if scroll_data:
134
+ delta_x = scroll_data.get("delta_x", 0)
135
+ delta_y = scroll_data.get("delta_y", 0)
136
+ action_details = {
137
+ "type": "scroll",
138
+ "x": 100,
139
+ "y": 100,
140
+ "scroll_x": delta_x,
141
+ "scroll_y": delta_y,
142
+ }
143
+ elif action == "none":
144
+ # Handle case when action is None (task completion)
145
+ action_details = {"type": "none", "description": "Task completed"}
146
+ except json.JSONDecodeError:
147
+ # If not JSON, just use as reasoning text
148
+ if reasoning_text is None:
149
+ reasoning_text = ""
150
+ reasoning_text += item.get("text", "")
151
+
152
+ # Add reasoning item if we have text content
153
+ if reasoning_text:
154
+ output_items.append(
155
+ {
156
+ "type": "reasoning",
157
+ "id": reasoning_id,
158
+ "summary": [
159
+ {
160
+ "type": "summary_text",
161
+ "text": reasoning_text[:200], # Truncate to reasonable length
162
+ }
163
+ ],
164
+ }
135
165
  )
136
- # Convert to RGB if necessary
137
- if smallest_img.mode in ("RGBA", "LA") or (
138
- smallest_img.mode == "P" and "transparency" in smallest_img.info
139
- ):
140
- rgb_img = Image.new("RGB", smallest_img.size, (255, 255, 255))
141
- rgb_img.paste(
142
- smallest_img, mask=smallest_img.split()[3] if smallest_img.mode == "RGBA" else None
143
- )
144
- rgb_img.save(buffer, format="JPEG", quality=20, optimize=True)
145
- else:
146
- smallest_img.save(buffer, format="JPEG", quality=20, optimize=True)
147
-
148
- buffer.seek(0)
149
- final_data = buffer.getvalue()
150
- final_b64 = base64.b64encode(final_data).decode("utf-8")
151
-
152
- logger.warning(f"Final compressed size: {len(final_b64)} bytes (may still exceed limit)")
153
- return final_b64, "image/jpeg"
154
-
155
- except Exception as e:
156
- logger.error(f"Error compressing image: {str(e)}")
157
- raise
166
+
167
+ # If no action details extracted, use default
168
+ if not action_details:
169
+ action_details = {
170
+ "type": "click",
171
+ "button": "left",
172
+ "x": 100,
173
+ "y": 100,
174
+ }
175
+
176
+ # Add computer_call item
177
+ computer_call = {
178
+ "type": "computer_call",
179
+ "id": action_id,
180
+ "call_id": call_id,
181
+ "action": action_details,
182
+ "pending_safety_checks": [],
183
+ "status": "completed",
184
+ }
185
+ output_items.append(computer_call)
186
+
187
+ # Extract user and assistant messages from the history
188
+ user_messages = []
189
+ assistant_messages = []
190
+ for msg in messages:
191
+ if msg["role"] == "user":
192
+ user_messages.append(msg)
193
+ elif msg["role"] == "assistant":
194
+ assistant_messages.append(msg)
195
+
196
+ # Create the OpenAI-compatible response format with all expected fields
197
+ return {
198
+ "id": response_id,
199
+ "object": "response",
200
+ "created_at": int(time.time()),
201
+ "status": "completed",
202
+ "error": None,
203
+ "incomplete_details": None,
204
+ "instructions": None,
205
+ "max_output_tokens": None,
206
+ "model": model or "unknown",
207
+ "output": output_items,
208
+ "parallel_tool_calls": True,
209
+ "previous_response_id": None,
210
+ "reasoning": {"effort": "medium", "generate_summary": "concise"},
211
+ "store": True,
212
+ "temperature": 1.0,
213
+ "text": {"format": {"type": "text"}},
214
+ "tool_choice": "auto",
215
+ "tools": [
216
+ {
217
+ "type": "computer_use_preview",
218
+ "display_height": 768,
219
+ "display_width": 1024,
220
+ "environment": "mac",
221
+ }
222
+ ],
223
+ "top_p": 1.0,
224
+ "truncation": "auto",
225
+ "usage": {
226
+ "input_tokens": 0, # Placeholder values
227
+ "input_tokens_details": {"cached_tokens": 0},
228
+ "output_tokens": 0, # Placeholder values
229
+ "output_tokens_details": {"reasoning_tokens": 0},
230
+ "total_tokens": 0, # Placeholder values
231
+ },
232
+ "user": None,
233
+ "metadata": {},
234
+ # Include the original response for backward compatibility
235
+ "response": {"choices": [{"message": assistant_msg, "finish_reason": "stop"}]},
236
+ }
agent/providers/openai/__init__.py +6 -0
@@ -0,0 +1,6 @@
1
+ """OpenAI Agent Response API provider for computer control."""
2
+
3
+ from .types import LLMProvider
4
+ from .loop import OpenAILoop
5
+
6
+ __all__ = ["OpenAILoop", "LLMProvider"]