cua-agent 0.3.1__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (111) hide show
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +216 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b1.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b1.dist-info/RECORD +30 -0
  29. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +1 -1
  30. agent/core/__init__.py +0 -27
  31. agent/core/agent.py +0 -210
  32. agent/core/base.py +0 -217
  33. agent/core/callbacks.py +0 -200
  34. agent/core/experiment.py +0 -249
  35. agent/core/factory.py +0 -122
  36. agent/core/messages.py +0 -332
  37. agent/core/provider_config.py +0 -21
  38. agent/core/telemetry.py +0 -142
  39. agent/core/tools/__init__.py +0 -21
  40. agent/core/tools/base.py +0 -74
  41. agent/core/tools/bash.py +0 -52
  42. agent/core/tools/collection.py +0 -46
  43. agent/core/tools/computer.py +0 -113
  44. agent/core/tools/edit.py +0 -67
  45. agent/core/tools/manager.py +0 -56
  46. agent/core/tools.py +0 -32
  47. agent/core/types.py +0 -88
  48. agent/core/visualization.py +0 -197
  49. agent/providers/__init__.py +0 -4
  50. agent/providers/anthropic/__init__.py +0 -6
  51. agent/providers/anthropic/api/client.py +0 -360
  52. agent/providers/anthropic/api/logging.py +0 -150
  53. agent/providers/anthropic/api_handler.py +0 -140
  54. agent/providers/anthropic/callbacks/__init__.py +0 -5
  55. agent/providers/anthropic/callbacks/manager.py +0 -65
  56. agent/providers/anthropic/loop.py +0 -568
  57. agent/providers/anthropic/prompts.py +0 -23
  58. agent/providers/anthropic/response_handler.py +0 -226
  59. agent/providers/anthropic/tools/__init__.py +0 -33
  60. agent/providers/anthropic/tools/base.py +0 -88
  61. agent/providers/anthropic/tools/bash.py +0 -66
  62. agent/providers/anthropic/tools/collection.py +0 -34
  63. agent/providers/anthropic/tools/computer.py +0 -396
  64. agent/providers/anthropic/tools/edit.py +0 -326
  65. agent/providers/anthropic/tools/manager.py +0 -54
  66. agent/providers/anthropic/tools/run.py +0 -42
  67. agent/providers/anthropic/types.py +0 -16
  68. agent/providers/anthropic/utils.py +0 -367
  69. agent/providers/omni/__init__.py +0 -8
  70. agent/providers/omni/api_handler.py +0 -42
  71. agent/providers/omni/clients/anthropic.py +0 -103
  72. agent/providers/omni/clients/base.py +0 -35
  73. agent/providers/omni/clients/oaicompat.py +0 -195
  74. agent/providers/omni/clients/ollama.py +0 -122
  75. agent/providers/omni/clients/openai.py +0 -155
  76. agent/providers/omni/clients/utils.py +0 -25
  77. agent/providers/omni/image_utils.py +0 -34
  78. agent/providers/omni/loop.py +0 -990
  79. agent/providers/omni/parser.py +0 -307
  80. agent/providers/omni/prompts.py +0 -64
  81. agent/providers/omni/tools/__init__.py +0 -30
  82. agent/providers/omni/tools/base.py +0 -29
  83. agent/providers/omni/tools/bash.py +0 -74
  84. agent/providers/omni/tools/computer.py +0 -179
  85. agent/providers/omni/tools/manager.py +0 -61
  86. agent/providers/omni/utils.py +0 -236
  87. agent/providers/openai/__init__.py +0 -6
  88. agent/providers/openai/api_handler.py +0 -456
  89. agent/providers/openai/loop.py +0 -472
  90. agent/providers/openai/response_handler.py +0 -205
  91. agent/providers/openai/tools/__init__.py +0 -15
  92. agent/providers/openai/tools/base.py +0 -79
  93. agent/providers/openai/tools/computer.py +0 -326
  94. agent/providers/openai/tools/manager.py +0 -106
  95. agent/providers/openai/types.py +0 -36
  96. agent/providers/openai/utils.py +0 -98
  97. agent/providers/uitars/__init__.py +0 -1
  98. agent/providers/uitars/clients/base.py +0 -35
  99. agent/providers/uitars/clients/mlxvlm.py +0 -263
  100. agent/providers/uitars/clients/oaicompat.py +0 -214
  101. agent/providers/uitars/loop.py +0 -660
  102. agent/providers/uitars/prompts.py +0 -63
  103. agent/providers/uitars/tools/__init__.py +0 -1
  104. agent/providers/uitars/tools/computer.py +0 -283
  105. agent/providers/uitars/tools/manager.py +0 -60
  106. agent/providers/uitars/utils.py +0 -264
  107. agent/telemetry.py +0 -21
  108. agent/ui/__main__.py +0 -15
  109. cua_agent-0.3.1.dist-info/METADATA +0 -295
  110. cua_agent-0.3.1.dist-info/RECORD +0 -87
  111. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
@@ -1,61 +0,0 @@
1
- """Tool manager for the Omni provider."""
2
-
3
- from typing import Any, Dict, List
4
- from computer.computer import Computer
5
-
6
- from ....core.tools import BaseToolManager, ToolResult
7
- from ....core.tools.collection import ToolCollection
8
- from .computer import ComputerTool
9
- from .bash import BashTool
10
- from ....core.types import LLMProvider
11
-
12
-
13
class ToolManager(BaseToolManager):
    """Tool manager for the Omni provider.

    Owns one ``ComputerTool`` and one ``BashTool`` built around the same
    computer, and forwards parameter queries and executions to the tool
    collection held in ``self.tools``.
    """

    def __init__(self, computer: Computer, provider: LLMProvider):
        """Create the manager together with its Omni-specific tools.

        Args:
            computer: Computer instance handed to the base manager and tools
            provider: The LLM provider being used
        """
        super().__init__(computer)
        self.provider = provider
        # Both tools are built from self.computer, i.e. whatever the base
        # class stored, rather than from the raw constructor argument.
        self.computer_tool = ComputerTool(self.computer)
        self.bash_tool = BashTool(self.computer)

    def _initialize_tools(self) -> ToolCollection:
        """Bundle the computer and bash tools into a single collection."""
        collection = ToolCollection(self.computer_tool, self.bash_tool)
        return collection

    async def _initialize_tools_specific(self) -> None:
        """Run Omni-specific setup: initialize the computer tool's dimensions."""
        await self.computer_tool.initialize_dimensions()

    def get_tool_params(self) -> List[Dict[str, Any]]:
        """Return the tool parameters to send with API calls.

        Returns:
            List of tool parameters produced by the tool collection

        Raises:
            RuntimeError: If the tool collection has not been set up yet
        """
        if self.tools is not None:
            return self.tools.to_params()
        raise RuntimeError("Tools not initialized. Call initialize() first.")

    async def execute_tool(self, name: str, tool_input: dict[str, Any]) -> ToolResult:
        """Run the named tool with the supplied input.

        Args:
            name: Name of the tool to execute
            tool_input: Input parameters for the tool

        Returns:
            Result of the tool execution

        Raises:
            RuntimeError: If the tool collection has not been set up yet
        """
        if self.tools is not None:
            return await self.tools.run(name=name, tool_input=tool_input)
        raise RuntimeError("Tools not initialized. Call initialize() first.")
@@ -1,236 +0,0 @@
1
- """Main entry point for computer agents."""
2
-
3
- import asyncio
4
- import json
5
- import logging
6
- import os
7
- from typing import Any, Dict, List, Optional
8
- from som.models import ParseResult
9
- from ...core.types import AgentResponse
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
- async def to_openai_agent_response_format(
15
- response: Any,
16
- messages: List[Dict[str, Any]],
17
- parsed_screen: Optional[ParseResult] = None,
18
- parser: Optional[Any] = None,
19
- model: Optional[str] = None,
20
- ) -> AgentResponse:
21
- """Create an OpenAI computer use agent compatible response format.
22
-
23
- Args:
24
- response: The original API response
25
- messages: List of messages in standard OpenAI format
26
- parsed_screen: Optional pre-parsed screen information
27
- parser: Optional parser instance for coordinate calculation
28
- model: Optional model name
29
-
30
- Returns:
31
- A response formatted according to OpenAI's computer use agent standard, including:
32
- - All standard OpenAI computer use agent fields
33
- - Original response in response.choices[0].message
34
- - Full message history in messages field
35
- """
36
- from datetime import datetime
37
- import time
38
-
39
- # Create a unique ID for this response
40
- response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
41
- reasoning_id = f"rs_{response_id}"
42
- action_id = f"cu_{response_id}"
43
- call_id = f"call_{response_id}"
44
-
45
- # Extract the last assistant message
46
- assistant_msg = None
47
- for msg in reversed(messages):
48
- if msg["role"] == "assistant":
49
- assistant_msg = msg
50
- break
51
-
52
- if not assistant_msg:
53
- # If no assistant message found, create a default one
54
- assistant_msg = {"role": "assistant", "content": "No response available"}
55
-
56
- # Initialize output array
57
- output_items = []
58
-
59
- # Extract reasoning and action details from the response
60
- content = assistant_msg["content"]
61
- reasoning_text = None
62
- action_details = None
63
-
64
- for item in content:
65
- if isinstance(item, dict) and item.get("type") == "text":
66
- try:
67
- # Try to parse JSON from text block
68
- text_content = item.get("text", "")
69
- parsed_json = json.loads(text_content)
70
-
71
- # Get reasoning text
72
- if reasoning_text is None:
73
- reasoning_text = parsed_json.get("Explanation", "")
74
-
75
- # Extract action details
76
- action = parsed_json.get("Action", "").lower()
77
- text_input = parsed_json.get("Text", "")
78
- value = parsed_json.get("Value", "") # Also handle Value field
79
- box_id = parsed_json.get("Box ID") # Extract Box ID
80
-
81
- if action in ["click", "left_click"]:
82
- # Always calculate coordinates from Box ID for click actions
83
- x, y = 100, 100 # Default fallback values
84
-
85
- if parsed_screen and box_id is not None and parser is not None:
86
- try:
87
- box_id_int = (
88
- box_id
89
- if isinstance(box_id, int)
90
- else int(str(box_id)) if str(box_id).isdigit() else None
91
- )
92
- if box_id_int is not None:
93
- # Use the parser's method to calculate coordinates
94
- x, y = await parser.calculate_click_coordinates(
95
- box_id_int, parsed_screen
96
- )
97
- except Exception as e:
98
- logger.error(
99
- f"Error extracting coordinates for Box ID {box_id}: {str(e)}"
100
- )
101
-
102
- action_details = {
103
- "type": "click",
104
- "button": "left",
105
- "box_id": (
106
- (
107
- box_id
108
- if isinstance(box_id, int)
109
- else int(box_id) if str(box_id).isdigit() else None
110
- )
111
- if box_id is not None
112
- else None
113
- ),
114
- "x": x,
115
- "y": y,
116
- }
117
- elif action in ["type", "type_text"] and (text_input or value):
118
- action_details = {
119
- "type": "type",
120
- "text": text_input or value,
121
- }
122
- elif action == "hotkey" and value:
123
- action_details = {
124
- "type": "hotkey",
125
- "keys": value,
126
- }
127
- elif action == "scroll":
128
- # Use default coordinates for scrolling
129
- delta_x = 0
130
- delta_y = 0
131
- # Try to extract scroll delta values from content if available
132
- scroll_data = parsed_json.get("Scroll", {})
133
- if scroll_data:
134
- delta_x = scroll_data.get("delta_x", 0)
135
- delta_y = scroll_data.get("delta_y", 0)
136
- action_details = {
137
- "type": "scroll",
138
- "x": 100,
139
- "y": 100,
140
- "scroll_x": delta_x,
141
- "scroll_y": delta_y,
142
- }
143
- elif action == "none":
144
- # Handle case when action is None (task completion)
145
- action_details = {"type": "none", "description": "Task completed"}
146
- except json.JSONDecodeError:
147
- # If not JSON, just use as reasoning text
148
- if reasoning_text is None:
149
- reasoning_text = ""
150
- reasoning_text += item.get("text", "")
151
-
152
- # Add reasoning item if we have text content
153
- if reasoning_text:
154
- output_items.append(
155
- {
156
- "type": "reasoning",
157
- "id": reasoning_id,
158
- "summary": [
159
- {
160
- "type": "summary_text",
161
- "text": reasoning_text[:200], # Truncate to reasonable length
162
- }
163
- ],
164
- }
165
- )
166
-
167
- # If no action details extracted, use default
168
- if not action_details:
169
- action_details = {
170
- "type": "click",
171
- "button": "left",
172
- "x": 100,
173
- "y": 100,
174
- }
175
-
176
- # Add computer_call item
177
- computer_call = {
178
- "type": "computer_call",
179
- "id": action_id,
180
- "call_id": call_id,
181
- "action": action_details,
182
- "pending_safety_checks": [],
183
- "status": "completed",
184
- }
185
- output_items.append(computer_call)
186
-
187
- # Extract user and assistant messages from the history
188
- user_messages = []
189
- assistant_messages = []
190
- for msg in messages:
191
- if msg["role"] == "user":
192
- user_messages.append(msg)
193
- elif msg["role"] == "assistant":
194
- assistant_messages.append(msg)
195
-
196
- # Create the OpenAI-compatible response format with all expected fields
197
- return {
198
- "id": response_id,
199
- "object": "response",
200
- "created_at": int(time.time()),
201
- "status": "completed",
202
- "error": None,
203
- "incomplete_details": None,
204
- "instructions": None,
205
- "max_output_tokens": None,
206
- "model": model or "unknown",
207
- "output": output_items,
208
- "parallel_tool_calls": True,
209
- "previous_response_id": None,
210
- "reasoning": {"effort": "medium", "generate_summary": "concise"},
211
- "store": True,
212
- "temperature": 1.0,
213
- "text": {"format": {"type": "text"}},
214
- "tool_choice": "auto",
215
- "tools": [
216
- {
217
- "type": "computer_use_preview",
218
- "display_height": 768,
219
- "display_width": 1024,
220
- "environment": "mac",
221
- }
222
- ],
223
- "top_p": 1.0,
224
- "truncation": "auto",
225
- "usage": {
226
- "input_tokens": 0, # Placeholder values
227
- "input_tokens_details": {"cached_tokens": 0},
228
- "output_tokens": 0, # Placeholder values
229
- "output_tokens_details": {"reasoning_tokens": 0},
230
- "total_tokens": 0, # Placeholder values
231
- },
232
- "user": None,
233
- "metadata": {},
234
- # Include the original response for backward compatibility
235
- "response": {"choices": [{"message": assistant_msg, "finish_reason": "stop"}]},
236
- }
@@ -1,6 +0,0 @@
1
- """OpenAI Agent Response API provider for computer control."""
2
-
3
- from .types import LLMProvider
4
- from .loop import OpenAILoop
5
-
6
- __all__ = ["OpenAILoop", "LLMProvider"]