cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (111) hide show
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +216 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b1.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b1.dist-info/RECORD +30 -0
  29. agent/core/__init__.py +0 -27
  30. agent/core/agent.py +0 -210
  31. agent/core/base.py +0 -217
  32. agent/core/callbacks.py +0 -200
  33. agent/core/experiment.py +0 -249
  34. agent/core/factory.py +0 -122
  35. agent/core/messages.py +0 -332
  36. agent/core/provider_config.py +0 -21
  37. agent/core/telemetry.py +0 -142
  38. agent/core/tools/__init__.py +0 -21
  39. agent/core/tools/base.py +0 -74
  40. agent/core/tools/bash.py +0 -52
  41. agent/core/tools/collection.py +0 -46
  42. agent/core/tools/computer.py +0 -113
  43. agent/core/tools/edit.py +0 -67
  44. agent/core/tools/manager.py +0 -56
  45. agent/core/tools.py +0 -32
  46. agent/core/types.py +0 -88
  47. agent/core/visualization.py +0 -197
  48. agent/providers/__init__.py +0 -4
  49. agent/providers/anthropic/__init__.py +0 -6
  50. agent/providers/anthropic/api/client.py +0 -360
  51. agent/providers/anthropic/api/logging.py +0 -150
  52. agent/providers/anthropic/api_handler.py +0 -140
  53. agent/providers/anthropic/callbacks/__init__.py +0 -5
  54. agent/providers/anthropic/callbacks/manager.py +0 -65
  55. agent/providers/anthropic/loop.py +0 -568
  56. agent/providers/anthropic/prompts.py +0 -23
  57. agent/providers/anthropic/response_handler.py +0 -226
  58. agent/providers/anthropic/tools/__init__.py +0 -33
  59. agent/providers/anthropic/tools/base.py +0 -88
  60. agent/providers/anthropic/tools/bash.py +0 -66
  61. agent/providers/anthropic/tools/collection.py +0 -34
  62. agent/providers/anthropic/tools/computer.py +0 -396
  63. agent/providers/anthropic/tools/edit.py +0 -326
  64. agent/providers/anthropic/tools/manager.py +0 -54
  65. agent/providers/anthropic/tools/run.py +0 -42
  66. agent/providers/anthropic/types.py +0 -16
  67. agent/providers/anthropic/utils.py +0 -381
  68. agent/providers/omni/__init__.py +0 -8
  69. agent/providers/omni/api_handler.py +0 -42
  70. agent/providers/omni/clients/anthropic.py +0 -103
  71. agent/providers/omni/clients/base.py +0 -35
  72. agent/providers/omni/clients/oaicompat.py +0 -195
  73. agent/providers/omni/clients/ollama.py +0 -122
  74. agent/providers/omni/clients/openai.py +0 -155
  75. agent/providers/omni/clients/utils.py +0 -25
  76. agent/providers/omni/image_utils.py +0 -34
  77. agent/providers/omni/loop.py +0 -990
  78. agent/providers/omni/parser.py +0 -307
  79. agent/providers/omni/prompts.py +0 -64
  80. agent/providers/omni/tools/__init__.py +0 -30
  81. agent/providers/omni/tools/base.py +0 -29
  82. agent/providers/omni/tools/bash.py +0 -74
  83. agent/providers/omni/tools/computer.py +0 -179
  84. agent/providers/omni/tools/manager.py +0 -61
  85. agent/providers/omni/utils.py +0 -236
  86. agent/providers/openai/__init__.py +0 -6
  87. agent/providers/openai/api_handler.py +0 -456
  88. agent/providers/openai/loop.py +0 -472
  89. agent/providers/openai/response_handler.py +0 -205
  90. agent/providers/openai/tools/__init__.py +0 -15
  91. agent/providers/openai/tools/base.py +0 -79
  92. agent/providers/openai/tools/computer.py +0 -326
  93. agent/providers/openai/tools/manager.py +0 -106
  94. agent/providers/openai/types.py +0 -36
  95. agent/providers/openai/utils.py +0 -98
  96. agent/providers/uitars/__init__.py +0 -1
  97. agent/providers/uitars/clients/base.py +0 -35
  98. agent/providers/uitars/clients/mlxvlm.py +0 -263
  99. agent/providers/uitars/clients/oaicompat.py +0 -214
  100. agent/providers/uitars/loop.py +0 -660
  101. agent/providers/uitars/prompts.py +0 -63
  102. agent/providers/uitars/tools/__init__.py +0 -1
  103. agent/providers/uitars/tools/computer.py +0 -283
  104. agent/providers/uitars/tools/manager.py +0 -60
  105. agent/providers/uitars/utils.py +0 -264
  106. agent/telemetry.py +0 -21
  107. agent/ui/__main__.py +0 -15
  108. cua_agent-0.3.2.dist-info/METADATA +0 -295
  109. cua_agent-0.3.2.dist-info/RECORD +0 -87
  110. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +0 -0
  111. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
@@ -1,79 +0,0 @@
1
- """OpenAI-specific tool base classes."""
2
-
3
- from abc import ABCMeta, abstractmethod
4
- from dataclasses import dataclass, fields, replace
5
- from typing import Any, Dict, List, Optional
6
-
7
- from ....core.tools.base import BaseTool
8
-
9
-
10
- class BaseOpenAITool(BaseTool, metaclass=ABCMeta):
11
- """Abstract base class for OpenAI-defined tools."""
12
-
13
- def __init__(self):
14
- """Initialize the base OpenAI tool."""
15
- # No specific initialization needed yet, but included for future extensibility
16
- pass
17
-
18
- @abstractmethod
19
- async def __call__(self, **kwargs) -> Any:
20
- """Executes the tool with the given arguments."""
21
- ...
22
-
23
- @abstractmethod
24
- def to_params(self) -> Dict[str, Any]:
25
- """Convert tool to OpenAI-specific API parameters.
26
-
27
- Returns:
28
- Dictionary with tool parameters for OpenAI API
29
- """
30
- raise NotImplementedError
31
-
32
-
33
- @dataclass(kw_only=True, frozen=True)
34
- class ToolResult:
35
- """Represents the result of a tool execution."""
36
-
37
- output: str | None = None
38
- error: str | None = None
39
- base64_image: str | None = None
40
- system: str | None = None
41
- content: list[dict] | None = None
42
-
43
- def __bool__(self):
44
- return any(getattr(self, field.name) for field in fields(self))
45
-
46
- def __add__(self, other: "ToolResult"):
47
- def combine_fields(field: str | None, other_field: str | None, concatenate: bool = True):
48
- if field and other_field:
49
- if concatenate:
50
- return field + other_field
51
- raise ValueError("Cannot combine tool results")
52
- return field or other_field
53
-
54
- return ToolResult(
55
- output=combine_fields(self.output, other.output),
56
- error=combine_fields(self.error, other.error),
57
- base64_image=combine_fields(self.base64_image, other.base64_image, False),
58
- system=combine_fields(self.system, other.system),
59
- content=self.content or other.content, # Use first non-None content
60
- )
61
-
62
- def replace(self, **kwargs):
63
- """Returns a new ToolResult with the given fields replaced."""
64
- return replace(self, **kwargs)
65
-
66
-
67
- class CLIResult(ToolResult):
68
- """A ToolResult that can be rendered as a CLI output."""
69
-
70
-
71
- class ToolFailure(ToolResult):
72
- """A ToolResult that represents a failure."""
73
-
74
-
75
- class ToolError(Exception):
76
- """Raised when a tool encounters an error."""
77
-
78
- def __init__(self, message):
79
- self.message = message
@@ -1,326 +0,0 @@
1
- """Computer tool for OpenAI."""
2
-
3
- import asyncio
4
- import base64
5
- import logging
6
- from typing import Literal, Any, Dict, Optional, List, Union
7
-
8
- from computer.computer import Computer
9
-
10
- from .base import BaseOpenAITool, ToolError, ToolResult
11
- from ....core.tools.computer import BaseComputerTool
12
-
13
- TYPING_DELAY_MS = 12
14
- TYPING_GROUP_SIZE = 50
15
-
16
- # Key mapping for special keys
17
- KEY_MAPPING = {
18
- "enter": "return",
19
- "backspace": "delete",
20
- "delete": "forwarddelete",
21
- "escape": "esc",
22
- "pageup": "page_up",
23
- "pagedown": "page_down",
24
- "arrowup": "up",
25
- "arrowdown": "down",
26
- "arrowleft": "left",
27
- "arrowright": "right",
28
- "home": "home",
29
- "end": "end",
30
- "tab": "tab",
31
- "space": "space",
32
- "shift": "shift",
33
- "control": "control",
34
- "alt": "alt",
35
- "meta": "command",
36
- }
37
-
38
- Action = Literal[
39
- "key",
40
- "type",
41
- "mouse_move",
42
- "left_click",
43
- "right_click",
44
- "double_click",
45
- "screenshot",
46
- "scroll",
47
- "drag",
48
- ]
49
-
50
-
51
- class ComputerTool(BaseComputerTool, BaseOpenAITool):
52
- """
53
- A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
54
- """
55
-
56
- name: Literal["computer"] = "computer"
57
- api_type: Literal["computer_use_preview"] = "computer_use_preview"
58
- width: Optional[int] = None
59
- height: Optional[int] = None
60
- display_num: Optional[int] = None
61
- computer: Computer # The CUA Computer instance
62
- logger = logging.getLogger(__name__)
63
-
64
- def __init__(self, computer: Computer):
65
- """Initialize the computer tool.
66
-
67
- Args:
68
- computer: Computer instance
69
- """
70
- self.computer = computer
71
- self.width = None
72
- self.height = None
73
- self.logger = logging.getLogger(__name__)
74
-
75
- # Initialize the base computer tool first
76
- BaseComputerTool.__init__(self, computer)
77
- # Then initialize the OpenAI tool
78
- BaseOpenAITool.__init__(self)
79
-
80
- # Additional initialization
81
- self.width = None # Will be initialized from computer interface
82
- self.height = None # Will be initialized from computer interface
83
- self.display_num = None
84
-
85
- def to_params(self) -> Dict[str, Any]:
86
- """Convert tool to API parameters.
87
-
88
- Returns:
89
- Dictionary with tool parameters
90
- """
91
- if self.width is None or self.height is None:
92
- raise RuntimeError(
93
- "Screen dimensions not initialized. Call initialize_dimensions() first."
94
- )
95
- return {
96
- "type": self.api_type,
97
- "display_width": self.width,
98
- "display_height": self.height,
99
- "display_number": self.display_num,
100
- }
101
-
102
- async def initialize_dimensions(self):
103
- """Initialize screen dimensions from the computer interface."""
104
- try:
105
- display_size = await self.computer.interface.get_screen_size()
106
- self.width = display_size["width"]
107
- self.height = display_size["height"]
108
- assert isinstance(self.width, int) and isinstance(self.height, int)
109
- self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")
110
- except Exception as e:
111
- # Fall back to defaults if we can't get accurate dimensions
112
- self.width = 1024
113
- self.height = 768
114
- self.logger.warning(
115
- f"Failed to get screen dimensions, using defaults: {self.width}x{self.height}. Error: {e}"
116
- )
117
-
118
- async def __call__(
119
- self,
120
- *,
121
- type: str, # OpenAI uses 'type' instead of 'action'
122
- text: Optional[str] = None,
123
- **kwargs,
124
- ):
125
- try:
126
- # Ensure dimensions are initialized
127
- if self.width is None or self.height is None:
128
- await self.initialize_dimensions()
129
- if self.width is None or self.height is None:
130
- raise ToolError("Failed to initialize screen dimensions")
131
-
132
- if type == "type":
133
- if text is None:
134
- raise ToolError("text is required for type action")
135
- return await self.handle_typing(text)
136
- elif type == "click":
137
- # Map button to correct action name
138
- button = kwargs.get("button")
139
- if button is None:
140
- raise ToolError("button is required for click action")
141
- return await self.handle_click(button, kwargs["x"], kwargs["y"])
142
- elif type == "keypress":
143
- # Check for keys in kwargs if text is None
144
- if text is None:
145
- if "keys" in kwargs and isinstance(kwargs["keys"], list):
146
- # Pass the keys list directly instead of joining and then splitting
147
- return await self.handle_key(kwargs["keys"])
148
- else:
149
- raise ToolError("Either 'text' or 'keys' is required for keypress action")
150
- return await self.handle_key(text)
151
- elif type == "mouse_move":
152
- if "coordinates" not in kwargs:
153
- raise ToolError("coordinates is required for mouse_move action")
154
- return await self.handle_mouse_move(
155
- kwargs["coordinates"][0], kwargs["coordinates"][1]
156
- )
157
- elif type == "scroll":
158
- # Get x, y coordinates directly from kwargs
159
- x = kwargs.get("x")
160
- y = kwargs.get("y")
161
- if x is None or y is None:
162
- raise ToolError("x and y coordinates are required for scroll action")
163
- scroll_x = kwargs.get("scroll_x", 0) // 50
164
- scroll_y = kwargs.get("scroll_y", 0) // 50
165
- return await self.handle_scroll(x, y, scroll_x, scroll_y)
166
- elif type == "drag":
167
- path = kwargs.get("path")
168
- if not path or not isinstance(path, list) or len(path) < 2:
169
- raise ToolError("path is required for drag action and must contain at least 2 points")
170
- return await self.handle_drag(path)
171
- elif type == "screenshot":
172
- return await self.screenshot()
173
- elif type == "wait":
174
- duration = kwargs.get("duration", 1.0)
175
- await asyncio.sleep(duration)
176
- return await self.screenshot()
177
- else:
178
- raise ToolError(f"Unsupported action: {type}")
179
-
180
- except Exception as e:
181
- self.logger.error(f"Error in ComputerTool.__call__: {str(e)}")
182
- raise ToolError(f"Failed to execute {type}: {str(e)}")
183
-
184
- async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
185
- """Handle mouse clicks."""
186
- try:
187
- # Perform the click based on button type
188
- if button == "left":
189
- await self.computer.interface.left_click(x, y)
190
- elif button == "right":
191
- await self.computer.interface.right_click(x, y)
192
- elif button == "double":
193
- await self.computer.interface.double_click(x, y)
194
- else:
195
- raise ToolError(f"Unsupported button type: {button}")
196
-
197
- # Wait briefly for UI to update
198
- await asyncio.sleep(0.3)
199
-
200
- return ToolResult(
201
- output=f"Performed {button} click at ({x}, {y})",
202
- )
203
- except Exception as e:
204
- self.logger.error(f"Error in handle_click: {str(e)}")
205
- raise ToolError(f"Failed to perform {button} click at ({x}, {y}): {str(e)}")
206
-
207
- async def handle_typing(self, text: str) -> ToolResult:
208
- """Handle typing text with a small delay between characters."""
209
- try:
210
- # Type the text with a small delay
211
- await self.computer.interface.type_text(text)
212
-
213
- await asyncio.sleep(0.3)
214
-
215
- return ToolResult(output=f"Typed: {text}")
216
- except Exception as e:
217
- self.logger.error(f"Error in handle_typing: {str(e)}")
218
- raise ToolError(f"Failed to type '{text}': {str(e)}")
219
-
220
- async def handle_key(self, key: Union[str, List[str]]) -> ToolResult:
221
- """Handle key press, supporting both single keys and combinations.
222
-
223
- Args:
224
- key: Either a string (e.g. "ctrl+c") or a list of keys (e.g. ["ctrl", "c"])
225
- """
226
- try:
227
- # Check if key is already a list
228
- if isinstance(key, list):
229
- keys = [k.strip().lower() for k in key]
230
- else:
231
- # Split key string into list if it's a combination (e.g. "ctrl+c")
232
- keys = [k.strip().lower() for k in key.split("+")]
233
-
234
- # Map each key
235
- mapped_keys = [KEY_MAPPING.get(k, k) for k in keys]
236
-
237
- if len(mapped_keys) > 1:
238
- # For key combinations (like Ctrl+C)
239
- await self.computer.interface.hotkey(*mapped_keys)
240
- else:
241
- # Single key press
242
- await self.computer.interface.press_key(mapped_keys[0])
243
-
244
- # Wait briefly
245
- await asyncio.sleep(0.3)
246
-
247
- return ToolResult(output=f"Pressed key: {key}")
248
- except Exception as e:
249
- self.logger.error(f"Error in handle_key: {str(e)}")
250
- raise ToolError(f"Failed to press key '{key}': {str(e)}")
251
-
252
- async def handle_mouse_move(self, x: int, y: int) -> ToolResult:
253
- """Handle mouse movement."""
254
- try:
255
- # Move cursor to position
256
- await self.computer.interface.move_cursor(x, y)
257
-
258
- # Wait briefly
259
- await asyncio.sleep(0.2)
260
-
261
- return ToolResult(output=f"Moved cursor to ({x}, {y})")
262
- except Exception as e:
263
- self.logger.error(f"Error in handle_mouse_move: {str(e)}")
264
- raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")
265
-
266
- async def handle_scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> ToolResult:
267
- """Handle scrolling."""
268
- try:
269
- # Move cursor to position first
270
- await self.computer.interface.move_cursor(x, y)
271
-
272
- # Scroll based on direction
273
- if scroll_y > 0:
274
- await self.computer.interface.scroll_down(abs(scroll_y))
275
- elif scroll_y < 0:
276
- await self.computer.interface.scroll_up(abs(scroll_y))
277
-
278
- # Wait for UI to update
279
- await asyncio.sleep(0.5)
280
-
281
- return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})")
282
- except Exception as e:
283
- self.logger.error(f"Error in handle_scroll: {str(e)}")
284
- raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")
285
-
286
- async def handle_drag(self, path: List[Dict[str, int]]) -> ToolResult:
287
- """Handle mouse drag operation using a path of coordinates.
288
-
289
- Args:
290
- path: List of coordinate points {"x": int, "y": int} defining the drag path
291
-
292
- Returns:
293
- ToolResult with the operation result and screenshot
294
- """
295
- try:
296
- # Convert from [{"x": x, "y": y}, ...] format to [(x, y), ...] format
297
- points = [(p["x"], p["y"]) for p in path]
298
-
299
- # Perform drag action
300
- if len(points) == 2:
301
- await self.computer.interface.move_cursor(points[0][0], points[0][1])
302
- await self.computer.interface.drag_to(points[1][0], points[1][1])
303
- else:
304
- await self.computer.interface.drag(points, button="left")
305
-
306
- # Wait for UI to update
307
- await asyncio.sleep(0.5)
308
-
309
- return ToolResult(
310
- output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
311
- )
312
- except Exception as e:
313
- self.logger.error(f"Error in handle_drag: {str(e)}")
314
- raise ToolError(f"Failed to perform drag operation: {str(e)}")
315
-
316
- async def screenshot(self) -> ToolResult:
317
- """Take a screenshot."""
318
- try:
319
- # Take screenshot
320
- screenshot = await self.computer.interface.screenshot()
321
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
322
-
323
- return ToolResult(output="Screenshot taken", base64_image=base64_screenshot)
324
- except Exception as e:
325
- self.logger.error(f"Error in screenshot: {str(e)}")
326
- raise ToolError(f"Failed to take screenshot: {str(e)}")
@@ -1,106 +0,0 @@
1
- """Tool manager for the OpenAI provider."""
2
-
3
- import logging
4
- from typing import Dict, Any, Optional, List, Callable, Awaitable, Union
5
-
6
- from computer import Computer
7
- from ..types import ComputerAction, ResponseItemType
8
- from .computer import ComputerTool
9
- from ....core.tools.base import ToolResult, ToolFailure
10
- from ....core.tools.collection import ToolCollection
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
- class ToolManager:
16
- """Manager for computer tools in the OpenAI agent."""
17
-
18
- def __init__(
19
- self,
20
- computer: Computer,
21
- acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None,
22
- ):
23
- """Initialize the tool manager.
24
-
25
- Args:
26
- computer: Computer instance
27
- acknowledge_safety_check_callback: Optional callback for safety check acknowledgment
28
- """
29
- self.computer = computer
30
- self.acknowledge_safety_check_callback = acknowledge_safety_check_callback
31
- self._initialized = False
32
- self.computer_tool = ComputerTool(computer)
33
- self.tools = None
34
- logger.info("Initialized OpenAI ToolManager")
35
-
36
- async def initialize(self) -> None:
37
- """Initialize the tool manager."""
38
- if not self._initialized:
39
- logger.info("Initializing OpenAI ToolManager")
40
-
41
- # Initialize the computer tool
42
- await self.computer_tool.initialize_dimensions()
43
-
44
- # Initialize tool collection
45
- self.tools = ToolCollection(self.computer_tool)
46
-
47
- self._initialized = True
48
- logger.info("OpenAI ToolManager initialized")
49
-
50
- async def get_tools_definition(self) -> List[Dict[str, Any]]:
51
- """Get the tools definition for the OpenAI agent.
52
-
53
- Returns:
54
- Tools definition for the OpenAI agent
55
- """
56
- if not self.tools:
57
- raise RuntimeError("Tools not initialized. Call initialize() first.")
58
-
59
- # For the OpenAI Agent Response API, we use a special "computer-preview" tool
60
- # which provides the correct interface for computer control
61
- display_width, display_height = await self._get_computer_dimensions()
62
-
63
- # Get environment, using "mac" as default since we're on macOS
64
- environment = getattr(self.computer, "environment", "mac")
65
-
66
- # Ensure environment is one of the allowed values
67
- if environment not in ["windows", "mac", "linux", "browser"]:
68
- logger.warning(f"Invalid environment value: {environment}, using 'mac' instead")
69
- environment = "mac"
70
-
71
- return [
72
- {
73
- "type": "computer-preview",
74
- "display_width": display_width,
75
- "display_height": display_height,
76
- "environment": environment,
77
- }
78
- ]
79
-
80
- async def _get_computer_dimensions(self) -> tuple[int, int]:
81
- """Get the dimensions of the computer display.
82
-
83
- Returns:
84
- Tuple of (width, height)
85
- """
86
- # If computer tool is initialized, use its dimensions
87
- if self.computer_tool.width is not None and self.computer_tool.height is not None:
88
- return (self.computer_tool.width, self.computer_tool.height)
89
-
90
- # Try to get from computer.interface if available
91
- screen_size = await self.computer.interface.get_screen_size()
92
- return (int(screen_size["width"]), int(screen_size["height"]))
93
-
94
- async def execute_tool(self, name: str, tool_input: Dict[str, Any]) -> ToolResult:
95
- """Execute a tool with the given input.
96
-
97
- Args:
98
- name: Name of the tool to execute
99
- tool_input: Input parameters for the tool
100
-
101
- Returns:
102
- Result of the tool execution
103
- """
104
- if not self.tools:
105
- raise RuntimeError("Tools not initialized. Call initialize() first.")
106
- return await self.tools.run(name=name, tool_input=tool_input)
@@ -1,36 +0,0 @@
1
- """Type definitions for the OpenAI provider."""
2
-
3
- from enum import StrEnum, auto
4
- from typing import Dict, List, Optional, Union, Any
5
- from dataclasses import dataclass
6
-
7
-
8
- class LLMProvider(StrEnum):
9
- """OpenAI LLM provider types."""
10
-
11
- OPENAI = "openai"
12
-
13
-
14
- class ResponseItemType(StrEnum):
15
- """Types of items in OpenAI Agent Response output."""
16
-
17
- MESSAGE = "message"
18
- COMPUTER_CALL = "computer_call"
19
- COMPUTER_CALL_OUTPUT = "computer_call_output"
20
- REASONING = "reasoning"
21
-
22
-
23
- @dataclass
24
- class ComputerAction:
25
- """Represents a computer action to be performed."""
26
-
27
- type: str
28
- x: Optional[int] = None
29
- y: Optional[int] = None
30
- text: Optional[str] = None
31
- button: Optional[str] = None
32
- keys: Optional[List[str]] = None
33
- ms: Optional[int] = None
34
- scroll_x: Optional[int] = None
35
- scroll_y: Optional[int] = None
36
- path: Optional[List[Dict[str, int]]] = None
@@ -1,98 +0,0 @@
1
- """Utility functions for the OpenAI provider."""
2
-
3
- import logging
4
- import json
5
- import base64
6
- from typing import Any, Dict, List, Optional
7
-
8
- from ...core.types import AgentResponse
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
- def format_images_for_openai(images_base64: List[str]) -> List[Dict[str, Any]]:
14
- """Format images for OpenAI Agent Response API.
15
-
16
- Args:
17
- images_base64: List of base64 encoded images
18
-
19
- Returns:
20
- List of formatted image items for Agent Response API
21
- """
22
- return [
23
- {"type": "input_image", "image_url": f"data:image/png;base64,{image}"}
24
- for image in images_base64
25
- ]
26
-
27
-
28
- def extract_message_content(message: Dict[str, Any]) -> str:
29
- """Extract text content from a message.
30
-
31
- Args:
32
- message: Message to extract content from
33
-
34
- Returns:
35
- Text content from the message
36
- """
37
- if isinstance(message.get("content"), str):
38
- return message["content"]
39
-
40
- if isinstance(message.get("content"), list):
41
- text = ""
42
- role = message.get("role", "user")
43
-
44
- for item in message["content"]:
45
- if isinstance(item, dict):
46
- # For user messages
47
- if role == "user" and item.get("type") == "input_text":
48
- text += item.get("text", "")
49
- # For standard format
50
- elif item.get("type") == "text":
51
- text += item.get("text", "")
52
- # For assistant messages in Agent Response API format
53
- elif item.get("type") == "output_text":
54
- text += item.get("text", "")
55
- return text
56
-
57
- return ""
58
-
59
-
60
- def sanitize_message(msg: Dict[str, Any]) -> Dict[str, Any]:
61
- """Sanitize a message for logging by removing large image data.
62
-
63
- Args:
64
- msg: Message to sanitize
65
-
66
- Returns:
67
- Sanitized message
68
- """
69
- if not isinstance(msg, dict):
70
- return msg
71
-
72
- sanitized = msg.copy()
73
-
74
- # Handle message content
75
- if isinstance(sanitized.get("content"), list):
76
- sanitized_content = []
77
- for item in sanitized["content"]:
78
- if isinstance(item, dict):
79
- # Handle various image types
80
- if item.get("type") == "image_url" and "image_url" in item:
81
- sanitized_content.append({"type": "image_url", "image_url": "[omitted]"})
82
- elif item.get("type") == "input_image" and "image_url" in item:
83
- sanitized_content.append({"type": "input_image", "image_url": "[omitted]"})
84
- elif item.get("type") == "image" and "source" in item:
85
- sanitized_content.append({"type": "image", "source": "[omitted]"})
86
- else:
87
- sanitized_content.append(item)
88
- else:
89
- sanitized_content.append(item)
90
- sanitized["content"] = sanitized_content
91
-
92
- # Handle computer_call_output
93
- if sanitized.get("type") == "computer_call_output" and "output" in sanitized:
94
- output = sanitized["output"]
95
- if isinstance(output, dict) and "image_url" in output:
96
- sanitized["output"] = {**output, "image_url": "[omitted]"}
97
-
98
- return sanitized
@@ -1 +0,0 @@
1
- """UI-TARS Agent provider package."""
@@ -1,35 +0,0 @@
1
- """Base client implementation for Omni providers."""
2
-
3
- import logging
4
- from typing import Dict, List, Optional, Any, Tuple
5
-
6
- logger = logging.getLogger(__name__)
7
-
8
-
9
- class BaseUITarsClient:
10
- """Base class for provider-specific clients."""
11
-
12
- def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None):
13
- """Initialize base client.
14
-
15
- Args:
16
- api_key: Optional API key
17
- model: Optional model name
18
- """
19
- self.api_key = api_key
20
- self.model = model
21
-
22
- async def run_interleaved(
23
- self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None
24
- ) -> Dict[str, Any]:
25
- """Run interleaved chat completion.
26
-
27
- Args:
28
- messages: List of message dicts
29
- system: System prompt
30
- max_tokens: Optional max tokens override
31
-
32
- Returns:
33
- Response dict
34
- """
35
- raise NotImplementedError