cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (111) hide show
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +216 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b1.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b1.dist-info/RECORD +30 -0
  29. agent/core/__init__.py +0 -27
  30. agent/core/agent.py +0 -210
  31. agent/core/base.py +0 -217
  32. agent/core/callbacks.py +0 -200
  33. agent/core/experiment.py +0 -249
  34. agent/core/factory.py +0 -122
  35. agent/core/messages.py +0 -332
  36. agent/core/provider_config.py +0 -21
  37. agent/core/telemetry.py +0 -142
  38. agent/core/tools/__init__.py +0 -21
  39. agent/core/tools/base.py +0 -74
  40. agent/core/tools/bash.py +0 -52
  41. agent/core/tools/collection.py +0 -46
  42. agent/core/tools/computer.py +0 -113
  43. agent/core/tools/edit.py +0 -67
  44. agent/core/tools/manager.py +0 -56
  45. agent/core/tools.py +0 -32
  46. agent/core/types.py +0 -88
  47. agent/core/visualization.py +0 -197
  48. agent/providers/__init__.py +0 -4
  49. agent/providers/anthropic/__init__.py +0 -6
  50. agent/providers/anthropic/api/client.py +0 -360
  51. agent/providers/anthropic/api/logging.py +0 -150
  52. agent/providers/anthropic/api_handler.py +0 -140
  53. agent/providers/anthropic/callbacks/__init__.py +0 -5
  54. agent/providers/anthropic/callbacks/manager.py +0 -65
  55. agent/providers/anthropic/loop.py +0 -568
  56. agent/providers/anthropic/prompts.py +0 -23
  57. agent/providers/anthropic/response_handler.py +0 -226
  58. agent/providers/anthropic/tools/__init__.py +0 -33
  59. agent/providers/anthropic/tools/base.py +0 -88
  60. agent/providers/anthropic/tools/bash.py +0 -66
  61. agent/providers/anthropic/tools/collection.py +0 -34
  62. agent/providers/anthropic/tools/computer.py +0 -396
  63. agent/providers/anthropic/tools/edit.py +0 -326
  64. agent/providers/anthropic/tools/manager.py +0 -54
  65. agent/providers/anthropic/tools/run.py +0 -42
  66. agent/providers/anthropic/types.py +0 -16
  67. agent/providers/anthropic/utils.py +0 -381
  68. agent/providers/omni/__init__.py +0 -8
  69. agent/providers/omni/api_handler.py +0 -42
  70. agent/providers/omni/clients/anthropic.py +0 -103
  71. agent/providers/omni/clients/base.py +0 -35
  72. agent/providers/omni/clients/oaicompat.py +0 -195
  73. agent/providers/omni/clients/ollama.py +0 -122
  74. agent/providers/omni/clients/openai.py +0 -155
  75. agent/providers/omni/clients/utils.py +0 -25
  76. agent/providers/omni/image_utils.py +0 -34
  77. agent/providers/omni/loop.py +0 -990
  78. agent/providers/omni/parser.py +0 -307
  79. agent/providers/omni/prompts.py +0 -64
  80. agent/providers/omni/tools/__init__.py +0 -30
  81. agent/providers/omni/tools/base.py +0 -29
  82. agent/providers/omni/tools/bash.py +0 -74
  83. agent/providers/omni/tools/computer.py +0 -179
  84. agent/providers/omni/tools/manager.py +0 -61
  85. agent/providers/omni/utils.py +0 -236
  86. agent/providers/openai/__init__.py +0 -6
  87. agent/providers/openai/api_handler.py +0 -456
  88. agent/providers/openai/loop.py +0 -472
  89. agent/providers/openai/response_handler.py +0 -205
  90. agent/providers/openai/tools/__init__.py +0 -15
  91. agent/providers/openai/tools/base.py +0 -79
  92. agent/providers/openai/tools/computer.py +0 -326
  93. agent/providers/openai/tools/manager.py +0 -106
  94. agent/providers/openai/types.py +0 -36
  95. agent/providers/openai/utils.py +0 -98
  96. agent/providers/uitars/__init__.py +0 -1
  97. agent/providers/uitars/clients/base.py +0 -35
  98. agent/providers/uitars/clients/mlxvlm.py +0 -263
  99. agent/providers/uitars/clients/oaicompat.py +0 -214
  100. agent/providers/uitars/loop.py +0 -660
  101. agent/providers/uitars/prompts.py +0 -63
  102. agent/providers/uitars/tools/__init__.py +0 -1
  103. agent/providers/uitars/tools/computer.py +0 -283
  104. agent/providers/uitars/tools/manager.py +0 -60
  105. agent/providers/uitars/utils.py +0 -264
  106. agent/telemetry.py +0 -21
  107. agent/ui/__main__.py +0 -15
  108. cua_agent-0.3.2.dist-info/METADATA +0 -295
  109. cua_agent-0.3.2.dist-info/RECORD +0 -87
  110. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +0 -0
  111. {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
@@ -1,63 +0,0 @@
1
- """Prompts for UI-TARS agent."""
2
-
3
- MAC_SPECIFIC_NOTES = """
4
- (You are operating on macOS, use 'cmd' instead of 'ctrl' for most shortcuts e.g., hotkey(key='cmd c') for copy, hotkey(key='cmd v') for paste, hotkey(key='cmd t') for new tab).)
5
- """
6
-
7
- SYSTEM_PROMPT = "You are a helpful assistant."
8
-
9
- COMPUTER_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
10
-
11
- ## Output Format
12
- ```
13
- Thought: ...
14
- Action: ...
15
- ```
16
-
17
- ## Action Space
18
-
19
- click(start_box='<|box_start|>(x1,y1)<|box_end|>')
20
- left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
21
- right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
22
- drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
23
- hotkey(key='')
24
- type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
25
- scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
26
- wait() #Sleep for 5s and take a screenshot to check for any changes.
27
- finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
28
-
29
-
30
- ## Note
31
- - Use {language} in `Thought` part.
32
- - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
33
-
34
- ## User Instruction
35
- {instruction}
36
- """
37
-
38
- MOBILE_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
39
- ## Output Format
40
- ```
41
- Thought: ...
42
- Action: ...
43
- ```
44
- ## Action Space
45
-
46
- click(start_box='<|box_start|>(x1,y1)<|box_end|>')
47
- long_press(start_box='<|box_start|>(x1,y1)<|box_end|>')
48
- type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
49
- scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
50
- open_app(app_name=\'\')
51
- drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
52
- press_home()
53
- press_back()
54
- finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
55
-
56
-
57
- ## Note
58
- - Use {language} in `Thought` part.
59
- - Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
60
-
61
- ## User Instruction
62
- {instruction}
63
- """
@@ -1 +0,0 @@
1
- """UI-TARS tools package."""
@@ -1,283 +0,0 @@
1
- """Computer tool for UI-TARS."""
2
-
3
- import asyncio
4
- import base64
5
- import logging
6
- import re
7
- from typing import Any, Dict, List, Optional, Literal, Union
8
-
9
- from computer import Computer
10
- from ....core.tools.base import ToolResult, ToolFailure
11
- from ....core.tools.computer import BaseComputerTool
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
-
16
- class ComputerTool(BaseComputerTool):
17
- """
18
- A tool that allows the UI-TARS agent to interact with the screen, keyboard, and mouse.
19
- """
20
-
21
- name: str = "computer"
22
- width: Optional[int] = None
23
- height: Optional[int] = None
24
- computer: Computer
25
-
26
- def __init__(self, computer: Computer):
27
- """Initialize the computer tool.
28
-
29
- Args:
30
- computer: Computer instance
31
- """
32
- super().__init__(computer)
33
- self.computer = computer
34
- self.width = None
35
- self.height = None
36
- self.logger = logging.getLogger(__name__)
37
-
38
- def to_params(self) -> Dict[str, Any]:
39
- """Convert tool to API parameters.
40
-
41
- Returns:
42
- Dictionary with tool parameters
43
- """
44
- if self.width is None or self.height is None:
45
- raise RuntimeError(
46
- "Screen dimensions not initialized. Call initialize_dimensions() first."
47
- )
48
- return {
49
- "type": "computer",
50
- "display_width": self.width,
51
- "display_height": self.height,
52
- }
53
-
54
- async def initialize_dimensions(self) -> None:
55
- """Initialize screen dimensions from the computer interface."""
56
- try:
57
- display_size = await self.computer.interface.get_screen_size()
58
- self.width = display_size["width"]
59
- self.height = display_size["height"]
60
- self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")
61
- except Exception as e:
62
- # Fall back to defaults if we can't get accurate dimensions
63
- self.width = 1024
64
- self.height = 768
65
- self.logger.warning(
66
- f"Failed to get screen dimensions, using defaults: {self.width}x{self.height}. Error: {e}"
67
- )
68
-
69
- async def __call__(
70
- self,
71
- *,
72
- action: str,
73
- **kwargs,
74
- ) -> ToolResult:
75
- """Execute a computer action.
76
-
77
- Args:
78
- action: The action to perform (based on UI-TARS action space)
79
- **kwargs: Additional parameters for the action
80
-
81
- Returns:
82
- ToolResult containing action output and possibly a base64 image
83
- """
84
- try:
85
- # Ensure dimensions are initialized
86
- if self.width is None or self.height is None:
87
- await self.initialize_dimensions()
88
- if self.width is None or self.height is None:
89
- return ToolFailure(error="Failed to initialize screen dimensions")
90
-
91
- # Handle actions defined in UI-TARS action space (from prompts.py)
92
- # Handle standard click (left click)
93
- if action == "click":
94
- if "x" in kwargs and "y" in kwargs:
95
- x, y = kwargs["x"], kwargs["y"]
96
- await self.computer.interface.left_click(x, y)
97
-
98
- # Wait briefly for UI to update
99
- await asyncio.sleep(0.5)
100
-
101
- # Take screenshot after action
102
- screenshot = await self.computer.interface.screenshot()
103
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
104
-
105
- return ToolResult(
106
- output=f"Clicked at ({x}, {y})",
107
- base64_image=base64_screenshot,
108
- )
109
- else:
110
- return ToolFailure(error="Missing coordinates for click action")
111
-
112
- # Handle double click
113
- elif action == "left_double":
114
- if "x" in kwargs and "y" in kwargs:
115
- x, y = kwargs["x"], kwargs["y"]
116
- await self.computer.interface.double_click(x, y)
117
-
118
- # Wait briefly for UI to update
119
- await asyncio.sleep(0.5)
120
-
121
- # Take screenshot after action
122
- screenshot = await self.computer.interface.screenshot()
123
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
124
-
125
- return ToolResult(
126
- output=f"Double-clicked at ({x}, {y})",
127
- base64_image=base64_screenshot,
128
- )
129
- else:
130
- return ToolFailure(error="Missing coordinates for left_double action")
131
-
132
- # Handle right click
133
- elif action == "right_single":
134
- if "x" in kwargs and "y" in kwargs:
135
- x, y = kwargs["x"], kwargs["y"]
136
- await self.computer.interface.right_click(x, y)
137
-
138
- # Wait briefly for UI to update
139
- await asyncio.sleep(0.5)
140
-
141
- # Take screenshot after action
142
- screenshot = await self.computer.interface.screenshot()
143
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
144
-
145
- return ToolResult(
146
- output=f"Right-clicked at ({x}, {y})",
147
- base64_image=base64_screenshot,
148
- )
149
- else:
150
- return ToolFailure(error="Missing coordinates for right_single action")
151
-
152
- # Handle typing text
153
- elif action == "type_text":
154
- if "text" in kwargs:
155
- text = kwargs["text"]
156
- await self.computer.interface.type_text(text)
157
-
158
- # Wait for UI to update
159
- await asyncio.sleep(0.3)
160
-
161
- # Take screenshot after action
162
- screenshot = await self.computer.interface.screenshot()
163
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
164
-
165
- return ToolResult(
166
- output=f"Typed: {text}",
167
- base64_image=base64_screenshot,
168
- )
169
- else:
170
- return ToolFailure(error="Missing text for type action")
171
-
172
- # Handle hotkey
173
- elif action == "hotkey":
174
- if "keys" in kwargs:
175
- keys = kwargs["keys"]
176
-
177
- if len(keys) > 1:
178
- await self.computer.interface.hotkey(*keys)
179
- else:
180
- # Single key press
181
- await self.computer.interface.press_key(keys[0])
182
-
183
- # Wait for UI to update
184
- await asyncio.sleep(0.3)
185
-
186
- # Take screenshot after action
187
- screenshot = await self.computer.interface.screenshot()
188
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
189
-
190
- return ToolResult(
191
- output=f"Pressed hotkey: {', '.join(keys)}",
192
- base64_image=base64_screenshot,
193
- )
194
- else:
195
- return ToolFailure(error="Missing keys for hotkey action")
196
-
197
- # Handle drag action
198
- elif action == "drag":
199
- if all(k in kwargs for k in ["start_x", "start_y", "end_x", "end_y"]):
200
- start_x, start_y = kwargs["start_x"], kwargs["start_y"]
201
- end_x, end_y = kwargs["end_x"], kwargs["end_y"]
202
-
203
- # Perform drag
204
- await self.computer.interface.move_cursor(start_x, start_y)
205
- await self.computer.interface.drag_to(end_x, end_y)
206
-
207
- # Wait for UI to update
208
- await asyncio.sleep(0.5)
209
-
210
- # Take screenshot after action
211
- screenshot = await self.computer.interface.screenshot()
212
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
213
-
214
- return ToolResult(
215
- output=f"Dragged from ({start_x}, {start_y}) to ({end_x}, {end_y})",
216
- base64_image=base64_screenshot,
217
- )
218
- else:
219
- return ToolFailure(error="Missing coordinates for drag action")
220
-
221
- # Handle scroll action
222
- elif action == "scroll":
223
- if all(k in kwargs for k in ["x", "y", "direction"]):
224
- x, y = kwargs["x"], kwargs["y"]
225
- direction = kwargs["direction"]
226
-
227
- # Move cursor to position
228
- await self.computer.interface.move_cursor(x, y)
229
-
230
- # Scroll based on direction
231
- if direction == "down":
232
- await self.computer.interface.scroll_down(5)
233
- elif direction == "up":
234
- await self.computer.interface.scroll_up(5)
235
- elif direction == "right":
236
- pass # await self.computer.interface.scroll_right(5)
237
- elif direction == "left":
238
- pass # await self.computer.interface.scroll_left(5)
239
- else:
240
- return ToolFailure(error=f"Invalid scroll direction: {direction}")
241
-
242
- # Wait for UI to update
243
- await asyncio.sleep(0.5)
244
-
245
- # Take screenshot after action
246
- screenshot = await self.computer.interface.screenshot()
247
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
248
-
249
- return ToolResult(
250
- output=f"Scrolled {direction} at ({x}, {y})",
251
- base64_image=base64_screenshot,
252
- )
253
- else:
254
- return ToolFailure(error="Missing parameters for scroll action")
255
-
256
- # Handle wait action
257
- elif action == "wait":
258
- # Sleep for 5 seconds as specified in the action space
259
- await asyncio.sleep(5)
260
-
261
- # Take screenshot after waiting
262
- screenshot = await self.computer.interface.screenshot()
263
- base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
264
-
265
- return ToolResult(
266
- output="Waited for 5 seconds",
267
- base64_image=base64_screenshot,
268
- )
269
-
270
- # Handle finished action (task completion)
271
- elif action == "finished":
272
- content = kwargs.get("content", "Task completed")
273
- return ToolResult(
274
- output=f"Task finished: {content}",
275
- )
276
-
277
- return await self._handle_scroll(action)
278
- else:
279
- return ToolFailure(error=f"Unsupported action: {action}")
280
-
281
- except Exception as e:
282
- self.logger.error(f"Error in ComputerTool.__call__: {str(e)}")
283
- return ToolFailure(error=f"Failed to execute {action}: {str(e)}")
@@ -1,60 +0,0 @@
1
- """Tool manager for the UI-TARS provider."""
2
-
3
- import logging
4
- from typing import Any, Dict, List, Optional
5
-
6
- from computer import Computer
7
- from ....core.tools import BaseToolManager
8
- from ....core.tools.collection import ToolCollection
9
- from .computer import ComputerTool
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
- class ToolManager(BaseToolManager):
15
- """Manages UI-TARS provider tool initialization and execution."""
16
-
17
- def __init__(self, computer: Computer):
18
- """Initialize the tool manager.
19
-
20
- Args:
21
- computer: Computer instance for computer-related tools
22
- """
23
- super().__init__(computer)
24
- # Initialize UI-TARS-specific tools
25
- self.computer_tool = ComputerTool(self.computer)
26
- self._initialized = False
27
-
28
- def _initialize_tools(self) -> ToolCollection:
29
- """Initialize all available tools."""
30
- return ToolCollection(self.computer_tool)
31
-
32
- async def _initialize_tools_specific(self) -> None:
33
- """Initialize UI-TARS provider-specific tool requirements."""
34
- await self.computer_tool.initialize_dimensions()
35
-
36
- def get_tool_params(self) -> List[Dict[str, Any]]:
37
- """Get tool parameters for API calls.
38
-
39
- Returns:
40
- List of tool parameters for the current provider's API
41
- """
42
- if self.tools is None:
43
- raise RuntimeError("Tools not initialized. Call initialize() first.")
44
-
45
- return self.tools.to_params()
46
-
47
- async def execute_tool(self, name: str, tool_input: dict[str, Any]) -> Any:
48
- """Execute a tool with the given input.
49
-
50
- Args:
51
- name: Name of the tool to execute
52
- tool_input: Input parameters for the tool
53
-
54
- Returns:
55
- Result of the tool execution
56
- """
57
- if self.tools is None:
58
- raise RuntimeError("Tools not initialized. Call initialize() first.")
59
-
60
- return await self.tools.run(name=name, tool_input=tool_input)
@@ -1,264 +0,0 @@
1
- """Utility functions for the UI-TARS provider."""
2
-
3
- import logging
4
- import base64
5
- import re
6
- from typing import Any, Dict, List, Optional, Union, Tuple
7
- from datetime import datetime
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
- from ...core.types import AgentResponse
12
-
13
- async def to_agent_response_format(
14
- response: Dict[str, Any],
15
- messages: List[Dict[str, Any]],
16
- model: Optional[str] = None,
17
- ) -> AgentResponse:
18
- """Convert raw UI-TARS response to agent response format.
19
-
20
- Args:
21
- response: Raw UI-TARS response
22
- messages: List of messages in standard format
23
- model: Optional model name
24
-
25
- Returns:
26
- AgentResponse: Standardized agent response format
27
- """
28
- # Create unique IDs for this response
29
- response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
30
- reasoning_id = f"rs_{response_id}"
31
- action_id = f"cu_{response_id}"
32
- call_id = f"call_{response_id}"
33
-
34
- # Parse actions from the raw response
35
- content = response["choices"][0]["message"]["content"]
36
- actions = parse_actions(content)
37
-
38
- # Extract thought content if available
39
- reasoning_text = ""
40
- if "Thought:" in content:
41
- thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", content, re.DOTALL)
42
- if thought_match:
43
- reasoning_text = thought_match.group(1).strip()
44
-
45
- # Create output items
46
- output_items = []
47
- if reasoning_text:
48
- output_items.append({
49
- "type": "reasoning",
50
- "id": reasoning_id,
51
- "text": reasoning_text
52
- })
53
- if actions:
54
- for i, action in enumerate(actions):
55
- action_name, tool_args = parse_action_parameters(action)
56
- if action_name == "finished":
57
- output_items.append({
58
- "type": "message",
59
- "role": "assistant",
60
- "content": [{
61
- "type": "output_text",
62
- "text": tool_args["content"]
63
- }],
64
- "id": f"action_{i}_{action_id}",
65
- "status": "completed"
66
- })
67
- else:
68
- if tool_args.get("action") == action_name:
69
- del tool_args["action"]
70
- output_items.append({
71
- "type": "computer_call",
72
- "id": f"{action}_{i}_{action_id}",
73
- "call_id": f"call_{i}_{action_id}",
74
- "action": { "type": action_name, **tool_args },
75
- "pending_safety_checks": [],
76
- "status": "completed"
77
- })
78
-
79
- # Create agent response
80
- agent_response = AgentResponse(
81
- id=response_id,
82
- object="response",
83
- created_at=int(datetime.now().timestamp()),
84
- status="completed",
85
- error=None,
86
- incomplete_details=None,
87
- instructions=None,
88
- max_output_tokens=None,
89
- model=model or response["model"],
90
- output=output_items,
91
- parallel_tool_calls=True,
92
- previous_response_id=None,
93
- reasoning={"effort": "medium"},
94
- store=True,
95
- temperature=0.0,
96
- top_p=0.7,
97
- text={"format": {"type": "text"}},
98
- tool_choice="auto",
99
- tools=[
100
- {
101
- "type": "computer_use_preview",
102
- "display_height": 768,
103
- "display_width": 1024,
104
- "environment": "mac",
105
- }
106
- ],
107
- truncation="auto",
108
- usage=response.get("usage", {}),
109
- user=None,
110
- metadata={},
111
- response=response
112
- )
113
- return agent_response
114
-
115
-
116
- def add_box_token(input_string: str) -> str:
117
- """Add box tokens to the coordinates in the model response.
118
-
119
- Args:
120
- input_string: Raw model response
121
-
122
- Returns:
123
- String with box tokens added
124
- """
125
- if "Action: " not in input_string or "start_box=" not in input_string:
126
- return input_string
127
-
128
- suffix = input_string.split("Action: ")[0] + "Action: "
129
- actions = input_string.split("Action: ")[1:]
130
- processed_actions = []
131
-
132
- for action in actions:
133
- action = action.strip()
134
- coordinates = re.findall(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'", action)
135
-
136
- updated_action = action
137
- for coord_type, x, y in coordinates:
138
- updated_action = updated_action.replace(
139
- f"{coord_type}='({x},{y})'",
140
- f"{coord_type}='<|box_start|>({x},{y})<|box_end|>'"
141
- )
142
- processed_actions.append(updated_action)
143
-
144
- return suffix + "\n\n".join(processed_actions)
145
-
146
-
147
- def parse_actions(response: str) -> List[str]:
148
- """Parse actions from UI-TARS model response.
149
-
150
- Args:
151
- response: The raw model response text
152
-
153
- Returns:
154
- List of parsed actions
155
- """
156
- actions = []
157
- # Extract the Action part from the response
158
- if "Action:" in response:
159
- action_text = response.split("Action:")[-1].strip()
160
- # Clean up and format action
161
- if action_text:
162
- # Handle multiple actions separated by newlines
163
- action_parts = action_text.split("\n\n")
164
- for part in action_parts:
165
- if part.strip():
166
- actions.append(part.strip())
167
-
168
- return actions
169
-
170
-
171
- def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]:
172
- """Parse parameters from an action string.
173
-
174
- Args:
175
- action: The action string to parse
176
-
177
- Returns:
178
- Tuple of (action_name, action_parameters)
179
- """
180
- # Handle "finished" action
181
- if action.startswith("finished"):
182
- # Parse content if it exists
183
- content_match = re.search(r"content='([^']*)'", action)
184
- if content_match:
185
- content = content_match.group(1)
186
- return "finished", {"content": content}
187
- else:
188
- return "finished", {}
189
-
190
- # Parse action parameters
191
- action_match = re.match(r'(\w+)\((.*)\)', action)
192
- if not action_match:
193
- logger.warning(f"Could not parse action: {action}")
194
- return "", {}
195
-
196
- action_name = action_match.group(1)
197
- action_params_str = action_match.group(2)
198
-
199
- tool_args = {"action": action_name}
200
-
201
- # Extract coordinate values from the action
202
- if "start_box" in action_params_str:
203
- # Extract all box coordinates
204
- box_pattern = r"(start_box|end_box)='(?:<\|box_start\|>)?\((\d+),\s*(\d+)\)(?:<\|box_end\|>)?'"
205
- box_matches = re.findall(box_pattern, action_params_str)
206
-
207
- # Handle click-type actions
208
- if action_name in ["click", "left_double", "right_single"]:
209
- # Get coordinates from start_box
210
- for box_type, x, y in box_matches:
211
- if box_type == "start_box":
212
- tool_args["x"] = int(x)
213
- tool_args["y"] = int(y)
214
- break
215
-
216
- # Handle drag action
217
- elif action_name == "drag":
218
- start_x, start_y = None, None
219
- end_x, end_y = None, None
220
-
221
- for box_type, x, y in box_matches:
222
- if box_type == "start_box":
223
- start_x, start_y = int(x), int(y)
224
- elif box_type == "end_box":
225
- end_x, end_y = int(x), int(y)
226
-
227
- if not None in [start_x, start_y, end_x, end_y]:
228
- tool_args["start_x"] = start_x
229
- tool_args["start_y"] = start_y
230
- tool_args["end_x"] = end_x
231
- tool_args["end_y"] = end_y
232
-
233
- # Handle scroll action
234
- elif action_name == "scroll":
235
- # Get coordinates from start_box
236
- for box_type, x, y in box_matches:
237
- if box_type == "start_box":
238
- tool_args["x"] = int(x)
239
- tool_args["y"] = int(y)
240
- break
241
-
242
- # Extract direction
243
- direction_match = re.search(r"direction='([^']+)'", action_params_str)
244
- if direction_match:
245
- tool_args["direction"] = direction_match.group(1)
246
-
247
- # Handle typing text
248
- elif action_name == "type":
249
- # Extract text content
250
- content_match = re.search(r"content='([^']*)'", action_params_str)
251
- if content_match:
252
- # Unescape escaped characters
253
- text = content_match.group(1).replace("\\'", "'").replace('\\"', '"').replace("\\n", "\n")
254
- tool_args = {"action": "type_text", "text": text}
255
-
256
- # Handle hotkey
257
- elif action_name == "hotkey":
258
- # Extract key combination
259
- key_match = re.search(r"key='([^']*)'", action_params_str)
260
- if key_match:
261
- keys = key_match.group(1).split()
262
- tool_args = {"action": "hotkey", "keys": keys}
263
-
264
- return action_name, tool_args
agent/telemetry.py DELETED
@@ -1,21 +0,0 @@
1
- """Telemetry support for Agent class."""
2
-
3
- import os
4
- import platform
5
- import sys
6
- import time
7
- from typing import Any, Dict, Optional
8
-
9
- from core.telemetry import (
10
- record_event,
11
- is_telemetry_enabled,
12
- flush,
13
- get_telemetry_client,
14
- increment,
15
- )
16
-
17
- # System information used for telemetry
18
- SYSTEM_INFO = {
19
- "os": sys.platform,
20
- "python_version": platform.python_version(),
21
- }