cua-agent 0.3.1__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (111) hide show
  1. agent/__init__.py +15 -51
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +216 -0
  5. agent/agent.py +577 -0
  6. agent/callbacks/__init__.py +17 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/trajectory_saver.py +305 -0
  13. agent/cli.py +290 -0
  14. agent/computer_handler.py +107 -0
  15. agent/decorators.py +90 -0
  16. agent/loops/__init__.py +11 -0
  17. agent/loops/anthropic.py +728 -0
  18. agent/loops/omniparser.py +339 -0
  19. agent/loops/openai.py +95 -0
  20. agent/loops/uitars.py +688 -0
  21. agent/responses.py +207 -0
  22. agent/types.py +79 -0
  23. agent/ui/__init__.py +7 -1
  24. agent/ui/gradio/__init__.py +6 -19
  25. agent/ui/gradio/app.py +80 -1299
  26. agent/ui/gradio/ui_components.py +703 -0
  27. cua_agent-0.4.0b1.dist-info/METADATA +424 -0
  28. cua_agent-0.4.0b1.dist-info/RECORD +30 -0
  29. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +1 -1
  30. agent/core/__init__.py +0 -27
  31. agent/core/agent.py +0 -210
  32. agent/core/base.py +0 -217
  33. agent/core/callbacks.py +0 -200
  34. agent/core/experiment.py +0 -249
  35. agent/core/factory.py +0 -122
  36. agent/core/messages.py +0 -332
  37. agent/core/provider_config.py +0 -21
  38. agent/core/telemetry.py +0 -142
  39. agent/core/tools/__init__.py +0 -21
  40. agent/core/tools/base.py +0 -74
  41. agent/core/tools/bash.py +0 -52
  42. agent/core/tools/collection.py +0 -46
  43. agent/core/tools/computer.py +0 -113
  44. agent/core/tools/edit.py +0 -67
  45. agent/core/tools/manager.py +0 -56
  46. agent/core/tools.py +0 -32
  47. agent/core/types.py +0 -88
  48. agent/core/visualization.py +0 -197
  49. agent/providers/__init__.py +0 -4
  50. agent/providers/anthropic/__init__.py +0 -6
  51. agent/providers/anthropic/api/client.py +0 -360
  52. agent/providers/anthropic/api/logging.py +0 -150
  53. agent/providers/anthropic/api_handler.py +0 -140
  54. agent/providers/anthropic/callbacks/__init__.py +0 -5
  55. agent/providers/anthropic/callbacks/manager.py +0 -65
  56. agent/providers/anthropic/loop.py +0 -568
  57. agent/providers/anthropic/prompts.py +0 -23
  58. agent/providers/anthropic/response_handler.py +0 -226
  59. agent/providers/anthropic/tools/__init__.py +0 -33
  60. agent/providers/anthropic/tools/base.py +0 -88
  61. agent/providers/anthropic/tools/bash.py +0 -66
  62. agent/providers/anthropic/tools/collection.py +0 -34
  63. agent/providers/anthropic/tools/computer.py +0 -396
  64. agent/providers/anthropic/tools/edit.py +0 -326
  65. agent/providers/anthropic/tools/manager.py +0 -54
  66. agent/providers/anthropic/tools/run.py +0 -42
  67. agent/providers/anthropic/types.py +0 -16
  68. agent/providers/anthropic/utils.py +0 -367
  69. agent/providers/omni/__init__.py +0 -8
  70. agent/providers/omni/api_handler.py +0 -42
  71. agent/providers/omni/clients/anthropic.py +0 -103
  72. agent/providers/omni/clients/base.py +0 -35
  73. agent/providers/omni/clients/oaicompat.py +0 -195
  74. agent/providers/omni/clients/ollama.py +0 -122
  75. agent/providers/omni/clients/openai.py +0 -155
  76. agent/providers/omni/clients/utils.py +0 -25
  77. agent/providers/omni/image_utils.py +0 -34
  78. agent/providers/omni/loop.py +0 -990
  79. agent/providers/omni/parser.py +0 -307
  80. agent/providers/omni/prompts.py +0 -64
  81. agent/providers/omni/tools/__init__.py +0 -30
  82. agent/providers/omni/tools/base.py +0 -29
  83. agent/providers/omni/tools/bash.py +0 -74
  84. agent/providers/omni/tools/computer.py +0 -179
  85. agent/providers/omni/tools/manager.py +0 -61
  86. agent/providers/omni/utils.py +0 -236
  87. agent/providers/openai/__init__.py +0 -6
  88. agent/providers/openai/api_handler.py +0 -456
  89. agent/providers/openai/loop.py +0 -472
  90. agent/providers/openai/response_handler.py +0 -205
  91. agent/providers/openai/tools/__init__.py +0 -15
  92. agent/providers/openai/tools/base.py +0 -79
  93. agent/providers/openai/tools/computer.py +0 -326
  94. agent/providers/openai/tools/manager.py +0 -106
  95. agent/providers/openai/types.py +0 -36
  96. agent/providers/openai/utils.py +0 -98
  97. agent/providers/uitars/__init__.py +0 -1
  98. agent/providers/uitars/clients/base.py +0 -35
  99. agent/providers/uitars/clients/mlxvlm.py +0 -263
  100. agent/providers/uitars/clients/oaicompat.py +0 -214
  101. agent/providers/uitars/loop.py +0 -660
  102. agent/providers/uitars/prompts.py +0 -63
  103. agent/providers/uitars/tools/__init__.py +0 -1
  104. agent/providers/uitars/tools/computer.py +0 -283
  105. agent/providers/uitars/tools/manager.py +0 -60
  106. agent/providers/uitars/utils.py +0 -264
  107. agent/telemetry.py +0 -21
  108. agent/ui/__main__.py +0 -15
  109. cua_agent-0.3.1.dist-info/METADATA +0 -295
  110. cua_agent-0.3.1.dist-info/RECORD +0 -87
  111. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
@@ -1,63 +0,0 @@
1
- """Prompts for UI-TARS agent."""
2
-
3
# Extra guidance text telling the model to prefer 'cmd' over 'ctrl' for
# shortcuts when operating on macOS.
MAC_SPECIFIC_NOTES = """
(You are operating on macOS, use 'cmd' instead of 'ctrl' for most shortcuts e.g., hotkey(key='cmd c') for copy, hotkey(key='cmd v') for paste, hotkey(key='cmd t') for new tab).)
"""

# Generic system prompt string.
SYSTEM_PROMPT = "You are a helpful assistant."

# Prompt template for desktop GUI tasks. It describes the expected
# "Thought: / Action:" output format and the desktop action space
# (click, left_double, right_single, drag, hotkey, type, scroll, wait, finished).
# Contains the placeholders {language} and {instruction}.
# NOTE(review): presumably filled in with str.format() by the caller - the
# call site is not visible here; confirm before adding literal braces.
COMPUTER_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
```
Thought: ...
Action: ...
```

## Action Space

click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.


## Note
- Use {language} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
{instruction}
"""

# Prompt template for mobile GUI tasks. Same output format as COMPUTER_USE but
# with a mobile action space (click, long_press, type, scroll, open_app, drag,
# press_home, press_back, finished).
# Contains the placeholders {language} and {instruction}.
# NOTE(review): presumably filled in with str.format() by the caller - confirm.
MOBILE_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format
```
Thought: ...
Action: ...
```
## Action Space

click(start_box='<|box_start|>(x1,y1)<|box_end|>')
long_press(start_box='<|box_start|>(x1,y1)<|box_end|>')
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
open_app(app_name=\'\')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
press_home()
press_back()
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.


## Note
- Use {language} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.

## User Instruction
{instruction}
"""
@@ -1 +0,0 @@
1
- """UI-TARS tools package."""
@@ -1,283 +0,0 @@
1
- """Computer tool for UI-TARS."""
2
-
3
- import asyncio
4
- import base64
5
- import logging
6
- import re
7
- from typing import Any, Dict, List, Optional, Literal, Union
8
-
9
- from computer import Computer
10
- from ....core.tools.base import ToolResult, ToolFailure
11
- from ....core.tools.computer import BaseComputerTool
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
-
16
class ComputerTool(BaseComputerTool):
    """Tool that lets the UI-TARS agent interact with the screen, keyboard, and mouse.

    Every supported action is forwarded to ``self.computer.interface``. All
    actions except ``finished`` reply with a ``ToolResult`` that carries a
    base64-encoded screenshot taken after the action.
    """

    name: str = "computer"
    width: Optional[int] = None
    height: Optional[int] = None
    computer: Computer

    def __init__(self, computer: Computer):
        """Initialize the computer tool.

        Args:
            computer: Computer instance
        """
        super().__init__(computer)
        self.computer = computer
        # Screen dimensions are unknown until initialize_dimensions() runs.
        self.width = None
        self.height = None
        self.logger = logging.getLogger(__name__)

    def to_params(self) -> Dict[str, Any]:
        """Convert tool to API parameters.

        Returns:
            Dictionary with the tool type and the display width/height.

        Raises:
            RuntimeError: If the screen dimensions have not been initialized.
        """
        if self.width is None or self.height is None:
            raise RuntimeError(
                "Screen dimensions not initialized. Call initialize_dimensions() first."
            )
        return {
            "type": "computer",
            "display_width": self.width,
            "display_height": self.height,
        }

    async def initialize_dimensions(self) -> None:
        """Initialize screen dimensions from the computer interface.

        Falls back to 1024x768 (and logs a warning) if the interface call fails.
        """
        try:
            display_size = await self.computer.interface.get_screen_size()
            self.width = display_size["width"]
            self.height = display_size["height"]
            self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")
        except Exception as e:
            # Fall back to defaults if we can't get accurate dimensions
            self.width = 1024
            self.height = 768
            self.logger.warning(
                f"Failed to get screen dimensions, using defaults: {self.width}x{self.height}. Error: {e}"
            )

    async def _result_with_screenshot(self, output: str, delay: float) -> ToolResult:
        """Wait ``delay`` seconds, take a screenshot, and wrap both in a ToolResult."""
        await asyncio.sleep(delay)
        screenshot = await self.computer.interface.screenshot()
        return ToolResult(
            output=output,
            base64_image=base64.b64encode(screenshot).decode("utf-8"),
        )

    async def __call__(
        self,
        *,
        action: str,
        **kwargs,
    ) -> ToolResult:
        """Execute a computer action.

        Args:
            action: The action to perform (based on UI-TARS action space)
            **kwargs: Additional parameters for the action

        Returns:
            ToolResult containing action output and possibly a base64 image,
            or a ToolFailure when parameters are missing, the action is not
            supported, or the underlying interface call raises.
        """
        # UI-TARS pointer action -> (interface method name, verb for the result text).
        # Names are resolved lazily so an unrelated action never touches them.
        pointer_methods = {
            "click": ("left_click", "Clicked"),
            "left_double": ("double_click", "Double-clicked"),
            "right_single": ("right_click", "Right-clicked"),
        }

        try:
            # Ensure dimensions are initialized
            if self.width is None or self.height is None:
                await self.initialize_dimensions()
                if self.width is None or self.height is None:
                    return ToolFailure(error="Failed to initialize screen dimensions")

            interface = self.computer.interface

            # Left click / double click / right click
            if action in pointer_methods:
                if "x" not in kwargs or "y" not in kwargs:
                    return ToolFailure(error=f"Missing coordinates for {action} action")
                method_name, verb = pointer_methods[action]
                x, y = kwargs["x"], kwargs["y"]
                await getattr(interface, method_name)(x, y)
                return await self._result_with_screenshot(f"{verb} at ({x}, {y})", delay=0.5)

            # Typing text
            if action == "type_text":
                if "text" not in kwargs:
                    return ToolFailure(error="Missing text for type action")
                text = kwargs["text"]
                await interface.type_text(text)
                return await self._result_with_screenshot(f"Typed: {text}", delay=0.3)

            # Hotkey (several keys) or single key press
            if action == "hotkey":
                keys = kwargs.get("keys")
                # An empty key list previously surfaced as an IndexError; report it explicitly.
                if not keys:
                    return ToolFailure(error="Missing keys for hotkey action")
                if len(keys) > 1:
                    await interface.hotkey(*keys)
                else:
                    await interface.press_key(keys[0])
                return await self._result_with_screenshot(
                    f"Pressed hotkey: {', '.join(keys)}", delay=0.3
                )

            # Drag from a start point to an end point
            if action == "drag":
                if not all(k in kwargs for k in ("start_x", "start_y", "end_x", "end_y")):
                    return ToolFailure(error="Missing coordinates for drag action")
                start_x, start_y = kwargs["start_x"], kwargs["start_y"]
                end_x, end_y = kwargs["end_x"], kwargs["end_y"]
                await interface.move_cursor(start_x, start_y)
                await interface.drag_to(end_x, end_y)
                return await self._result_with_screenshot(
                    f"Dragged from ({start_x}, {start_y}) to ({end_x}, {end_y})", delay=0.5
                )

            # Scroll at a position
            if action == "scroll":
                if not all(k in kwargs for k in ("x", "y", "direction")):
                    return ToolFailure(error="Missing parameters for scroll action")
                x, y = kwargs["x"], kwargs["y"]
                direction = kwargs["direction"]

                # Move cursor to position before scrolling
                await interface.move_cursor(x, y)

                if direction == "down":
                    await interface.scroll_down(5)
                elif direction == "up":
                    await interface.scroll_up(5)
                elif direction in ("right", "left"):
                    # Horizontal scrolling is not wired up; the action is still
                    # reported as done (unchanged behavior), so leave a trace.
                    self.logger.warning(
                        f"Horizontal scroll '{direction}' is not implemented; no scroll performed"
                    )
                else:
                    return ToolFailure(error=f"Invalid scroll direction: {direction}")

                return await self._result_with_screenshot(
                    f"Scrolled {direction} at ({x}, {y})", delay=0.5
                )

            # Wait: sleep for 5 seconds as specified in the action space
            if action == "wait":
                return await self._result_with_screenshot("Waited for 5 seconds", delay=5)

            # Finished (task completion): no screenshot is attached
            if action == "finished":
                content = kwargs.get("content", "Task completed")
                return ToolResult(
                    output=f"Task finished: {content}",
                )

            return ToolFailure(error=f"Unsupported action: {action}")

        except Exception as e:
            # Tool boundary: report any interface error to the caller as a failure result.
            self.logger.error(f"Error in ComputerTool.__call__: {str(e)}")
            return ToolFailure(error=f"Failed to execute {action}: {str(e)}")
@@ -1,60 +0,0 @@
1
- """Tool manager for the UI-TARS provider."""
2
-
3
- import logging
4
- from typing import Any, Dict, List, Optional
5
-
6
- from computer import Computer
7
- from ....core.tools import BaseToolManager
8
- from ....core.tools.collection import ToolCollection
9
- from .computer import ComputerTool
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
class ToolManager(BaseToolManager):
    """Owns the UI-TARS ``ComputerTool`` and exposes it through the tool collection."""

    def __init__(self, computer: Computer):
        """Create the manager and its computer tool.

        Args:
            computer: Computer instance handed to the base manager and used to
                build the UI-TARS computer tool.
        """
        super().__init__(computer)
        # The single UI-TARS-specific tool managed here
        self.computer_tool = ComputerTool(self.computer)
        self._initialized = False

    def _initialize_tools(self) -> ToolCollection:
        """Build the collection holding the computer tool."""
        return ToolCollection(self.computer_tool)

    async def _initialize_tools_specific(self) -> None:
        """Run the UI-TARS-specific setup: fetch the screen dimensions."""
        await self.computer_tool.initialize_dimensions()

    def get_tool_params(self) -> List[Dict[str, Any]]:
        """Return the tool parameters used for API calls.

        Returns:
            List of tool parameters produced by the tool collection.

        Raises:
            RuntimeError: If the tool collection has not been created yet.
        """
        collection = self.tools
        if collection is None:
            raise RuntimeError("Tools not initialized. Call initialize() first.")
        return collection.to_params()

    async def execute_tool(self, name: str, tool_input: dict[str, Any]) -> Any:
        """Run the named tool with the given input.

        Args:
            name: Name of the tool to execute
            tool_input: Input parameters for the tool

        Returns:
            Whatever the tool collection's ``run`` returns for this call.

        Raises:
            RuntimeError: If the tool collection has not been created yet.
        """
        collection = self.tools
        if collection is None:
            raise RuntimeError("Tools not initialized. Call initialize() first.")
        return await collection.run(name=name, tool_input=tool_input)
@@ -1,264 +0,0 @@
1
- """Utility functions for the UI-TARS provider."""
2
-
3
- import logging
4
- import base64
5
- import re
6
- from typing import Any, Dict, List, Optional, Union, Tuple
7
- from datetime import datetime
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
- from ...core.types import AgentResponse
12
-
13
async def to_agent_response_format(
    response: Dict[str, Any],
    messages: List[Dict[str, Any]],
    model: Optional[str] = None,
) -> AgentResponse:
    """Convert raw UI-TARS response to agent response format.

    The first choice's message content is split into an optional ``Thought``
    (emitted as a ``reasoning`` item) and the parsed actions. A ``finished``
    action becomes an assistant ``message`` item; every other action becomes a
    ``computer_call`` item.

    Args:
        response: Raw UI-TARS response (reads ``choices[0].message.content``,
            ``usage`` and, when ``model`` is not given, ``model``)
        messages: List of messages in standard format (not used by this function)
        model: Optional model name; falls back to ``response["model"]``

    Returns:
        AgentResponse: Standardized agent response format
    """
    # Sample the clock once so the id and created_at describe the same instant.
    now = datetime.now()

    # Create unique IDs for this response
    response_id = f"resp_{now.strftime('%Y%m%d%H%M%S')}_{id(response)}"
    reasoning_id = f"rs_{response_id}"
    action_id = f"cu_{response_id}"

    # Parse actions from the raw response; treat a missing/None content as empty text
    content = response["choices"][0]["message"]["content"] or ""
    actions = parse_actions(content)

    # Extract thought content if available
    reasoning_text = ""
    if "Thought:" in content:
        thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", content, re.DOTALL)
        if thought_match:
            reasoning_text = thought_match.group(1).strip()

    # Create output items
    output_items = []
    if reasoning_text:
        output_items.append({
            "type": "reasoning",
            "id": reasoning_id,
            "text": reasoning_text
        })

    for i, action in enumerate(actions):
        action_name, tool_args = parse_action_parameters(action)
        if action_name == "finished":
            output_items.append({
                "type": "message",
                "role": "assistant",
                "content": [{
                    "type": "output_text",
                    # finished() without content parses to {}; don't raise KeyError
                    "text": tool_args.get("content", "")
                }],
                "id": f"action_{i}_{action_id}",
                "status": "completed"
            })
        else:
            # Drop the redundant "action" key when it only repeats the action name
            if tool_args.get("action") == action_name:
                del tool_args["action"]
            output_items.append({
                "type": "computer_call",
                # NOTE(review): the id embeds the full action string, not just
                # its name - kept as-is; confirm whether that is intended.
                "id": f"{action}_{i}_{action_id}",
                "call_id": f"call_{i}_{action_id}",
                "action": { "type": action_name, **tool_args },
                "pending_safety_checks": [],
                "status": "completed"
            })

    # Create agent response
    agent_response = AgentResponse(
        id=response_id,
        object="response",
        created_at=int(now.timestamp()),
        status="completed",
        error=None,
        incomplete_details=None,
        instructions=None,
        max_output_tokens=None,
        model=model or response["model"],
        output=output_items,
        parallel_tool_calls=True,
        previous_response_id=None,
        reasoning={"effort": "medium"},
        store=True,
        temperature=0.0,
        top_p=0.7,
        text={"format": {"type": "text"}},
        tool_choice="auto",
        tools=[
            {
                "type": "computer_use_preview",
                "display_height": 768,
                "display_width": 1024,
                "environment": "mac",
            }
        ],
        truncation="auto",
        usage=response.get("usage", {}),
        user=None,
        metadata={},
        response=response
    )
    return agent_response
114
-
115
-
116
def add_box_token(input_string: str) -> str:
    """Add box tokens to the coordinates in the model response.

    Every ``start_box='(x,y)'`` / ``end_box='(x,y)'`` found after an
    ``Action: `` marker is rewritten to
    ``...='<|box_start|>(x,y)<|box_end|>'``. Whitespace after the comma is
    accepted and dropped in the rewritten form. Coordinates that already carry
    box tokens are left untouched.

    Args:
        input_string: Raw model response

    Returns:
        String with box tokens added; the input unchanged when it has no
        ``Action: `` marker or no ``start_box=`` parameter.
    """
    if "Action: " not in input_string or "start_box=" not in input_string:
        return input_string

    # Text before the first "Action: " is kept verbatim; the rest are action bodies.
    prefix, *actions = input_string.split("Action: ")

    # Substitute on the regex match itself so "(10, 20)" is wrapped as well;
    # rebuilding the search string without the whitespace used to miss it.
    box_pattern = r"(start_box|end_box)='\((\d+),\s*(\d+)\)'"
    box_template = r"\1='<|box_start|>(\2,\3)<|box_end|>'"
    processed_actions = [
        re.sub(box_pattern, box_template, action.strip()) for action in actions
    ]

    return prefix + "Action: " + "\n\n".join(processed_actions)
145
-
146
-
147
def parse_actions(response: str) -> List[str]:
    """Extract the action strings from a UI-TARS model response.

    Only the text after the last ``Action:`` marker is considered; it is split
    on blank lines and each non-empty piece is returned stripped.

    Args:
        response: The raw model response text

    Returns:
        List of parsed actions (empty when there is no ``Action:`` marker)
    """
    if "Action:" not in response:
        return []

    # Everything following the final "Action:" marker
    tail = response.rsplit("Action:", 1)[1].strip()

    # Multiple actions are separated by blank lines
    return [chunk.strip() for chunk in tail.split("\n\n") if chunk.strip()]
169
-
170
-
171
def _extract_content(text: str) -> Optional[str]:
    """Return the unescaped value of ``content='...'`` in ``text``, or None if absent."""
    # Allow backslash-escaped characters inside the quotes so an escaped quote
    # does not end the value early; fall back to the simple pattern for values
    # the escape-aware one cannot match (e.g. a trailing lone backslash).
    found = re.search(r"content='((?:\\.|[^'\\])*)'", text) or re.search(
        r"content='([^']*)'", text
    )
    if found is None:
        return None
    # Undo the escapes the prompt asks the model to use: quote, double quote, newline.
    return found.group(1).replace("\\'", "'").replace('\\"', '"').replace("\\n", "\n")


def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]:
    """Parse parameters from an action string.

    Supported shapes:
        * ``finished(content='...')`` -> ``("finished", {"content": ...})``,
          or ``("finished", {})`` when no content is given.
        * ``click`` / ``left_double`` / ``right_single`` with ``start_box``
          -> ``x`` and ``y`` from the first start box.
        * ``drag`` with ``start_box`` and ``end_box`` -> ``start_x``,
          ``start_y``, ``end_x``, ``end_y`` (only when all four are present).
        * ``scroll`` with ``start_box`` -> ``x``, ``y`` and, when given,
          ``direction``.
        * ``type(content='...')`` -> ``{"action": "type_text", "text": ...}``.
        * ``hotkey(key='a b')`` -> ``{"action": "hotkey", "keys": ["a", "b"]}``.

    Escaped quotes and newlines inside ``content`` are unescaped.

    Args:
        action: The action string to parse

    Returns:
        Tuple of (action_name, action_parameters); ``("", {})`` when the
        string does not look like ``name(...)``.
    """
    # Handle "finished" action
    if action.startswith("finished"):
        content = _extract_content(action)
        if content is None:
            return "finished", {}
        return "finished", {"content": content}

    # Split "name(params)" into its two parts
    action_match = re.match(r'(\w+)\((.*)\)', action)
    if not action_match:
        logger.warning(f"Could not parse action: {action}")
        return "", {}

    action_name = action_match.group(1)
    action_params_str = action_match.group(2)

    tool_args: Dict[str, Any] = {"action": action_name}

    if "start_box" in action_params_str:
        # Extract all box coordinates, with or without the box tokens
        box_pattern = r"(start_box|end_box)='(?:<\|box_start\|>)?\((\d+),\s*(\d+)\)(?:<\|box_end\|>)?'"
        box_matches = re.findall(box_pattern, action_params_str)

        # Coordinates of the first start_box, if any
        first_start = next(
            ((int(x), int(y)) for box_type, x, y in box_matches if box_type == "start_box"),
            None,
        )

        if action_name in ("click", "left_double", "right_single", "scroll"):
            if first_start is not None:
                tool_args["x"], tool_args["y"] = first_start

            if action_name == "scroll":
                direction_match = re.search(r"direction='([^']+)'", action_params_str)
                if direction_match:
                    tool_args["direction"] = direction_match.group(1)

        elif action_name == "drag":
            # The last box of each kind wins
            boxes = {box_type: (int(x), int(y)) for box_type, x, y in box_matches}
            if "start_box" in boxes and "end_box" in boxes:
                tool_args["start_x"], tool_args["start_y"] = boxes["start_box"]
                tool_args["end_x"], tool_args["end_y"] = boxes["end_box"]

    elif action_name == "type":
        text = _extract_content(action_params_str)
        if text is not None:
            # The tool-level action for typing is named "type_text"
            tool_args = {"action": "type_text", "text": text}

    elif action_name == "hotkey":
        # Keys are given as one whitespace-separated string
        key_match = re.search(r"key='([^']*)'", action_params_str)
        if key_match:
            tool_args = {"action": "hotkey", "keys": key_match.group(1).split()}

    return action_name, tool_args
agent/telemetry.py DELETED
@@ -1,21 +0,0 @@
1
- """Telemetry support for Agent class."""
2
-
3
- import os
4
- import platform
5
- import sys
6
- import time
7
- from typing import Any, Dict, Optional
8
-
9
- from core.telemetry import (
10
- record_event,
11
- is_telemetry_enabled,
12
- flush,
13
- get_telemetry_client,
14
- increment,
15
- )
16
-
17
# System information used for telemetry.
# Evaluated once at import time: "os" is the sys.platform identifier and
# "python_version" is the interpreter version string from the platform module.
SYSTEM_INFO = {
    "os": sys.platform,
    "python_version": platform.python_version(),
}