cua-agent 0.1.30.tar.gz → 0.1.32.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of cua-agent might be problematic.
- {cua_agent-0.1.30 → cua_agent-0.1.32}/PKG-INFO +11 -4
- {cua_agent-0.1.30 → cua_agent-0.1.32}/README.md +10 -3
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/loop.py +2 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/computer.py +11 -9
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/oaicompat.py +12 -2
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/loop.py +2 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/loop.py +4 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/computer.py +44 -7
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/clients/oaicompat.py +24 -16
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/loop.py +18 -39
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/prompts.py +5 -1
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/tools/computer.py +6 -2
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/utils.py +112 -1
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/ui/gradio/app.py +58 -127
- {cua_agent-0.1.30 → cua_agent-0.1.32}/pyproject.toml +3 -3
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/agent.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/base.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/callbacks.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/experiment.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/factory.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/messages.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/provider_config.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/telemetry.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/base.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/bash.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/collection.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/computer.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/edit.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools/manager.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/tools.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/types.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/core/visualization.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/api/client.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/api/logging.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/api_handler.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/callbacks/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/callbacks/manager.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/prompts.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/response_handler.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/base.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/bash.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/collection.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/edit.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/manager.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/run.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/types.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/utils.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/api_handler.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/anthropic.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/base.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/ollama.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/openai.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/utils.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/image_utils.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/parser.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/prompts.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/base.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/bash.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/computer.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/tools/manager.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/utils.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/api_handler.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/response_handler.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/base.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/manager.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/types.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/utils.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/clients/base.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/tools/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/tools/manager.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/telemetry.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/ui/__init__.py +0 -0
- {cua_agent-0.1.30 → cua_agent-0.1.32}/agent/ui/gradio/__init__.py +0 -0
{cua_agent-0.1.30 → cua_agent-0.1.32}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.1.30
+Version: 0.1.32
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.10
@@ -101,6 +101,7 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+pip install "cua-agent[uitars]" # UI-Tars support
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
@@ -119,10 +120,10 @@ async with Computer() as macos_computer:
         # model=LLM(provider=LLMProvider.ANTHROPIC)
         # or
         # loop=AgentLoop.OMNI,
-        # model=LLM(provider=LLMProvider.OLLAMA,
+        # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
         # or
         # loop=AgentLoop.UITARS,
-        # model=LLM(provider=LLMProvider.OAICOMPAT,
+        # model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
     )
 
     tasks = [
@@ -148,7 +149,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use
 
 ## Using the Gradio UI
 
-The agent includes a Gradio-based user interface for
+The agent includes a Gradio-based user interface for easier interaction.
+
+<div align="center">
+  <img src="../../img/agent_gradio_ui.png"/>
+</div>
+
+To use it:
 
 ```bash
 # Install with Gradio support

{cua_agent-0.1.30 → cua_agent-0.1.32}/README.md

@@ -31,6 +31,7 @@ pip install "cua-agent[all]"
 # or install specific loop providers
 pip install "cua-agent[openai]" # OpenAI Cua Loop
 pip install "cua-agent[anthropic]" # Anthropic Cua Loop
+pip install "cua-agent[uitars]" # UI-Tars support
 pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models)
 pip install "cua-agent[ui]" # Gradio UI for the agent
 ```
@@ -49,10 +50,10 @@ async with Computer() as macos_computer:
         # model=LLM(provider=LLMProvider.ANTHROPIC)
         # or
         # loop=AgentLoop.OMNI,
-        # model=LLM(provider=LLMProvider.OLLAMA,
+        # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
         # or
         # loop=AgentLoop.UITARS,
-        # model=LLM(provider=LLMProvider.OAICOMPAT,
+        # model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
     )
 
     tasks = [
@@ -78,7 +79,13 @@ Refer to these notebooks for step-by-step guides on how to use the Computer-Use
 
 ## Using the Gradio UI
 
-The agent includes a Gradio-based user interface for
+The agent includes a Gradio-based user interface for easier interaction.
+
+<div align="center">
+  <img src="../../img/agent_gradio_ui.png"/>
+</div>
+
+To use it:
 
 ```bash
 # Install with Gradio support
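Taken together, the README fragments above add up to the following end-to-end sketch. This is a minimal sketch, not package documentation: the ComputerAgent entry point, its import path, and the agent.run signature are assumptions inferred from the fragments and from the Gradio changes later in this diff, and the endpoint host is a placeholder because the diff masks the real one.

    import asyncio
    from computer import Computer
    # ComputerAgent and this import path are assumptions; they are not shown in the diff.
    from agent import ComputerAgent, AgentLoop, LLM, LLMProvider

    async def main() -> None:
        async with Computer() as macos_computer:
            agent = ComputerAgent(
                computer=macos_computer,
                loop=AgentLoop.UITARS,
                model=LLM(
                    provider=LLMProvider.OAICOMPAT,
                    name="ByteDance-Seed/UI-TARS-1.5-7B",
                    # placeholder host; the diff masks the real endpoint
                    provider_base_url="https://<your-endpoint>.us-east-1.aws.endpoints.huggingface.cloud/v1",
                ),
            )
            # agent.run streams standardized responses, as the Gradio hunk below consumes them
            async for result in agent.run("Open a browser and search for trycua"):
                print(result)

    asyncio.run(main())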

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/loop.py

@@ -279,6 +279,8 @@ class AnthropicLoop(BaseLoop):
                 messages,
                 model=self.model,
             )
+            # Log standardized response for ease of parsing
+            self._log_api_call("agent_response", request=None, response=openai_compatible_response)
             await queue.put(openai_compatible_response)
 
             if not should_continue:

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/anthropic/tools/computer.py

@@ -161,15 +161,17 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
             self.logger.info(f"Moving cursor to ({x}, {y})")
             await self.computer.interface.move_cursor(x, y)
         elif action == "left_click_drag":
-
-
-
-
-
-
-
-
-
+            # Get the start coordinate from kwargs
+            start_coordinate = kwargs.get("start_coordinate")
+            if not start_coordinate:
+                raise ToolError("start_coordinate is required for left_click_drag action")
+
+            start_x, start_y = start_coordinate
+            end_x, end_y = x, y
+
+            self.logger.info(f"Dragging from ({start_x}, {start_y}) to ({end_x}, {end_y})")
+            await self.computer.interface.move_cursor(start_x, start_y)
+            await self.computer.interface.drag_to(end_x, end_y)
 
             # Wait briefly for any UI changes
             await asyncio.sleep(0.5)
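For reference, a hypothetical set of arguments for the updated handler might look like the dict below. Only start_coordinate is confirmed by the hunk; the destination x/y are already in scope in the handler, and the "coordinate" name is an assumption borrowed from Anthropic's computer-use tool schema.

    # Hypothetical arguments; "coordinate" is assumed, "start_coordinate" is confirmed.
    drag_args = {
        "action": "left_click_drag",
        "start_coordinate": (200, 300),  # required, else ToolError is raised
        "coordinate": (640, 400),        # destination, resolved to x/y by the dispatcher
    }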

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/clients/oaicompat.py

@@ -93,7 +93,14 @@ class OAICompatClient(BaseOmniClient):
         """
         headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
 
-        final_messages = [
+        final_messages = [
+            {
+                "role": "system",
+                "content": [
+                    { "type": "text", "text": system }
+                ]
+            }
+        ]
 
         # Process messages
         for item in messages:
@@ -117,7 +124,10 @@ class OAICompatClient(BaseOmniClient):
                 else:
                     message = {
                         "role": item["role"],
-                        "content": [{
+                        "content": [{
+                            "type": "text",
+                            "text": item["content"]
+                        }],
                     }
                     final_messages.append(message)
             else:

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/omni/loop.py

@@ -670,6 +670,8 @@ class OmniLoop(BaseLoop):
                     parsed_screen=parsed_screen,
                     parser=self.parser
                 )
+                # Log standardized response for ease of parsing
+                self._log_api_call("agent_response", request=None, response=openai_compatible_response)
 
                 # Yield the response to the caller
                 yield openai_compatible_response

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/loop.py

@@ -276,6 +276,10 @@ class OpenAILoop(BaseLoop):
             )
             # Don't reset last_response_id to None - keep the previous value if available
 
+
+            # Log standardized response for ease of parsing
+            # Since this is the openAI responses format, we don't need to convert it to agent response format
+            self._log_api_call("agent_response", request=None, response=response)
             # Process API response
             await queue.put(response)
 

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/openai/tools/computer.py

@@ -44,6 +44,7 @@ Action = Literal[
     "double_click",
     "screenshot",
     "scroll",
+    "drag",
 ]
 
 
@@ -162,9 +163,14 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             y = kwargs.get("y")
             if x is None or y is None:
                 raise ToolError("x and y coordinates are required for scroll action")
-            scroll_x = kwargs.get("scroll_x", 0) //
-            scroll_y = kwargs.get("scroll_y", 0) //
+            scroll_x = kwargs.get("scroll_x", 0) // 50
+            scroll_y = kwargs.get("scroll_y", 0) // 50
             return await self.handle_scroll(x, y, scroll_x, scroll_y)
+        elif type == "drag":
+            path = kwargs.get("path")
+            if not path or not isinstance(path, list) or len(path) < 2:
+                raise ToolError("path is required for drag action and must contain at least 2 points")
+            return await self.handle_drag(path)
         elif type == "screenshot":
             return await self.screenshot()
         elif type == "wait":
@@ -240,11 +246,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
 
             if len(mapped_keys) > 1:
                 # For key combinations (like Ctrl+C)
-
-                    await self.computer.interface.press_key(k)
-                    await asyncio.sleep(0.1)
-                for k in reversed(mapped_keys):
-                    await self.computer.interface.press_key(k)
+                await self.computer.interface.hotkey(*mapped_keys)
             else:
                 # Single key press
                 await self.computer.interface.press_key(mapped_keys[0])
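The change above collapses the press-each-key loop into a single combined chord, so modifiers stay held while the final key goes down. A short sketch of the difference, assuming an interface object like computer.interface and illustrative key names:

    async def copy_selection(interface) -> None:
        """Send one combined chord instead of sequential key presses."""
        mapped_keys = ["ctrl", "c"]
        # Before this change: each key was pressed and released one at a time,
        # so the modifier was already up before "c" went down.
        # After: a single call holds the whole combination together.
        await interface.hotkey(*mapped_keys)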
@@ -306,6 +308,41 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             self.logger.error(f"Error in handle_scroll: {str(e)}")
             raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")
 
+    async def handle_drag(self, path: List[Dict[str, int]]) -> ToolResult:
+        """Handle mouse drag operation using a path of coordinates.
+
+        Args:
+            path: List of coordinate points {"x": int, "y": int} defining the drag path
+
+        Returns:
+            ToolResult with the operation result and screenshot
+        """
+        try:
+            # Convert from [{"x": x, "y": y}, ...] format to [(x, y), ...] format
+            points = [(p["x"], p["y"]) for p in path]
+
+            # Perform drag action
+            if len(points) == 2:
+                await self.computer.interface.move_cursor(points[0][0], points[0][1])
+                await self.computer.interface.drag_to(points[1][0], points[1][1])
+            else:
+                await self.computer.interface.drag(points, button="left")
+
+            # Wait for UI to update
+            await asyncio.sleep(0.5)
+
+            # Take screenshot after action
+            screenshot = await self.computer.interface.screenshot()
+            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
+
+            return ToolResult(
+                output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
+                base64_image=base64_screenshot,
+            )
+        except Exception as e:
+            self.logger.error(f"Error in handle_drag: {str(e)}")
+            raise ToolError(f"Failed to perform drag operation: {str(e)}")
+
     async def screenshot(self) -> ToolResult:
         """Take a screenshot."""
         try:
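A drag payload just needs a path of at least two points: a two-point path takes the move_cursor plus drag_to branch, anything longer goes through interface.drag. The sketch below assumes the tool is invoked as a callable that dispatches on the "type" kwarg, as the hunk above suggests; that calling convention is an assumption.

    async def drag_examples(tool) -> None:
        """Two payload shapes for the new 'drag' action; tool is the ComputerTool instance."""
        # Exactly two points: cursor moves to the first, then drag_to the second.
        await tool(type="drag", path=[{"x": 100, "y": 200}, {"x": 400, "y": 200}])
        # Three or more points: passed through to interface.drag(points, button="left").
        await tool(type="drag", path=[{"x": 100, "y": 200},
                                      {"x": 250, "y": 120},
                                      {"x": 400, "y": 200}])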

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/clients/oaicompat.py

@@ -94,8 +94,15 @@ class OAICompatClient(BaseUITarsClient):
         """
         headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
 
-        final_messages = [
-
+        final_messages = [
+            {
+                "role": "system",
+                "content": [
+                    { "type": "text", "text": system }
+                ]
+            }
+        ]
+
         # Process messages
         for item in messages:
             if isinstance(item, dict):
@@ -138,8 +145,13 @@ class OAICompatClient(BaseUITarsClient):
                 message = {"role": "user", "content": [{"type": "text", "text": item}]}
                 final_messages.append(message)
 
-        payload = {
-
+        payload = {
+            "model": self.model,
+            "messages": final_messages,
+            "max_tokens": max_tokens or self.max_tokens,
+            "temperature": self.temperature,
+            "top_p": 0.7,
+        }
 
         try:
             async with aiohttp.ClientSession() as session:
@@ -178,25 +190,21 @@ class OAICompatClient(BaseUITarsClient):
                     response_text = await response.text()
                     logger.debug(f"Response content: {response_text}")
 
+                    # if 503, then the endpoint is still warming up
+                    if response.status == 503:
+                        logger.error(f"Endpoint is still warming up, please try again later")
+                        raise Exception(f"Endpoint is still warming up: {response_text}")
+
                     # Try to parse as JSON if the content type is appropriate
                     if "application/json" in response.headers.get('Content-Type', ''):
                         response_json = await response.json()
                     else:
                         raise Exception(f"Response is not JSON format")
-                        # # Optionally try to parse it anyway
-                        # try:
-                        #     import json
-                        #     response_json = json.loads(response_text)
-                        # except json.JSONDecodeError as e:
-                        #     print(f"Failed to parse response as JSON: {e}")
 
                     if response.status != 200:
-
-
-
-                        logger.error(f"Error in API call: {error_msg}")
-                        raise Exception(f"API error: {error_msg}")
-
+                        logger.error(f"Error in API call: {response_text}")
+                        raise Exception(f"API error: {response_text}")
+
                     return response_json
 
             except Exception as e:
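Since a 503 now surfaces as an "Endpoint is still warming up" exception instead of a generic API error, callers can back off and retry instead of failing outright. A caller-side sketch; the run_interleaved method name and the substring match are assumptions, only the warming-up exception itself is introduced by this diff:

    import asyncio

    async def call_with_warmup_retry(client, messages, system, attempts=5):
        """Retry a UI-TARS call while the endpoint reports it is warming up."""
        for attempt in range(attempts):
            try:
                return await client.run_interleaved(messages, system)  # method name assumed
            except Exception as e:
                if "warming up" in str(e) and attempt < attempts - 1:
                    await asyncio.sleep(2 ** attempt)  # simple exponential backoff
                    continue
                raise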

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/loop.py

@@ -17,10 +17,10 @@ from ...core.types import AgentResponse, LLMProvider
 from ...core.visualization import VisualizationHelper
 from computer import Computer
 
-from .utils import add_box_token, parse_actions, parse_action_parameters
+from .utils import add_box_token, parse_actions, parse_action_parameters, to_agent_response_format
 from .tools.manager import ToolManager
 from .tools.computer import ToolResult
-from .prompts import COMPUTER_USE, SYSTEM_PROMPT
+from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
 
 from .clients.oaicompat import OAICompatClient
 
@@ -184,7 +184,7 @@ class UITARSLoop(BaseLoop):
         if first_user_idx is not None and instruction:
             # Create the computer use prompt
             user_prompt = COMPUTER_USE.format(
-                instruction=instruction,
+                instruction='\n'.join([instruction, MAC_SPECIFIC_NOTES]),
                 language="English"
             )
 
@@ -232,8 +232,11 @@ class UITARSLoop(BaseLoop):
         if self.client is None:
             raise RuntimeError("Failed to initialize client")
 
-        #
+        # Get messages in standard format from the message manager
+        self.message_manager.messages = messages.copy()
         prepared_messages = self.message_manager.get_messages()
+
+        # Convert messages to UI-TARS format
         uitars_messages = self.to_uitars_format(prepared_messages)
 
         # Log request
@@ -437,7 +440,7 @@ class UITARSLoop(BaseLoop):
     # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
     ###########################################
 
-    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[
+    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
         """Run the agent loop with provided messages.
 
         Args:
@@ -504,41 +507,16 @@ class UITARSLoop(BaseLoop):
 
                     # Update whether an action screenshot was saved this turn
                     action_screenshot_saved = action_screenshot_saved or new_screenshot_saved
-
-                    # Parse actions from the raw response
-                    raw_response = response["choices"][0]["message"]["content"]
-                    parsed_actions = parse_actions(raw_response)
 
-
-
-
-
-
-
-
-
-                    thought_response = {
-                        "role": "assistant",
-                        "content": thought or raw_response,
-                        "metadata": {
-                            "title": "🧠 UI-TARS Thoughts"
-                        }
-                    }
+                    agent_response = await to_agent_response_format(
+                        response,
+                        messages,
+                        model=self.model,
+                    )
+                    # Log standardized response for ease of parsing
+                    self._log_api_call("agent_response", request=None, response=agent_response)
+                    yield agent_response
 
-                    # Create action response format
-                    action_response = {
-                        "role": "assistant",
-                        "content": str(parsed_actions),
-                        "metadata": {
-                            "title": "🖱️ UI-TARS Actions",
-                        }
-                    }
-
-                    # Yield both responses to the caller (thoughts first, then actions)
-                    yield thought_response
-                    if parsed_actions:
-                        yield action_response
-
                     # Check if we should continue this conversation
                     running = should_continue
 
@@ -559,7 +537,8 @@ class UITARSLoop(BaseLoop):
                     logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}")
 
                     yield {
-                        "
+                        "role": "assistant",
+                        "content": f"Error: {str(e)}",
                         "metadata": {"title": "❌ Error"},
                     }
 

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/prompts.py

@@ -1,5 +1,9 @@
 """Prompts for UI-TARS agent."""
 
+MAC_SPECIFIC_NOTES = """
+(You are operating on macOS, use 'cmd' instead of 'ctrl' for most shortcuts e.g., hotkey(key='cmd c') for copy, hotkey(key='cmd v') for paste, hotkey(key='cmd t') for new tab).)
+"""
+
 SYSTEM_PROMPT = "You are a helpful assistant."
 
 COMPUTER_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
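Note that this text is spliced into the user instruction rather than the system prompt; the loop.py hunk earlier shows the exact call, reproduced here as a standalone snippet:

    from agent.providers.uitars.prompts import COMPUTER_USE, MAC_SPECIFIC_NOTES

    instruction = "Open a new browser tab"
    user_prompt = COMPUTER_USE.format(
        instruction="\n".join([instruction, MAC_SPECIFIC_NOTES]),
        language="English",
    )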
@@ -56,4 +60,4 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
 
 ## User Instruction
 {instruction}
-"""
+"""

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/tools/computer.py

@@ -173,9 +173,13 @@ class ComputerTool(BaseComputerTool):
         elif action == "hotkey":
             if "keys" in kwargs:
                 keys = kwargs["keys"]
-                for key in keys:
-                    await self.computer.interface.press_key(key)
 
+                if len(keys) > 1:
+                    await self.computer.interface.hotkey(*keys)
+                else:
+                    # Single key press
+                    await self.computer.interface.press_key(keys[0])
+
             # Wait for UI to update
             await asyncio.sleep(0.3)
 

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/providers/uitars/utils.py

@@ -4,9 +4,114 @@ import logging
 import base64
 import re
 from typing import Any, Dict, List, Optional, Union, Tuple
+from datetime import datetime
 
 logger = logging.getLogger(__name__)
 
+from ...core.types import AgentResponse
+
+async def to_agent_response_format(
+    response: Dict[str, Any],
+    messages: List[Dict[str, Any]],
+    model: Optional[str] = None,
+) -> AgentResponse:
+    """Convert raw UI-TARS response to agent response format.
+
+    Args:
+        response: Raw UI-TARS response
+        messages: List of messages in standard format
+        model: Optional model name
+
+    Returns:
+        AgentResponse: Standardized agent response format
+    """
+    # Create unique IDs for this response
+    response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
+    reasoning_id = f"rs_{response_id}"
+    action_id = f"cu_{response_id}"
+    call_id = f"call_{response_id}"
+
+    # Parse actions from the raw response
+    content = response["choices"][0]["message"]["content"]
+    actions = parse_actions(content)
+
+    # Extract thought content if available
+    reasoning_text = ""
+    if "Thought:" in content:
+        thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", content, re.DOTALL)
+        if thought_match:
+            reasoning_text = thought_match.group(1).strip()
+
+    # Create output items
+    output_items = []
+    if reasoning_text:
+        output_items.append({
+            "type": "reasoning",
+            "id": reasoning_id,
+            "text": reasoning_text
+        })
+    if actions:
+        for i, action in enumerate(actions):
+            action_name, tool_args = parse_action_parameters(action)
+            if action_name == "finished":
+                output_items.append({
+                    "type": "message",
+                    "role": "assistant",
+                    "content": [{
+                        "type": "output_text",
+                        "text": tool_args["content"]
+                    }],
+                    "id": f"action_{i}_{action_id}",
+                    "status": "completed"
+                })
+            else:
+                if tool_args.get("action") == action_name:
+                    del tool_args["action"]
+                output_items.append({
+                    "type": "computer_call",
+                    "id": f"{action}_{i}_{action_id}",
+                    "call_id": f"call_{i}_{action_id}",
+                    "action": { "type": action_name, **tool_args },
+                    "pending_safety_checks": [],
+                    "status": "completed"
+                })
+
+    # Create agent response
+    agent_response = AgentResponse(
+        id=response_id,
+        object="response",
+        created_at=int(datetime.now().timestamp()),
+        status="completed",
+        error=None,
+        incomplete_details=None,
+        instructions=None,
+        max_output_tokens=None,
+        model=model or response["model"],
+        output=output_items,
+        parallel_tool_calls=True,
+        previous_response_id=None,
+        reasoning={"effort": "medium"},
+        store=True,
+        temperature=0.0,
+        top_p=0.7,
+        text={"format": {"type": "text"}},
+        tool_choice="auto",
+        tools=[
+            {
+                "type": "computer_use_preview",
+                "display_height": 768,
+                "display_width": 1024,
+                "environment": "mac",
+            }
+        ],
+        truncation="auto",
+        usage=response["usage"],
+        user=None,
+        metadata={},
+        response=response
+    )
+    return agent_response
+
 
 def add_box_token(input_string: str) -> str:
     """Add box tokens to the coordinates in the model response.
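For a raw completion such as "Thought: open a new tab\nAction: hotkey(key='cmd t')", the helper above would emit output items roughly of the shape below. The values are sketched by hand: IDs are abbreviated, and the exact kwargs inside "action" depend on what parse_action_parameters extracts.

    raw_content = "Thought: open a new tab\nAction: hotkey(key='cmd t')"
    expected_output_items = [
        {"type": "reasoning", "id": "rs_resp_...", "text": "open a new tab"},
        {
            "type": "computer_call",
            "id": "hotkey(key='cmd t')_0_cu_resp_...",
            "call_id": "call_0_cu_resp_...",
            "action": {"type": "hotkey", "key": "cmd t"},
            "pending_safety_checks": [],
            "status": "completed",
        },
    ]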
@@ -74,7 +179,13 @@ def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]:
     """
     # Handle "finished" action
     if action.startswith("finished"):
-
+        # Parse content if it exists
+        content_match = re.search(r"content='([^']*)'", action)
+        if content_match:
+            content = content_match.group(1)
+            return "finished", {"content": content}
+        else:
+            return "finished", {}
 
     # Parse action parameters
     action_match = re.match(r'(\w+)\((.*)\)', action)
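The effect of the new branch, traced through the regex above:

    parse_action_parameters("finished(content='Task complete')")
    # -> ("finished", {"content": "Task complete"})

    parse_action_parameters("finished()")  # no content='...' to match
    # -> ("finished", {})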

{cua_agent-0.1.30 → cua_agent-0.1.32}/agent/ui/gradio/app.py

@@ -35,6 +35,7 @@ from pathlib import Path
 from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
 import gradio as gr
 from gradio.components.chatbot import MetadataDict
+from typing import cast
 
 # Import from agent package
 from agent.core.types import AgentResponse
@@ -322,63 +323,6 @@ def get_ollama_models() -> List[str]:
         logging.error(f"Error getting Ollama models: {e}")
         return []
 
-
-def extract_synthesized_text(
-    result: Union[AgentResponse, Dict[str, Any]],
-) -> Tuple[str, MetadataDict]:
-    """Extract synthesized text from the agent result."""
-    synthesized_text = ""
-    metadata = MetadataDict()
-
-    if "output" in result and result["output"]:
-        for output in result["output"]:
-            if output.get("type") == "reasoning":
-                metadata["title"] = "🧠 Reasoning"
-                content = output.get("content", "")
-                if content:
-                    synthesized_text += f"{content}\n"
-            elif output.get("type") == "message":
-                # Handle message type outputs - can contain rich content
-                content = output.get("content", [])
-
-                # Content is usually an array of content blocks
-                if isinstance(content, list):
-                    for block in content:
-                        if isinstance(block, dict) and block.get("type") == "output_text":
-                            text_value = block.get("text", "")
-                            if text_value:
-                                synthesized_text += f"{text_value}\n"
-
-            elif output.get("type") == "computer_call":
-                action = output.get("action", {})
-                action_type = action.get("type", "")
-
-                # Create a descriptive text about the action
-                if action_type == "click":
-                    button = action.get("button", "")
-                    x = action.get("x", "")
-                    y = action.get("y", "")
-                    synthesized_text += f"Clicked {button} at position ({x}, {y}).\n"
-                elif action_type == "type":
-                    text = action.get("text", "")
-                    synthesized_text += f"Typed: {text}.\n"
-                elif action_type == "keypress":
-                    # Extract key correctly from either keys array or key field
-                    if isinstance(action.get("keys"), list):
-                        key = ", ".join(action.get("keys"))
-                    else:
-                        key = action.get("key", "")
-
-                    synthesized_text += f"Pressed key: {key}\n"
-                else:
-                    synthesized_text += f"Performed {action_type} action.\n"
-
-    metadata["status"] = "done"
-    metadata["title"] = f"🛠️ {synthesized_text.strip().splitlines()[-1]}"
-
-    return synthesized_text.strip(), metadata
-
-
 def create_computer_instance(verbosity: int = logging.INFO) -> Computer:
     """Create or get the global Computer instance."""
     global global_computer
@@ -447,66 +391,6 @@ def create_agent(
 
     return global_agent
 
-
-def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> Tuple[str, MetadataDict]:
-    """Process agent results for the Gradio UI."""
-    # Extract text content
-    text_obj = result.get("text", {})
-    metadata = result.get("metadata", {})
-
-    # Create a properly typed MetadataDict
-    metadata_dict = MetadataDict()
-    metadata_dict["title"] = metadata.get("title", "")
-    metadata_dict["status"] = "done"
-    metadata = metadata_dict
-
-    # For OpenAI's Computer-Use Agent, text field is an object with format property
-    if (
-        text_obj
-        and isinstance(text_obj, dict)
-        and "format" in text_obj
-        and not text_obj.get("value", "")
-    ):
-        content, metadata = extract_synthesized_text(result)
-    else:
-        if not text_obj:
-            text_obj = result
-
-        # For other types of results, try to get text directly
-        if isinstance(text_obj, dict):
-            if "value" in text_obj:
-                content = text_obj["value"]
-            elif "text" in text_obj:
-                content = text_obj["text"]
-            elif "content" in text_obj:
-                content = text_obj["content"]
-            else:
-                content = ""
-        else:
-            content = str(text_obj) if text_obj else ""
-
-        # If still no content but we have outputs, create a summary
-        if not content and "output" in result and result["output"]:
-            output = result["output"]
-            for out in output:
-                if out.get("type") == "reasoning":
-                    content = out.get("content", "")
-                    if content:
-                        break
-                elif out.get("type") == "computer_call":
-                    action = out.get("action", {})
-                    action_type = action.get("type", "")
-                    if action_type:
-                        content = f"Performing action: {action_type}"
-                        break
-
-        # Clean up the text - ensure content is a string
-        if not isinstance(content, str):
-            content = str(content) if content else ""
-
-    return content, metadata
-
-
 def create_gradio_ui(
     provider_name: str = "openai",
     model_name: str = "gpt-4o",
@@ -907,17 +791,64 @@
 
             # Stream responses from the agent
             async for result in global_agent.run(last_user_message):
-
-
-
-
-
-
-
-
+                print(f"DEBUG - Agent response ------- START")
+                from pprint import pprint
+                pprint(result)
+                print(f"DEBUG - Agent response ------- END")
+
+                def generate_gradio_messages():
+                    if result.get("content"):
+                        yield gr.ChatMessage(
+                            role="assistant",
+                            content=result.get("content", ""),
+                            metadata=cast(MetadataDict, result.get("metadata", {}))
                         )
-
-
+                    else:
+                        outputs = result.get("output", [])
+                        for output in outputs:
+                            if output.get("type") == "message":
+                                content = output.get("content", [])
+                                for content_part in content:
+                                    if content_part.get("text"):
+                                        yield gr.ChatMessage(
+                                            role=output.get("role", "assistant"),
+                                            content=content_part.get("text", ""),
+                                            metadata=content_part.get("metadata", {})
+                                        )
+                            elif output.get("type") == "reasoning":
+                                # if it's openAI, we only have access to a summary of the reasoning
+                                summary_content = output.get("summary", [])
+                                if summary_content:
+                                    for summary_part in summary_content:
+                                        if summary_part.get("type") == "summary_text":
+                                            yield gr.ChatMessage(
+                                                role="assistant",
+                                                content=summary_part.get("text", "")
+                                            )
+                                else:
+                                    summary_content = output.get("text", "")
+                                    if summary_content:
+                                        yield gr.ChatMessage(
+                                            role="assistant",
+                                            content=summary_content,
+                                        )
+                            elif output.get("type") == "computer_call":
+                                action = output.get("action", {})
+                                action_type = action.get("type", "")
+                                if action_type:
+                                    action_title = f"🛠️ Performing {action_type}"
+                                    if action.get("x") and action.get("y"):
+                                        action_title += f" at ({action['x']}, {action['y']})"
+                                    yield gr.ChatMessage(
+                                        role="assistant",
+                                        content=f"```json\n{json.dumps(action)}\n```",
+                                        metadata={"title": action_title}
+                                    )
+
+                for message in generate_gradio_messages():
+                    history.append(message)
+                    yield history
+
 
         except Exception as e:
             import traceback
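Traced through the computer_call branch above, a click action would render like this (values illustrative):

    action = {"type": "click", "button": "left", "x": 120, "y": 340}
    # The branch above would emit:
    # gr.ChatMessage(
    #     role="assistant",
    #     content='```json\n{"type": "click", "button": "left", "x": 120, "y": 340}\n```',
    #     metadata={"title": "🛠️ Performing click at (120, 340)"},
    # )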

{cua_agent-0.1.30 → cua_agent-0.1.32}/pyproject.toml

@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
 
 [project]
 name = "cua-agent"
-version = "0.1.30"
+version = "0.1.32"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
@@ -108,7 +108,7 @@ target-version = [
 
 [tool.ruff]
 line-length = 100
-target-version = "0.1.30"
+target-version = "0.1.32"
 select = [
     "E",
     "F",
@@ -122,7 +122,7 @@ docstring-code-format = true
 
 [tool.mypy]
 strict = true
-python_version = "0.1.30"
+python_version = "0.1.32"
 ignore_missing_imports = true
 disallow_untyped_defs = true
 check_untyped_defs = true