cua-agent 0.1.31__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/providers/anthropic/loop.py +2 -0
- agent/providers/anthropic/tools/computer.py +11 -9
- agent/providers/omni/loop.py +2 -0
- agent/providers/openai/loop.py +4 -0
- agent/providers/openai/tools/computer.py +41 -0
- agent/providers/uitars/clients/oaicompat.py +8 -12
- agent/providers/uitars/loop.py +12 -36
- agent/providers/uitars/utils.py +112 -1
- agent/ui/gradio/app.py +58 -127
- {cua_agent-0.1.31.dist-info → cua_agent-0.1.32.dist-info}/METADATA +3 -3
- {cua_agent-0.1.31.dist-info → cua_agent-0.1.32.dist-info}/RECORD +13 -13
- {cua_agent-0.1.31.dist-info → cua_agent-0.1.32.dist-info}/WHEEL +0 -0
- {cua_agent-0.1.31.dist-info → cua_agent-0.1.32.dist-info}/entry_points.txt +0 -0
agent/providers/anthropic/loop.py CHANGED
@@ -279,6 +279,8 @@ class AnthropicLoop(BaseLoop):
                     messages,
                     model=self.model,
                 )
+                # Log standardized response for ease of parsing
+                self._log_api_call("agent_response", request=None, response=openai_compatible_response)
                 await queue.put(openai_compatible_response)

                 if not should_continue:
agent/providers/anthropic/tools/computer.py CHANGED
@@ -161,15 +161,17 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
             self.logger.info(f"Moving cursor to ({x}, {y})")
             await self.computer.interface.move_cursor(x, y)
         elif action == "left_click_drag":
-            … (9 removed lines not rendered in the source diff)
+            # Get the start coordinate from kwargs
+            start_coordinate = kwargs.get("start_coordinate")
+            if not start_coordinate:
+                raise ToolError("start_coordinate is required for left_click_drag action")
+
+            start_x, start_y = start_coordinate
+            end_x, end_y = x, y
+
+            self.logger.info(f"Dragging from ({start_x}, {start_y}) to ({end_x}, {end_y})")
+            await self.computer.interface.move_cursor(start_x, start_y)
+            await self.computer.interface.drag_to(end_x, end_y)

             # Wait briefly for any UI changes
             await asyncio.sleep(0.5)
agent/providers/omni/loop.py CHANGED
@@ -670,6 +670,8 @@ class OmniLoop(BaseLoop):
                 parsed_screen=parsed_screen,
                 parser=self.parser
             )
+            # Log standardized response for ease of parsing
+            self._log_api_call("agent_response", request=None, response=openai_compatible_response)

             # Yield the response to the caller
             yield openai_compatible_response
agent/providers/openai/loop.py CHANGED
@@ -276,6 +276,10 @@ class OpenAILoop(BaseLoop):
             )
             # Don't reset last_response_id to None - keep the previous value if available

+
+            # Log standardized response for ease of parsing
+            # Since this is the openAI responses format, we don't need to convert it to agent response format
+            self._log_api_call("agent_response", request=None, response=response)
             # Process API response
             await queue.put(response)
agent/providers/openai/tools/computer.py CHANGED
@@ -44,6 +44,7 @@ Action = Literal[
     "double_click",
     "screenshot",
     "scroll",
+    "drag",
 ]


@@ -165,6 +166,11 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             scroll_x = kwargs.get("scroll_x", 0) // 50
             scroll_y = kwargs.get("scroll_y", 0) // 50
             return await self.handle_scroll(x, y, scroll_x, scroll_y)
+        elif type == "drag":
+            path = kwargs.get("path")
+            if not path or not isinstance(path, list) or len(path) < 2:
+                raise ToolError("path is required for drag action and must contain at least 2 points")
+            return await self.handle_drag(path)
         elif type == "screenshot":
             return await self.screenshot()
         elif type == "wait":
@@ -302,6 +308,41 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
             self.logger.error(f"Error in handle_scroll: {str(e)}")
             raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")

+    async def handle_drag(self, path: List[Dict[str, int]]) -> ToolResult:
+        """Handle mouse drag operation using a path of coordinates.
+
+        Args:
+            path: List of coordinate points {"x": int, "y": int} defining the drag path
+
+        Returns:
+            ToolResult with the operation result and screenshot
+        """
+        try:
+            # Convert from [{"x": x, "y": y}, ...] format to [(x, y), ...] format
+            points = [(p["x"], p["y"]) for p in path]
+
+            # Perform drag action
+            if len(points) == 2:
+                await self.computer.interface.move_cursor(points[0][0], points[0][1])
+                await self.computer.interface.drag_to(points[1][0], points[1][1])
+            else:
+                await self.computer.interface.drag(points, button="left")
+
+            # Wait for UI to update
+            await asyncio.sleep(0.5)
+
+            # Take screenshot after action
+            screenshot = await self.computer.interface.screenshot()
+            base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
+
+            return ToolResult(
+                output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
+                base64_image=base64_screenshot,
+            )
+        except Exception as e:
+            self.logger.error(f"Error in handle_drag: {str(e)}")
+            raise ToolError(f"Failed to perform drag operation: {str(e)}")
+

     async def screenshot(self) -> ToolResult:
         """Take a screenshot."""
         try:
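Note: the new "drag" action accepts a path of point dicts, which handle_drag converts to (x, y) tuples, using move_cursor + drag_to for a simple two-point drag and interface.drag for longer paths. A hedged sketch of the payload shape (hypothetical coordinates):

    # Hypothetical "drag" action payload:
    path = [{"x": 100, "y": 200}, {"x": 180, "y": 260}, {"x": 300, "y": 400}]

    # handle_drag's conversion step:
    points = [(p["x"], p["y"]) for p in path]
    assert points[0] == (100, 200) and points[-1] == (300, 400)
    # len(points) == 2 -> move_cursor + drag_to; otherwise interface.drag(points, button="left")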
agent/providers/uitars/clients/oaicompat.py CHANGED
@@ -190,25 +190,21 @@ class OAICompatClient(BaseUITarsClient):
                response_text = await response.text()
                logger.debug(f"Response content: {response_text}")

+                # if 503, then the endpoint is still warming up
+                if response.status == 503:
+                    logger.error(f"Endpoint is still warming up, please try again later")
+                    raise Exception(f"Endpoint is still warming up: {response_text}")
+
                # Try to parse as JSON if the content type is appropriate
                if "application/json" in response.headers.get('Content-Type', ''):
                    response_json = await response.json()
                else:
                    raise Exception(f"Response is not JSON format")
-                    # # Optionally try to parse it anyway
-                    # try:
-                    #     import json
-                    #     response_json = json.loads(response_text)
-                    # except json.JSONDecodeError as e:
-                    #     print(f"Failed to parse response as JSON: {e}")

                if response.status != 200:
-                    … (3 removed lines not rendered in the source diff)
-                    logger.error(f"Error in API call: {error_msg}")
-                    raise Exception(f"API error: {error_msg}")
-
+                    logger.error(f"Error in API call: {response_text}")
+                    raise Exception(f"API error: {response_text}")
+
                return response_json

            except Exception as e:
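Note: a 503 from the endpoint now raises immediately instead of falling through to the generic error path, so callers hitting cold Hugging Face endpoints may want to retry. A hedged sketch of such a wrapper (not part of the package; the helper name and timings are assumptions):

    import asyncio
    from typing import Awaitable, Callable, TypeVar

    T = TypeVar("T")

    async def with_warmup_retry(call: Callable[[], Awaitable[T]], attempts: int = 5, delay: float = 10.0) -> T:
        """Retry an async API call while the endpoint reports it is warming up."""
        for attempt in range(attempts):
            try:
                return await call()
            except Exception as e:
                # The client raises "Endpoint is still warming up: ..." on HTTP 503.
                if "warming up" in str(e) and attempt < attempts - 1:
                    await asyncio.sleep(delay)
                else:
                    raise
        raise RuntimeError("unreachable")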
agent/providers/uitars/loop.py CHANGED
@@ -17,7 +17,7 @@ from ...core.types import AgentResponse, LLMProvider
 from ...core.visualization import VisualizationHelper
 from computer import Computer

-from .utils import add_box_token, parse_actions, parse_action_parameters
+from .utils import add_box_token, parse_actions, parse_action_parameters, to_agent_response_format
 from .tools.manager import ToolManager
 from .tools.computer import ToolResult
 from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
@@ -440,7 +440,7 @@ class UITARSLoop(BaseLoop):
     # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD
     ###########################################

-    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[… (truncated in the source diff)
+    async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]:
        """Run the agent loop with provided messages.

        Args:
@@ -507,41 +507,16 @@ class UITARSLoop(BaseLoop):

            # Update whether an action screenshot was saved this turn
            action_screenshot_saved = action_screenshot_saved or new_screenshot_saved
-
-            # Parse actions from the raw response
-            raw_response = response["choices"][0]["message"]["content"]
-            parsed_actions = parse_actions(raw_response)
-
-            # Extract thought content if available
-            thought = ""
-            if "Thought:" in raw_response:
-                thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", raw_response, re.DOTALL)
-                if thought_match:
-                    thought = thought_match.group(1).strip()

-            … (8 removed lines not rendered in the source diff)
+            agent_response = await to_agent_response_format(
+                response,
+                messages,
+                model=self.model,
+            )
+            # Log standardized response for ease of parsing
+            self._log_api_call("agent_response", request=None, response=agent_response)
+            yield agent_response

-            # Create action response format
-            action_response = {
-                "role": "assistant",
-                "content": str(parsed_actions),
-                "metadata": {
-                    "title": "🖱️ UI-TARS Actions",
-                }
-            }
-
-            # Yield both responses to the caller (thoughts first, then actions)
-            yield thought_response
-            if parsed_actions:
-                yield action_response
-
            # Check if we should continue this conversation
            running = should_continue
@@ -562,7 +537,8 @@ class UITARSLoop(BaseLoop):
                    logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}")

                    yield {
-                        "… (truncated in the source diff)
+                        "role": "assistant",
+                        "content": f"Error: {str(e)}",
                        "metadata": {"title": "❌ Error"},
                    }
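Note: run() now yields one standardized AgentResponse per turn instead of separate thought/action chat messages. A hedged sketch of consuming the new stream (agent construction elided; the dict-style access mirrors the Gradio app change below):

    async for response in agent.run("Open the browser"):
        for item in response.get("output", []):
            if item.get("type") == "reasoning":
                print("thought:", item.get("text", ""))
            elif item.get("type") == "computer_call":
                print("action:", item.get("action", {}))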
agent/providers/uitars/utils.py CHANGED
@@ -4,9 +4,114 @@ import logging
 import base64
 import re
 from typing import Any, Dict, List, Optional, Union, Tuple
+from datetime import datetime

 logger = logging.getLogger(__name__)

+from ...core.types import AgentResponse
+
+async def to_agent_response_format(
+    response: Dict[str, Any],
+    messages: List[Dict[str, Any]],
+    model: Optional[str] = None,
+) -> AgentResponse:
+    """Convert raw UI-TARS response to agent response format.
+
+    Args:
+        response: Raw UI-TARS response
+        messages: List of messages in standard format
+        model: Optional model name
+
+    Returns:
+        AgentResponse: Standardized agent response format
+    """
+    # Create unique IDs for this response
+    response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
+    reasoning_id = f"rs_{response_id}"
+    action_id = f"cu_{response_id}"
+    call_id = f"call_{response_id}"
+
+    # Parse actions from the raw response
+    content = response["choices"][0]["message"]["content"]
+    actions = parse_actions(content)
+
+    # Extract thought content if available
+    reasoning_text = ""
+    if "Thought:" in content:
+        thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", content, re.DOTALL)
+        if thought_match:
+            reasoning_text = thought_match.group(1).strip()
+
+    # Create output items
+    output_items = []
+    if reasoning_text:
+        output_items.append({
+            "type": "reasoning",
+            "id": reasoning_id,
+            "text": reasoning_text
+        })
+    if actions:
+        for i, action in enumerate(actions):
+            action_name, tool_args = parse_action_parameters(action)
+            if action_name == "finished":
+                output_items.append({
+                    "type": "message",
+                    "role": "assistant",
+                    "content": [{
+                        "type": "output_text",
+                        "text": tool_args["content"]
+                    }],
+                    "id": f"action_{i}_{action_id}",
+                    "status": "completed"
+                })
+            else:
+                if tool_args.get("action") == action_name:
+                    del tool_args["action"]
+                output_items.append({
+                    "type": "computer_call",
+                    "id": f"{action}_{i}_{action_id}",
+                    "call_id": f"call_{i}_{action_id}",
+                    "action": { "type": action_name, **tool_args },
+                    "pending_safety_checks": [],
+                    "status": "completed"
+                })
+
+    # Create agent response
+    agent_response = AgentResponse(
+        id=response_id,
+        object="response",
+        created_at=int(datetime.now().timestamp()),
+        status="completed",
+        error=None,
+        incomplete_details=None,
+        instructions=None,
+        max_output_tokens=None,
+        model=model or response["model"],
+        output=output_items,
+        parallel_tool_calls=True,
+        previous_response_id=None,
+        reasoning={"effort": "medium"},
+        store=True,
+        temperature=0.0,
+        top_p=0.7,
+        text={"format": {"type": "text"}},
+        tool_choice="auto",
+        tools=[
+            {
+                "type": "computer_use_preview",
+                "display_height": 768,
+                "display_width": 1024,
+                "environment": "mac",
+            }
+        ],
+        truncation="auto",
+        usage=response["usage"],
+        user=None,
+        metadata={},
+        response=response
+    )
+    return agent_response
+

 def add_box_token(input_string: str) -> str:
     """Add box tokens to the coordinates in the model response.
@@ -74,7 +179,13 @@ def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]:
     """
     # Handle "finished" action
     if action.startswith("finished"):
-        … (1 removed line not rendered in the source diff)
+        # Parse content if it exists
+        content_match = re.search(r"content='([^']*)'", action)
+        if content_match:
+            content = content_match.group(1)
+            return "finished", {"content": content}
+        else:
+            return "finished", {}

     # Parse action parameters
     action_match = re.match(r'(\w+)\((.*)\)', action)
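Note: to_agent_response_format expects the OpenAI chat-completions shape that the UI-TARS clients return. A hedged sketch of the mapping (the raw content string is a hypothetical UI-TARS completion, not taken from the package):

    raw = {
        "model": "ByteDance-Seed/UI-TARS-1.5-7B",
        "choices": [{
            "message": {
                "role": "assistant",
                "content": "Thought: I should open the menu.\nAction: click(start_box='(120,40)')",
            }
        }],
        "usage": {"prompt_tokens": 512, "completion_tokens": 24, "total_tokens": 536},
    }
    # await to_agent_response_format(raw, messages=[]) then returns a response whose
    # "output" holds a reasoning item ("I should open the menu.") followed by a
    # computer_call item whose "action" dict starts with {"type": "click", ...}.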
agent/ui/gradio/app.py CHANGED
@@ -35,6 +35,7 @@ from pathlib import Path
 from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
 import gradio as gr
 from gradio.components.chatbot import MetadataDict
+from typing import cast

 # Import from agent package
 from agent.core.types import AgentResponse
@@ -322,63 +323,6 @@ def get_ollama_models() -> List[str]:
         logging.error(f"Error getting Ollama models: {e}")
         return []

-
-def extract_synthesized_text(
-    result: Union[AgentResponse, Dict[str, Any]],
-) -> Tuple[str, MetadataDict]:
-    """Extract synthesized text from the agent result."""
-    synthesized_text = ""
-    metadata = MetadataDict()
-
-    if "output" in result and result["output"]:
-        for output in result["output"]:
-            if output.get("type") == "reasoning":
-                metadata["title"] = "🧠 Reasoning"
-                content = output.get("content", "")
-                if content:
-                    synthesized_text += f"{content}\n"
-            elif output.get("type") == "message":
-                # Handle message type outputs - can contain rich content
-                content = output.get("content", [])
-
-                # Content is usually an array of content blocks
-                if isinstance(content, list):
-                    for block in content:
-                        if isinstance(block, dict) and block.get("type") == "output_text":
-                            text_value = block.get("text", "")
-                            if text_value:
-                                synthesized_text += f"{text_value}\n"
-
-            elif output.get("type") == "computer_call":
-                action = output.get("action", {})
-                action_type = action.get("type", "")
-
-                # Create a descriptive text about the action
-                if action_type == "click":
-                    button = action.get("button", "")
-                    x = action.get("x", "")
-                    y = action.get("y", "")
-                    synthesized_text += f"Clicked {button} at position ({x}, {y}).\n"
-                elif action_type == "type":
-                    text = action.get("text", "")
-                    synthesized_text += f"Typed: {text}.\n"
-                elif action_type == "keypress":
-                    # Extract key correctly from either keys array or key field
-                    if isinstance(action.get("keys"), list):
-                        key = ", ".join(action.get("keys"))
-                    else:
-                        key = action.get("key", "")
-
-                    synthesized_text += f"Pressed key: {key}\n"
-                else:
-                    synthesized_text += f"Performed {action_type} action.\n"
-
-        metadata["status"] = "done"
-        metadata["title"] = f"🛠️ {synthesized_text.strip().splitlines()[-1]}"
-
-    return synthesized_text.strip(), metadata
-
-
 def create_computer_instance(verbosity: int = logging.INFO) -> Computer:
     """Create or get the global Computer instance."""
     global global_computer
@@ -447,66 +391,6 @@ def create_agent(

     return global_agent

-
-def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> Tuple[str, MetadataDict]:
-    """Process agent results for the Gradio UI."""
-    # Extract text content
-    text_obj = result.get("text", {})
-    metadata = result.get("metadata", {})
-
-    # Create a properly typed MetadataDict
-    metadata_dict = MetadataDict()
-    metadata_dict["title"] = metadata.get("title", "")
-    metadata_dict["status"] = "done"
-    metadata = metadata_dict
-
-    # For OpenAI's Computer-Use Agent, text field is an object with format property
-    if (
-        text_obj
-        and isinstance(text_obj, dict)
-        and "format" in text_obj
-        and not text_obj.get("value", "")
-    ):
-        content, metadata = extract_synthesized_text(result)
-    else:
-        if not text_obj:
-            text_obj = result
-
-        # For other types of results, try to get text directly
-        if isinstance(text_obj, dict):
-            if "value" in text_obj:
-                content = text_obj["value"]
-            elif "text" in text_obj:
-                content = text_obj["text"]
-            elif "content" in text_obj:
-                content = text_obj["content"]
-            else:
-                content = ""
-        else:
-            content = str(text_obj) if text_obj else ""
-
-        # If still no content but we have outputs, create a summary
-        if not content and "output" in result and result["output"]:
-            output = result["output"]
-            for out in output:
-                if out.get("type") == "reasoning":
-                    content = out.get("content", "")
-                    if content:
-                        break
-                elif out.get("type") == "computer_call":
-                    action = out.get("action", {})
-                    action_type = action.get("type", "")
-                    if action_type:
-                        content = f"Performing action: {action_type}"
-                        break
-
-    # Clean up the text - ensure content is a string
-    if not isinstance(content, str):
-        content = str(content) if content else ""
-
-    return content, metadata
-
-
 def create_gradio_ui(
     provider_name: str = "openai",
     model_name: str = "gpt-4o",
@@ -907,17 +791,64 @@

             # Stream responses from the agent
             async for result in global_agent.run(last_user_message):
-                … (8 removed lines not rendered in the source diff)
+                print(f"DEBUG - Agent response ------- START")
+                from pprint import pprint
+                pprint(result)
+                print(f"DEBUG - Agent response ------- END")
+
+                def generate_gradio_messages():
+                    if result.get("content"):
+                        yield gr.ChatMessage(
+                            role="assistant",
+                            content=result.get("content", ""),
+                            metadata=cast(MetadataDict, result.get("metadata", {}))
                         )
-                … (2 removed lines not rendered in the source diff)
+                    else:
+                        outputs = result.get("output", [])
+                        for output in outputs:
+                            if output.get("type") == "message":
+                                content = output.get("content", [])
+                                for content_part in content:
+                                    if content_part.get("text"):
+                                        yield gr.ChatMessage(
+                                            role=output.get("role", "assistant"),
+                                            content=content_part.get("text", ""),
+                                            metadata=content_part.get("metadata", {})
+                                        )
+                            elif output.get("type") == "reasoning":
+                                # if it's openAI, we only have access to a summary of the reasoning
+                                summary_content = output.get("summary", [])
+                                if summary_content:
+                                    for summary_part in summary_content:
+                                        if summary_part.get("type") == "summary_text":
+                                            yield gr.ChatMessage(
+                                                role="assistant",
+                                                content=summary_part.get("text", "")
+                                            )
+                                else:
+                                    summary_content = output.get("text", "")
+                                    if summary_content:
+                                        yield gr.ChatMessage(
+                                            role="assistant",
+                                            content=summary_content,
+                                        )
+                            elif output.get("type") == "computer_call":
+                                action = output.get("action", {})
+                                action_type = action.get("type", "")
+                                if action_type:
+                                    action_title = f"🛠️ Performing {action_type}"
+                                    if action.get("x") and action.get("y"):
+                                        action_title += f" at ({action['x']}, {action['y']})"
+                                    yield gr.ChatMessage(
+                                        role="assistant",
+                                        content=f"```json\n{json.dumps(action)}\n```",
+                                        metadata={"title": action_title}
+                                    )
+
+                for message in generate_gradio_messages():
+                    history.append(message)
+                    yield history
+

         except Exception as e:
             import traceback
{cua_agent-0.1.31.dist-info → cua_agent-0.1.32.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cua-agent
-Version: 0.1.31
+Version: 0.1.32
 Summary: CUA (Computer Use) Agent for AI-driven computer interaction
 Author-Email: TryCua <gh@trycua.com>
 Requires-Python: >=3.10
@@ -120,10 +120,10 @@ async with Computer() as macos_computer:
     # model=LLM(provider=LLMProvider.ANTHROPIC)
     # or
     # loop=AgentLoop.OMNI,
-    # model=LLM(provider=LLMProvider.OLLAMA,…
+    # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
     # or
     # loop=AgentLoop.UITARS,
-    # model=LLM(provider=LLMProvider.OAICOMPAT,…
+    # model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
 )

 tasks = [
{cua_agent-0.1.31.dist-info → cua_agent-0.1.32.dist-info}/RECORD CHANGED
@@ -25,14 +25,14 @@ agent/providers/anthropic/api/logging.py,sha256=vHpwkIyOZdkSTVIH4ycbBPd4a_rzhP7O
 agent/providers/anthropic/api_handler.py,sha256=pWXcqDs0ruviDhRNRrz5Ac9ZH4yDv6ZlwpeG3a42cDg,5206
 agent/providers/anthropic/callbacks/__init__.py,sha256=PciBb6Z6MKSwfXqDjU3pV_0FS4MOn_Np_A7_skD-6dA,104
 agent/providers/anthropic/callbacks/manager.py,sha256=euIah5yiM8nhisN-RWXewo6v0WQr0c-FbMBO04r6dJk,1865
-agent/providers/anthropic/loop.py,sha256=…
+agent/providers/anthropic/loop.py,sha256=A9ce3q5a4SQfOKVUk0En4x6fuc8a8s9wXFlQg4ypIkg,20394
 agent/providers/anthropic/prompts.py,sha256=nHFfgPrfvnWrEdVP7EUBGUHAI85D2X9HeZirk9EwncU,1941
 agent/providers/anthropic/response_handler.py,sha256=ZTprV4NTP9Eb9jQ7QgEKZBX0L6rMj5nqBRiE3Zfws8I,8008
 agent/providers/anthropic/tools/__init__.py,sha256=JyZwuVtPUnZwRSZBSCdQv9yxbLCsygm3l8Ywjjt9qTQ,661
 agent/providers/anthropic/tools/base.py,sha256=WnRDbqO25tQzLpS2RU2ZXTLF5wd5IqU7SiyRAglQat4,2752
 agent/providers/anthropic/tools/bash.py,sha256=QODuFjWuHM4GgGTqK2HizSyYqGqQwX70AdwrFiGSp2Q,2218
 agent/providers/anthropic/tools/collection.py,sha256=RBK_6hxfHExR-EOxadiLl0OznmFj07nyIUjFgaYZ6Eo,960
-agent/providers/anthropic/tools/computer.py,sha256=…
+agent/providers/anthropic/tools/computer.py,sha256=HWirLMkG_29X7wYSysQlKt2Zliq6ifsHaY8-bKrp4Fw,25933
 agent/providers/anthropic/tools/edit.py,sha256=EGRP61MDA4Oue1D7Q-_vLpd6LdGbdBA1Z4HSZ66DbmI,13465
 agent/providers/anthropic/tools/manager.py,sha256=yNvgTkfEqnOz5isDF0RxvmBMZB0uh2PipFEH-PUXpoY,2020
 agent/providers/anthropic/tools/run.py,sha256=xhXdnBK1di9muaO44CEirL9hpGy3NmKbjfMpyeVmn8Y,1595
@@ -47,7 +47,7 @@ agent/providers/omni/clients/ollama.py,sha256=PmR5EhU9Mi43_o5mZN36XcpiGKp5HbQwlX
 agent/providers/omni/clients/openai.py,sha256=iTSYWEJEM8INFPGJMiUVs8rFn0781XF_ofRkd7NT3gk,5920
 agent/providers/omni/clients/utils.py,sha256=Ani9CVVBm_J2Dl51WG6p1GVuoI6cq8scISrG0pmQ37o,688
 agent/providers/omni/image_utils.py,sha256=wejhWb36yqedsPnLFTFwk2wth8a6txfVWSg4EaNrRdA,908
-agent/providers/omni/loop.py,sha256=…
+agent/providers/omni/loop.py,sha256=3eL80w2btw7Gt9FpvJjZeHf97fnDSzYpTi1hBjzRjIk,40929
 agent/providers/omni/parser.py,sha256=REpQwlwvY1z_N8wbMj6GhOeTiiWVWHhVja_LOxgzbks,11734
 agent/providers/omni/prompts.py,sha256=Mupjy0bUwBjcAeLXpE1r1jisYPSlhwsp-IXJKEKrEtw,3779
 agent/providers/omni/tools/__init__.py,sha256=IC1cMEDoR2ljGcNNthzBRF_VtnDbRL5qvHJWErtNp98,774
@@ -58,28 +58,28 @@ agent/providers/omni/tools/manager.py,sha256=UhtasaxGcmkxtz-bP1UJ1a4xdYnD3Cv8Pbt
 agent/providers/omni/utils.py,sha256=Ikp6ONL1HO637o3KDtv5yv6q-4uIWAzMSQDvGetWXC8,8724
 agent/providers/openai/__init__.py,sha256=8DS6YNZp42NLCacwXsfRaghyczaOCVovX8TgzXUZf_o,165
 agent/providers/openai/api_handler.py,sha256=L1K56dR1j4JsX1sX4OFYeKoCUMM25Fwj2y9nqv8oOhw,17736
-agent/providers/openai/loop.py,sha256=…
+agent/providers/openai/loop.py,sha256=XkwGXgqKXH0bHz7DYstJ7HZjrm7upIRpYBm-UsuYGSU,19608
 agent/providers/openai/response_handler.py,sha256=K8v_92uSr9R74Y5INY4naeEZZZm35CLIl4h74MBZhsw,7953
 agent/providers/openai/tools/__init__.py,sha256=-KbHMWcd2OVTk5RYQ3ACBEMygwbH-VW6n_98p0lwM4A,344
 agent/providers/openai/tools/base.py,sha256=Np_BC9Cm6TslK99etE9hVTtsBlcEaGhoNCK3NXdB_Lw,2474
-agent/providers/openai/tools/computer.py,sha256=…
+agent/providers/openai/tools/computer.py,sha256=Jo243sNNy3_N1kO07tTMe2YWbzJGLUzOHOj5CGfwwM4,13924
 agent/providers/openai/tools/manager.py,sha256=-wM641dLf8vcv6QF9x_ViGJeDl2YTuUV93j6u7GBI18,3903
 agent/providers/openai/types.py,sha256=0mFUxeFy23fJhMwc6lAFVXKngg2fJIXkPS5oV284V1M,898
 agent/providers/openai/utils.py,sha256=YeCZWIqOFSeugWoqAS0rhxOKAfL-9uN9nrYSBGBgPdc,3175
 agent/providers/uitars/__init__.py,sha256=sq5OMVJP9E_sok9tIiKJreGkjmNWXPMObjPTClYv1es,38
 agent/providers/uitars/clients/base.py,sha256=5w8Ajmq1JiPyUQJUAq1lSkfpA8_Ts80NQiDxPMTtQrI,948
-agent/providers/uitars/clients/oaicompat.py,sha256=…
-agent/providers/uitars/loop.py,sha256=…
+agent/providers/uitars/clients/oaicompat.py,sha256=uYjwrGCVpFi8wj4kcaJ905ABiY6ksJZXaLlM61B2DUA,8907
+agent/providers/uitars/loop.py,sha256=CoZDk4ltz5nsw9yDnFKET5skP1uzibl3QDZOUfJQsKQ,22774
 agent/providers/uitars/prompts.py,sha256=_pQNd438mFpZKZT0aMl6Bd0_GgQxuy9y08kQAMPi9UM,2536
 agent/providers/uitars/tools/__init__.py,sha256=0hc3W6u5TvcXYztYKIyve_C2G3XMfwt_y7grmH0ZHC0,29
 agent/providers/uitars/tools/computer.py,sha256=TeIg_aCtMroxWOBJEiYY_YI4krW_C3pYu51tgGsVUYU,11808
 agent/providers/uitars/tools/manager.py,sha256=2dK9STtz6NuZG3i0nH7ZuHJpb7vKJ2mOVbxGsb0t8lQ,1945
-agent/providers/uitars/utils.py,sha256=…
+agent/providers/uitars/utils.py,sha256=S6FiZ3P-O4B15P1Gdup2o7SyuIu4nSQbspxcektpwmM,8870
 agent/telemetry.py,sha256=pVGxbj0ewnvq4EGj28CydN4a1iOfvZR_XKL3vIOqhOM,390
 agent/ui/__init__.py,sha256=ohhxJLBin6k1hl5sKcmBST8mgh23WXgAXz3pN4f470E,45
 agent/ui/gradio/__init__.py,sha256=ANKZhv1HqsLheWbLVBlyRQ7Q5qGeXuPi5jDs8vu-ZMo,579
-agent/ui/gradio/app.py,sha256=…
-cua_agent-0.1.31.dist-info/METADATA,sha256=…
-cua_agent-0.1.31.dist-info/WHEEL,sha256=…
-cua_agent-0.1.31.dist-info/entry_points.txt,sha256=…
+agent/ui/gradio/app.py,sha256=C38XbAAqBIsIfSh-3eF_IqHBEguRSfaMycNX8jRXTjE,41060
+cua_agent-0.1.32.dist-info/METADATA,sha256=UraY0Du9vO80XAWx7FRw9FyUig1WHIHawNOD0T7qHDE,11335
+cua_agent-0.1.32.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
+cua_agent-0.1.32.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
+cua_agent-0.1.32.dist-info/RECORD,,
{cua_agent-0.1.31.dist-info → cua_agent-0.1.32.dist-info}/WHEEL: file without changes
{cua_agent-0.1.31.dist-info → cua_agent-0.1.32.dist-info}/entry_points.txt: file without changes