cua-agent 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic.
- agent/__init__.py +2 -2
- agent/adapters/huggingfacelocal_adapter.py +5 -1
- agent/agent.py +82 -15
- agent/cli.py +9 -3
- agent/computer_handler.py +3 -1
- agent/decorators.py +28 -66
- agent/loops/__init__.py +3 -1
- agent/loops/anthropic.py +200 -84
- agent/loops/base.py +76 -0
- agent/loops/composed_grounded.py +318 -0
- agent/loops/gta1.py +178 -0
- agent/loops/model_types.csv +6 -0
- agent/loops/omniparser.py +178 -84
- agent/loops/openai.py +198 -58
- agent/loops/uitars.py +305 -178
- agent/responses.py +477 -1
- agent/types.py +7 -5
- {cua_agent-0.4.7.dist-info → cua_agent-0.4.8.dist-info}/METADATA +2 -2
- cua_agent-0.4.8.dist-info/RECORD +37 -0
- cua_agent-0.4.7.dist-info/RECORD +0 -33
- {cua_agent-0.4.7.dist-info → cua_agent-0.4.8.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.7.dist-info → cua_agent-0.4.8.dist-info}/entry_points.txt +0 -0
agent/loops/uitars.py
CHANGED
@@ -1,5 +1,7 @@
 """
 UITARS agent loop implementation using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B
+Paper: https://arxiv.org/abs/2501.12326
+Code: https://github.com/bytedance/UI-TARS
 """

 import asyncio
@@ -9,7 +11,7 @@ import base64
 import math
 import re
 import ast
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional
+from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
 from io import BytesIO
 from PIL import Image
 import litellm
@@ -21,8 +23,8 @@ from openai.types.responses.response_input_param import ComputerCallOutput
 from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
 from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary

-from ..decorators import
-from ..types import Messages, AgentResponse, Tools
+from ..decorators import register_agent
+from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..responses import (
     make_reasoning_item,
     make_output_text_item,
@@ -79,6 +81,18 @@ Action: ...
 {instruction}
 """

+GROUNDING_UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+
+## Output Format
+
+Action: ...
+
+
+## Action Space
+click(point='<|box_start|>(x1,y1)<|box_end|>')
+
+## User Instruction
+{instruction}"""

 def round_by_factor(number: float, factor: int) -> int:
     """Returns the closest integer to 'number' that is divisible by 'factor'."""
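The new GROUNDING_UITARS_PROMPT_TEMPLATE above is consumed by the predict_click method added later in this diff, which pulls coordinates out of the model reply with a regex. As a rough sketch of that round trip (the abbreviated template body and the sample model reply below are illustrative, not taken from the package):

import re

# Abbreviated stand-in for GROUNDING_UITARS_PROMPT_TEMPLATE; the full text is in the hunk above.
template = (
    "## Action Space\n"
    "click(point='<|box_start|>(x1,y1)<|box_end|>')\n\n"
    "## User Instruction\n"
    "{instruction}"
)
prompt = template.format(instruction="Click the Submit button")

# Hypothetical model reply in the requested output format.
reply = "Action: click(point='<|box_start|>(412,305)<|box_end|>')"

# Same pattern that predict_click (further down in this diff) uses to extract coordinates.
click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
match = re.search(click_pattern, reply)
if match:
    x, y = int(match.group(1)), int(match.group(2))
    print((x, y))  # (412, 305), still in resized-image coordinates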
@@ -501,188 +515,301 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any

     return litellm_messages

-@
-[old line 505 not rendered in the diff view]
-    messages: Messages,
-    model: str,
-    tools: Optional[List[Dict[str, Any]]] = None,
-    max_retries: Optional[int] = None,
-    stream: bool = False,
-    computer_handler=None,
-    use_prompt_caching: Optional[bool] = False,
-    _on_api_start=None,
-    _on_api_end=None,
-    _on_usage=None,
-    _on_screenshot=None,
-    **kwargs
-) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+@register_agent(models=r"(?i).*ui-?tars.*")
+class UITARSConfig:
     """
-    UITARS agent
+    UITARS agent configuration using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.

     Supports UITARS vision-language models for computer control.
     """
-    tools = tools or []
-
-    # Create response items
-    response_items = []
-
-    # Find computer tool for screen dimensions
-    computer_tool = None
-    for tool_schema in tools:
-        if tool_schema["type"] == "computer":
-            computer_tool = tool_schema["computer"]
-            break

-[old lines 536-555 not rendered in the diff view]
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        use_prompt_caching: Optional[bool] = False,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Predict the next step based on input messages.
+
+        Args:
+            messages: Input messages following Responses format
+            model: Model name to use
+            tools: Optional list of tool schemas
+            max_retries: Maximum number of retries
+            stream: Whether to stream responses
+            computer_handler: Computer handler instance
+            _on_api_start: Callback for API start
+            _on_api_end: Callback for API end
+            _on_usage: Callback for usage tracking
+            _on_screenshot: Callback for screenshot events
+            **kwargs: Additional arguments

-[old lines 557-573 not rendered in the diff view]
-if
-[old lines 575-607 not rendered in the diff view]
+        Returns:
+            Dictionary with "output" (output items) and "usage" array
+        """
+        tools = tools or []
+
+        # Create response items
+        response_items = []
+
+        # Find computer tool for screen dimensions
+        computer_tool = None
+        for tool_schema in tools:
+            if tool_schema["type"] == "computer":
+                computer_tool = tool_schema["computer"]
+                break
+
+        # Get screen dimensions
+        screen_width, screen_height = 1024, 768
+        if computer_tool:
+            try:
+                screen_width, screen_height = await computer_tool.get_dimensions()
+            except:
+                pass
+
+        # Process messages to extract instruction and image
+        instruction = ""
+        image_data = None
+
+        # Convert messages to list if string
+        if isinstance(messages, str):
+            messages = [{"role": "user", "content": messages}]
+
+        # Extract instruction and latest screenshot
+        for message in reversed(messages):
+            if isinstance(message, dict):
+                content = message.get("content", "")
+
+                # Handle different content formats
+                if isinstance(content, str):
+                    if not instruction and message.get("role") == "user":
+                        instruction = content
+                elif isinstance(content, list):
+                    for item in content:
+                        if isinstance(item, dict):
+                            if item.get("type") == "text" and not instruction:
+                                instruction = item.get("text", "")
+                            elif item.get("type") == "image_url" and not image_data:
+                                image_url = item.get("image_url", {})
+                                if isinstance(image_url, dict):
+                                    image_data = image_url.get("url", "")
+                                else:
+                                    image_data = image_url
+
+                # Also check for computer_call_output with screenshots
+                if message.get("type") == "computer_call_output" and not image_data:
+                    output = message.get("output", {})
+                    if isinstance(output, dict) and output.get("type") == "input_image":
+                        image_data = output.get("image_url", "")
+
+                if instruction and image_data:
+                    break
+
+        if not instruction:
+            instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
+
+        # Create prompt
+        user_prompt = UITARS_PROMPT_TEMPLATE.format(
+            instruction=instruction,
+            action_space=UITARS_ACTION_SPACE,
+            language="English"
+        )
+
+        # Convert conversation history to LiteLLM format
+        history_messages = convert_uitars_messages_to_litellm(messages)
+
+        # Prepare messages for liteLLM
+        litellm_messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful assistant."
+            }
         ]
-[old lines 609-612 not rendered in the diff view]
-    if not image_data:
-        # Take screenshot if none found in messages
-        if computer_handler:
-            image_data = await computer_handler.screenshot()
-            await _on_screenshot(image_data, "screenshot_before")
-
-            # Add screenshot to output items so it can be retained in history
-            response_items.append(make_input_image_item(image_data))
-        else:
-            raise ValueError("No screenshot found in messages and no computer_handler provided")
-    processed_image, original_width, original_height = process_image_for_uitars(image_data)
-    encoded_image = pil_to_base64(processed_image)
-
-    # Add conversation history
-    if history_messages:
-        litellm_messages.extend(history_messages)
-    else:
-        litellm_messages.append({
-            "role": "user",
+
+        # Add current user instruction with screenshot
+        current_user_message = {
+            "role": "user",
             "content": [
-                {"type": "
+                {"type": "text", "text": user_prompt},
             ]
-        }
+        }
+        litellm_messages.append(current_user_message)
+
+        # Process image for UITARS
+        if not image_data:
+            # Take screenshot if none found in messages
+            if computer_handler:
+                image_data = await computer_handler.screenshot()
+                await _on_screenshot(image_data, "screenshot_before")
+
+                # Add screenshot to output items so it can be retained in history
+                response_items.append(make_input_image_item(image_data))
+            else:
+                raise ValueError("No screenshot found in messages and no computer_handler provided")
+        processed_image, original_width, original_height = process_image_for_uitars(image_data)
+        encoded_image = pil_to_base64(processed_image)
+
+        # Add conversation history
+        if history_messages:
+            litellm_messages.extend(history_messages)
+        else:
+            litellm_messages.append({
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
+                ]
+            })
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": litellm_messages,
+            "max_tokens": kwargs.get("max_tokens", 500),
+            "temperature": kwargs.get("temperature", 0.0),
+            "do_sample": kwargs.get("temperature", 0.0) > 0.0,
+            "num_retries": max_retries,
+            **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
+        }
+
+        # Call API start hook
+        if _on_api_start:
+            await _on_api_start(api_kwargs)
+
+        # Call liteLLM with UITARS model
+        response = await litellm.acompletion(**api_kwargs)
+
+        # Call API end hook
+        if _on_api_end:
+            await _on_api_end(api_kwargs, response)
+
+        # Extract response content
+        response_content = response.choices[0].message.content.strip()  # type: ignore
+
+        # Parse UITARS response
+        parsed_responses = parse_uitars_response(response_content, original_width, original_height)
+
+        # Convert to computer actions
+        computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
+
+        # Add computer actions to response items
+        thought = parsed_responses[0].get("thought", "")
+        if thought:
+            response_items.append(make_reasoning_item(thought))
+        response_items.extend(computer_actions)
+
+        # Extract usage information
+        response_usage = {
+            **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+            "response_cost": response._hidden_params.get("response_cost", 0.0),
+        }
+        if _on_usage:
+            await _on_usage(response_usage)

-[old lines 637-688 not rendered in the diff view]
+        # Create agent response
+        agent_response = {
+            "output": response_items,
+            "usage": response_usage
+        }
+
+        return agent_response
+
+    async def predict_click(
+        self,
+        model: str,
+        image_b64: str,
+        instruction: str
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates based on image and instruction.
+
+        UITARS supports click prediction through its action parsing.
+
+        Args:
+            model: Model name to use
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            Tuple with (x, y) coordinates or None
+        """
+        try:
+            # Create prompt using grounding template
+            user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(
+                instruction=instruction
+            )
+
+            # Process image for UITARS
+            processed_image, original_width, original_height = process_image_for_uitars(image_b64)
+            encoded_image = pil_to_base64(processed_image)
+
+            # Prepare messages for liteLLM
+            litellm_messages = [
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": user_prompt},
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
+                    ]
+                }
+            ]
+
+            # Prepare API call kwargs
+            api_kwargs = {
+                "model": model,
+                "messages": litellm_messages,
+                "max_tokens": 100,
+                "temperature": 0.0,
+                "do_sample": False
+            }
+
+            # Call liteLLM with UITARS model
+            response = await litellm.acompletion(**api_kwargs)
+
+            # Extract response content
+            response_content = response.choices[0].message.content.strip()  # type: ignore
+
+            # Parse the response to extract click coordinates
+            # Look for click action with coordinates
+            click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
+            match = re.search(click_pattern, response_content)
+
+            if match:
+                x, y = int(match.group(1)), int(match.group(2))
+                # Scale coordinates back to original image dimensions
+                scale_x = original_width / processed_image.width
+                scale_y = original_height / processed_image.height
+
+                scaled_x = int(x * scale_x)
+                scaled_y = int(y * scale_y)
+
+                return (scaled_x, scaled_y)
+
+            return None
+
+        except Exception as e:
+            # Log error and return None
+            print(f"Error in predict_click: {e}")
+            return None
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """
+        Get list of capabilities supported by this agent config.
+
+        Returns:
+            List of capability strings
+        """
+        return ["step", "click"]