cua-agent 0.4.6__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic.

agent/loops/uitars.py CHANGED
@@ -1,5 +1,7 @@
  """
  UITARS agent loop implementation using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B
+ Paper: https://arxiv.org/abs/2501.12326
+ Code: https://github.com/bytedance/UI-TARS
  """

  import asyncio
@@ -9,7 +11,7 @@ import base64
  import math
  import re
  import ast
- from typing import Dict, List, Any, AsyncGenerator, Union, Optional
+ from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
  from io import BytesIO
  from PIL import Image
  import litellm
@@ -21,8 +23,8 @@ from openai.types.responses.response_input_param import ComputerCallOutput
  from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
  from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary

- from ..decorators import agent_loop
- from ..types import Messages, AgentResponse, Tools
+ from ..decorators import register_agent
+ from ..types import Messages, AgentResponse, Tools, AgentCapability
  from ..responses import (
      make_reasoning_item,
      make_output_text_item,
@@ -79,6 +81,18 @@ Action: ...
  {instruction}
  """

+ GROUNDING_UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+
+ ## Output Format
+
+ Action: ...
+
+
+ ## Action Space
+ click(point='<|box_start|>(x1,y1)<|box_end|>')
+
+ ## User Instruction
+ {instruction}"""

  def round_by_factor(number: float, factor: int) -> int:
      """Returns the closest integer to 'number' that is divisible by 'factor'."""
@@ -501,188 +515,301 @@ def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any

      return litellm_messages

- @agent_loop(models=r"(?i).*ui-?tars.*", priority=10)
- async def uitars_loop(
-     messages: Messages,
-     model: str,
-     tools: Optional[List[Dict[str, Any]]] = None,
-     max_retries: Optional[int] = None,
-     stream: bool = False,
-     computer_handler=None,
-     use_prompt_caching: Optional[bool] = False,
-     _on_api_start=None,
-     _on_api_end=None,
-     _on_usage=None,
-     _on_screenshot=None,
-     **kwargs
- ) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
+ @register_agent(models=r"(?i).*ui-?tars.*")
+ class UITARSConfig:
      """
-     UITARS agent loop using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.
+     UITARS agent configuration using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.

      Supports UITARS vision-language models for computer control.
      """
-     tools = tools or []
-
-     # Create response items
-     response_items = []
-
-     # Find computer tool for screen dimensions
-     computer_tool = None
-     for tool_schema in tools:
-         if tool_schema["type"] == "computer":
-             computer_tool = tool_schema["computer"]
-             break

-     # Get screen dimensions
-     screen_width, screen_height = 1024, 768
-     if computer_tool:
-         try:
-             screen_width, screen_height = await computer_tool.get_dimensions()
-         except:
-             pass
-
-     # Process messages to extract instruction and image
-     instruction = ""
-     image_data = None
-
-     # Convert messages to list if string
-     if isinstance(messages, str):
-         messages = [{"role": "user", "content": messages}]
-
-     # Extract instruction and latest screenshot
-     for message in reversed(messages):
-         if isinstance(message, dict):
-             content = message.get("content", "")
+     async def predict_step(
+         self,
+         messages: List[Dict[str, Any]],
+         model: str,
+         tools: Optional[List[Dict[str, Any]]] = None,
+         max_retries: Optional[int] = None,
+         stream: bool = False,
+         computer_handler=None,
+         use_prompt_caching: Optional[bool] = False,
+         _on_api_start=None,
+         _on_api_end=None,
+         _on_usage=None,
+         _on_screenshot=None,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Predict the next step based on input messages.
+
+         Args:
+             messages: Input messages following Responses format
+             model: Model name to use
+             tools: Optional list of tool schemas
+             max_retries: Maximum number of retries
+             stream: Whether to stream responses
+             computer_handler: Computer handler instance
+             _on_api_start: Callback for API start
+             _on_api_end: Callback for API end
+             _on_usage: Callback for usage tracking
+             _on_screenshot: Callback for screenshot events
+             **kwargs: Additional arguments

-             # Handle different content formats
-             if isinstance(content, str):
-                 if not instruction and message.get("role") == "user":
-                     instruction = content
-             elif isinstance(content, list):
-                 for item in content:
-                     if isinstance(item, dict):
-                         if item.get("type") == "text" and not instruction:
-                             instruction = item.get("text", "")
-                         elif item.get("type") == "image_url" and not image_data:
-                             image_url = item.get("image_url", {})
-                             if isinstance(image_url, dict):
-                                 image_data = image_url.get("url", "")
-                             else:
-                                 image_data = image_url
-
-             # Also check for computer_call_output with screenshots
-             if message.get("type") == "computer_call_output" and not image_data:
-                 output = message.get("output", {})
-                 if isinstance(output, dict) and output.get("type") == "input_image":
-                     image_data = output.get("image_url", "")
-
-             if instruction and image_data:
-                 break
-
-     if not instruction:
-         instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
-
-     # Create prompt
-     user_prompt = UITARS_PROMPT_TEMPLATE.format(
-         instruction=instruction,
-         action_space=UITARS_ACTION_SPACE,
-         language="English"
-     )
-
-     # Convert conversation history to LiteLLM format
-     history_messages = convert_uitars_messages_to_litellm(messages)
-
-     # Prepare messages for liteLLM
-     litellm_messages = [
-         {
-             "role": "system",
-             "content": "You are a helpful assistant."
-         }
-     ]
-
-     # Add current user instruction with screenshot
-     current_user_message = {
-         "role": "user",
-         "content": [
-             {"type": "text", "text": user_prompt},
+         Returns:
+             Dictionary with "output" (output items) and "usage" array
+         """
+         tools = tools or []
+
+         # Create response items
+         response_items = []
+
+         # Find computer tool for screen dimensions
+         computer_tool = None
+         for tool_schema in tools:
+             if tool_schema["type"] == "computer":
+                 computer_tool = tool_schema["computer"]
+                 break
+
+         # Get screen dimensions
+         screen_width, screen_height = 1024, 768
+         if computer_tool:
+             try:
+                 screen_width, screen_height = await computer_tool.get_dimensions()
+             except:
+                 pass
+
+         # Process messages to extract instruction and image
+         instruction = ""
+         image_data = None
+
+         # Convert messages to list if string
+         if isinstance(messages, str):
+             messages = [{"role": "user", "content": messages}]
+
+         # Extract instruction and latest screenshot
+         for message in reversed(messages):
+             if isinstance(message, dict):
+                 content = message.get("content", "")
+
+                 # Handle different content formats
+                 if isinstance(content, str):
+                     if not instruction and message.get("role") == "user":
+                         instruction = content
+                 elif isinstance(content, list):
+                     for item in content:
+                         if isinstance(item, dict):
+                             if item.get("type") == "text" and not instruction:
+                                 instruction = item.get("text", "")
+                             elif item.get("type") == "image_url" and not image_data:
+                                 image_url = item.get("image_url", {})
+                                 if isinstance(image_url, dict):
+                                     image_data = image_url.get("url", "")
+                                 else:
+                                     image_data = image_url
+
+                 # Also check for computer_call_output with screenshots
+                 if message.get("type") == "computer_call_output" and not image_data:
+                     output = message.get("output", {})
+                     if isinstance(output, dict) and output.get("type") == "input_image":
+                         image_data = output.get("image_url", "")
+
+                 if instruction and image_data:
+                     break
+
+         if not instruction:
+             instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
+
+         # Create prompt
+         user_prompt = UITARS_PROMPT_TEMPLATE.format(
+             instruction=instruction,
+             action_space=UITARS_ACTION_SPACE,
+             language="English"
+         )
+
+         # Convert conversation history to LiteLLM format
+         history_messages = convert_uitars_messages_to_litellm(messages)
+
+         # Prepare messages for liteLLM
+         litellm_messages = [
+             {
+                 "role": "system",
+                 "content": "You are a helpful assistant."
+             }
          ]
-     }
-     litellm_messages.append(current_user_message)
-
-     # Process image for UITARS
-     if not image_data:
-         # Take screenshot if none found in messages
-         if computer_handler:
-             image_data = await computer_handler.screenshot()
-             await _on_screenshot(image_data, "screenshot_before")
-
-             # Add screenshot to output items so it can be retained in history
-             response_items.append(make_input_image_item(image_data))
-         else:
-             raise ValueError("No screenshot found in messages and no computer_handler provided")
-     processed_image, original_width, original_height = process_image_for_uitars(image_data)
-     encoded_image = pil_to_base64(processed_image)
-
-     # Add conversation history
-     if history_messages:
-         litellm_messages.extend(history_messages)
-     else:
-         litellm_messages.append({
-             "role": "user",
+
+         # Add current user instruction with screenshot
+         current_user_message = {
+             "role": "user",
              "content": [
-                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
+                 {"type": "text", "text": user_prompt},
              ]
-         })
+         }
+         litellm_messages.append(current_user_message)
+
+         # Process image for UITARS
+         if not image_data:
+             # Take screenshot if none found in messages
+             if computer_handler:
+                 image_data = await computer_handler.screenshot()
+                 await _on_screenshot(image_data, "screenshot_before")
+
+                 # Add screenshot to output items so it can be retained in history
+                 response_items.append(make_input_image_item(image_data))
+             else:
+                 raise ValueError("No screenshot found in messages and no computer_handler provided")
+         processed_image, original_width, original_height = process_image_for_uitars(image_data)
+         encoded_image = pil_to_base64(processed_image)
+
+         # Add conversation history
+         if history_messages:
+             litellm_messages.extend(history_messages)
+         else:
+             litellm_messages.append({
+                 "role": "user",
+                 "content": [
+                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
+                 ]
+             })
+
+         # Prepare API call kwargs
+         api_kwargs = {
+             "model": model,
+             "messages": litellm_messages,
+             "max_tokens": kwargs.get("max_tokens", 500),
+             "temperature": kwargs.get("temperature", 0.0),
+             "do_sample": kwargs.get("temperature", 0.0) > 0.0,
+             "num_retries": max_retries,
+             **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
+         }
+
+         # Call API start hook
+         if _on_api_start:
+             await _on_api_start(api_kwargs)
+
+         # Call liteLLM with UITARS model
+         response = await litellm.acompletion(**api_kwargs)
+
+         # Call API end hook
+         if _on_api_end:
+             await _on_api_end(api_kwargs, response)
+
+         # Extract response content
+         response_content = response.choices[0].message.content.strip()  # type: ignore
+
+         # Parse UITARS response
+         parsed_responses = parse_uitars_response(response_content, original_width, original_height)
+
+         # Convert to computer actions
+         computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
+
+         # Add computer actions to response items
+         thought = parsed_responses[0].get("thought", "")
+         if thought:
+             response_items.append(make_reasoning_item(thought))
+         response_items.extend(computer_actions)
+
+         # Extract usage information
+         response_usage = {
+             **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
+             "response_cost": response._hidden_params.get("response_cost", 0.0),
+         }
+         if _on_usage:
+             await _on_usage(response_usage)

-     # Prepare API call kwargs
-     api_kwargs = {
-         "model": model,
-         "messages": litellm_messages,
-         "max_tokens": kwargs.get("max_tokens", 500),
-         "temperature": kwargs.get("temperature", 0.0),
-         "do_sample": kwargs.get("temperature", 0.0) > 0.0,
-         "num_retries": max_retries,
-         **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
-     }
-
-     # Call API start hook
-     if _on_api_start:
-         await _on_api_start(api_kwargs)
-
-     # Call liteLLM with UITARS model
-     response = await litellm.acompletion(**api_kwargs)
-
-     # Call API end hook
-     if _on_api_end:
-         await _on_api_end(api_kwargs, response)
-
-     # Extract response content
-     response_content = response.choices[0].message.content.strip()  # type: ignore
-
-     # Parse UITARS response
-     parsed_responses = parse_uitars_response(response_content, original_width, original_height)
-
-     # Convert to computer actions
-     computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
-
-     # Add computer actions to response items
-     thought = parsed_responses[0].get("thought", "")
-     if thought:
-         response_items.append(make_reasoning_item(thought))
-     response_items.extend(computer_actions)
-
-     # Extract usage information
-     response_usage = {
-         **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
-         "response_cost": response._hidden_params.get("response_cost", 0.0),
-     }
-     if _on_usage:
-         await _on_usage(response_usage)
-
-     # Create agent response
-     agent_response = {
-         "output": response_items,
-         "usage": response_usage
-     }
-
-     return agent_response
+         # Create agent response
+         agent_response = {
+             "output": response_items,
+             "usage": response_usage
+         }
+
+         return agent_response
+
+     async def predict_click(
+         self,
+         model: str,
+         image_b64: str,
+         instruction: str
+     ) -> Optional[Tuple[int, int]]:
+         """
+         Predict click coordinates based on image and instruction.
+
+         UITARS supports click prediction through its action parsing.
+
+         Args:
+             model: Model name to use
+             image_b64: Base64 encoded image
+             instruction: Instruction for where to click
+
+         Returns:
+             Tuple with (x, y) coordinates or None
+         """
+         try:
+             # Create prompt using grounding template
+             user_prompt = GROUNDING_UITARS_PROMPT_TEMPLATE.format(
+                 instruction=instruction
+             )
+
+             # Process image for UITARS
+             processed_image, original_width, original_height = process_image_for_uitars(image_b64)
+             encoded_image = pil_to_base64(processed_image)
+
+             # Prepare messages for liteLLM
+             litellm_messages = [
+                 {
+                     "role": "system",
+                     "content": "You are a helpful assistant."
+                 },
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": user_prompt},
+                         {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
+                     ]
+                 }
+             ]
+
+             # Prepare API call kwargs
+             api_kwargs = {
+                 "model": model,
+                 "messages": litellm_messages,
+                 "max_tokens": 100,
+                 "temperature": 0.0,
+                 "do_sample": False
+             }
+
+             # Call liteLLM with UITARS model
+             response = await litellm.acompletion(**api_kwargs)
+
+             # Extract response content
+             response_content = response.choices[0].message.content.strip()  # type: ignore
+
+             # Parse the response to extract click coordinates
+             # Look for click action with coordinates
+             click_pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
+             match = re.search(click_pattern, response_content)
+
+             if match:
+                 x, y = int(match.group(1)), int(match.group(2))
+                 # Scale coordinates back to original image dimensions
+                 scale_x = original_width / processed_image.width
+                 scale_y = original_height / processed_image.height
+
+                 scaled_x = int(x * scale_x)
+                 scaled_y = int(y * scale_y)
+
+                 return (scaled_x, scaled_y)
+
+             return None
+
+         except Exception as e:
+             # Log error and return None
+             print(f"Error in predict_click: {e}")
+             return None
+
+     def get_capabilities(self) -> List[AgentCapability]:
+         """
+         Get list of capabilities supported by this agent config.
+
+         Returns:
+             List of capability strings
+         """
+         return ["step", "click"]
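For orientation, the new predict_click method asks the model for a click(point='<|box_start|>(x,y)<|box_end|>') action and rescales the parsed point from the resized image back to the original screenshot. Below is a minimal standalone sketch of that parsing and rescaling step; the parse_click helper, the sample output string, and the dimensions are illustrative only and not part of the package.

import re
from typing import Optional, Tuple


def parse_click(response: str,
                processed_w: int, processed_h: int,
                original_w: int, original_h: int) -> Optional[Tuple[int, int]]:
    """Extract (x, y) from a UI-TARS click action and rescale it to the
    original screenshot resolution, mirroring the logic in predict_click."""
    pattern = r"click\(point='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)"
    match = re.search(pattern, response)
    if not match:
        return None
    x, y = int(match.group(1)), int(match.group(2))
    # Coordinates are reported against the resized image, so map them back.
    return int(x * original_w / processed_w), int(y * original_h / processed_h)


# Hypothetical model output; the real text depends on the prompt and screenshot.
sample = "Action: click(point='<|box_start|>(512,384)<|box_end|>')"
print(parse_click(sample, 1024, 768, 1920, 1080))  # -> (960, 540)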