cua-agent 0.4.23__tar.gz → 0.4.25__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic; see the registry advisory for more details.

Files changed (53)
  1. cua_agent-0.4.25/PKG-INFO +138 -0
  2. cua_agent-0.4.25/README.md +87 -0
  3. cua_agent-0.4.25/agent/callbacks/image_retention.py +90 -0
  4. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/callbacks/operator_validator.py +32 -32
  5. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/human_tool/ui.py +68 -10
  6. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/loops/anthropic.py +11 -12
  7. {cua_agent-0.4.23 → cua_agent-0.4.25}/pyproject.toml +1 -4
  8. cua_agent-0.4.23/PKG-INFO +0 -436
  9. cua_agent-0.4.23/README.md +0 -382
  10. cua_agent-0.4.23/agent/callbacks/image_retention.py +0 -139
  11. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/__init__.py +0 -0
  12. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/__main__.py +0 -0
  13. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/adapters/__init__.py +0 -0
  14. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/adapters/huggingfacelocal_adapter.py +0 -0
  15. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/adapters/human_adapter.py +0 -0
  16. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/adapters/mlxvlm_adapter.py +0 -0
  17. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/agent.py +0 -0
  18. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/callbacks/__init__.py +0 -0
  19. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/callbacks/base.py +0 -0
  20. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/callbacks/budget_manager.py +0 -0
  21. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/callbacks/logging.py +0 -0
  22. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/callbacks/pii_anonymization.py +0 -0
  23. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/callbacks/telemetry.py +0 -0
  24. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/callbacks/trajectory_saver.py +0 -0
  25. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/cli.py +0 -0
  26. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/computers/__init__.py +0 -0
  27. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/computers/base.py +0 -0
  28. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/computers/cua.py +0 -0
  29. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/computers/custom.py +0 -0
  30. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/decorators.py +0 -0
  31. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/human_tool/__init__.py +0 -0
  32. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/human_tool/__main__.py +0 -0
  33. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/human_tool/server.py +0 -0
  34. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/integrations/hud/__init__.py +0 -0
  35. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/integrations/hud/proxy.py +0 -0
  36. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/loops/__init__.py +0 -0
  37. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/loops/base.py +0 -0
  38. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/loops/composed_grounded.py +0 -0
  39. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/loops/glm45v.py +0 -0
  40. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/loops/gta1.py +0 -0
  41. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/loops/model_types.csv +0 -0
  42. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/loops/omniparser.py +0 -0
  43. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/loops/openai.py +0 -0
  44. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/loops/uitars.py +0 -0
  45. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/proxy/examples.py +0 -0
  46. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/proxy/handlers.py +0 -0
  47. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/responses.py +0 -0
  48. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/types.py +0 -0
  49. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/ui/__init__.py +0 -0
  50. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/ui/__main__.py +0 -0
  51. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/ui/gradio/__init__.py +0 -0
  52. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/ui/gradio/app.py +0 -0
  53. {cua_agent-0.4.23 → cua_agent-0.4.25}/agent/ui/gradio/ui_components.py +0 -0
@@ -0,0 +1,138 @@
1
+ Metadata-Version: 2.1
2
+ Name: cua-agent
3
+ Version: 0.4.25
4
+ Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
+ Author-Email: TryCua <gh@trycua.com>
6
+ Requires-Python: >=3.12
7
+ Requires-Dist: httpx>=0.27.0
8
+ Requires-Dist: aiohttp>=3.9.3
9
+ Requires-Dist: asyncio
10
+ Requires-Dist: anyio>=4.4.1
11
+ Requires-Dist: typing-extensions>=4.12.2
12
+ Requires-Dist: pydantic>=2.6.4
13
+ Requires-Dist: rich>=13.7.1
14
+ Requires-Dist: python-dotenv>=1.0.1
15
+ Requires-Dist: cua-computer<0.5.0,>=0.4.0
16
+ Requires-Dist: cua-core<0.2.0,>=0.1.8
17
+ Requires-Dist: certifi>=2024.2.2
18
+ Requires-Dist: litellm>=1.74.12
19
+ Provides-Extra: openai
20
+ Provides-Extra: anthropic
21
+ Provides-Extra: omni
22
+ Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "omni"
23
+ Provides-Extra: uitars
24
+ Provides-Extra: uitars-mlx
25
+ Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "uitars-mlx"
26
+ Provides-Extra: uitars-hf
27
+ Requires-Dist: accelerate; extra == "uitars-hf"
28
+ Requires-Dist: torch; extra == "uitars-hf"
29
+ Requires-Dist: transformers>=4.54.0; extra == "uitars-hf"
30
+ Provides-Extra: glm45v-hf
31
+ Requires-Dist: accelerate; extra == "glm45v-hf"
32
+ Requires-Dist: torch; extra == "glm45v-hf"
33
+ Requires-Dist: transformers-v4.55.0-GLM-4.5V-preview; extra == "glm45v-hf"
34
+ Provides-Extra: ui
35
+ Requires-Dist: gradio>=5.23.3; extra == "ui"
36
+ Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
37
+ Provides-Extra: cli
38
+ Requires-Dist: yaspin>=3.1.0; extra == "cli"
39
+ Provides-Extra: hud
40
+ Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "hud"
41
+ Provides-Extra: all
42
+ Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
43
+ Requires-Dist: accelerate; extra == "all"
44
+ Requires-Dist: torch; extra == "all"
45
+ Requires-Dist: transformers>=4.54.0; extra == "all"
46
+ Requires-Dist: gradio>=5.23.3; extra == "all"
47
+ Requires-Dist: python-dotenv>=1.0.1; extra == "all"
48
+ Requires-Dist: yaspin>=3.1.0; extra == "all"
49
+ Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "all"
50
+ Description-Content-Type: text/markdown
51
+
52
+ <div align="center">
53
+ <h1>
54
+ <div class="image-wrapper" style="display: inline-block;">
55
+ <picture>
56
+ <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_white.png" style="display: block; margin: auto;">
57
+ <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_black.png" style="display: block; margin: auto;">
58
+ <img alt="Shows my svg">
59
+ </picture>
60
+ </div>
61
+
62
+ [![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
63
+ [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
64
+ [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
65
+ [![PyPI](https://img.shields.io/pypi/v/cua-computer?color=333333)](https://pypi.org/project/cua-computer/)
66
+ </h1>
67
+ </div>
68
+
69
+ **cua-agent** is a general Computer-Use framework with liteLLM integration for running agentic workflows on macOS, Windows, and Linux sandboxes. It provides a unified interface for computer-use agents across multiple LLM providers with advanced callback system for extensibility.
70
+
71
+ ## Features
72
+
73
+ - **Safe Computer-Use/Tool-Use**: Using Computer SDK for sandboxed desktops
74
+ - **Multi-Agent Support**: Anthropic Claude, OpenAI computer-use-preview, UI-TARS, Omniparser + any LLM
75
+ - **Multi-API Support**: Take advantage of liteLLM supporting 100+ LLMs / model APIs, including local models (`huggingface-local/`, `ollama_chat/`, `mlx/`)
76
+ - **Cross-Platform**: Works on Windows, macOS, and Linux with cloud and local computer instances
77
+ - **Extensible Callbacks**: Built-in support for image retention, cache control, PII anonymization, budget limits, and trajectory tracking
78
+
79
+ ## Install
80
+
81
+ ```bash
82
+ pip install "cua-agent[all]"
83
+ ```
84
+
85
+ ## Quick Start
86
+
87
+ ```python
88
+ import asyncio
89
+ import os
90
+ from agent import ComputerAgent
91
+ from computer import Computer
92
+
93
+ async def main():
94
+ # Set up computer instance
95
+ async with Computer(
96
+ os_type="linux",
97
+ provider_type="cloud",
98
+ name=os.getenv("CUA_CONTAINER_NAME"),
99
+ api_key=os.getenv("CUA_API_KEY")
100
+ ) as computer:
101
+
102
+ # Create agent
103
+ agent = ComputerAgent(
104
+ model="anthropic/claude-3-5-sonnet-20241022",
105
+ tools=[computer],
106
+ only_n_most_recent_images=3,
107
+ trajectory_dir="trajectories",
108
+ max_trajectory_budget=5.0 # $5 budget limit
109
+ )
110
+
111
+ # Run agent
112
+ messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
113
+
114
+ async for result in agent.run(messages):
115
+ for item in result["output"]:
116
+ if item["type"] == "message":
117
+ print(item["content"][0]["text"])
118
+
119
+ if __name__ == "__main__":
120
+ asyncio.run(main())
121
+ ```
122
+
123
+ ## Docs
124
+
125
+ - [Agent Loops](https://trycua.com/docs/agent-sdk/agent-loops)
126
+ - [Supported Agents](https://trycua.com/docs/agent-sdk/supported-agents)
127
+ - [Supported Models](https://trycua.com/docs/agent-sdk/supported-models)
128
+ - [Chat History](https://trycua.com/docs/agent-sdk/chat-history)
129
+ - [Callbacks](https://trycua.com/docs/agent-sdk/callbacks)
130
+ - [Custom Tools](https://trycua.com/docs/agent-sdk/custom-tools)
131
+ - [Custom Computer Handlers](https://trycua.com/docs/agent-sdk/custom-computer-handlers)
132
+ - [Prompt Caching](https://trycua.com/docs/agent-sdk/prompt-caching)
133
+ - [Usage Tracking](https://trycua.com/docs/agent-sdk/usage-tracking)
134
+ - [Benchmarks](https://trycua.com/docs/agent-sdk/benchmarks)
135
+
136
+ ## License
137
+
138
+ MIT License - see LICENSE file for details.
@@ -0,0 +1,87 @@
1
+ <div align="center">
2
+ <h1>
3
+ <div class="image-wrapper" style="display: inline-block;">
4
+ <picture>
5
+ <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_white.png" style="display: block; margin: auto;">
6
+ <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_black.png" style="display: block; margin: auto;">
7
+ <img alt="Shows my svg">
8
+ </picture>
9
+ </div>
10
+
11
+ [![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
12
+ [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
13
+ [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
14
+ [![PyPI](https://img.shields.io/pypi/v/cua-computer?color=333333)](https://pypi.org/project/cua-computer/)
15
+ </h1>
16
+ </div>
17
+
18
+ **cua-agent** is a general Computer-Use framework with liteLLM integration for running agentic workflows on macOS, Windows, and Linux sandboxes. It provides a unified interface for computer-use agents across multiple LLM providers with advanced callback system for extensibility.
19
+
20
+ ## Features
21
+
22
+ - **Safe Computer-Use/Tool-Use**: Using Computer SDK for sandboxed desktops
23
+ - **Multi-Agent Support**: Anthropic Claude, OpenAI computer-use-preview, UI-TARS, Omniparser + any LLM
24
+ - **Multi-API Support**: Take advantage of liteLLM supporting 100+ LLMs / model APIs, including local models (`huggingface-local/`, `ollama_chat/`, `mlx/`)
25
+ - **Cross-Platform**: Works on Windows, macOS, and Linux with cloud and local computer instances
26
+ - **Extensible Callbacks**: Built-in support for image retention, cache control, PII anonymization, budget limits, and trajectory tracking
27
+
28
+ ## Install
29
+
30
+ ```bash
31
+ pip install "cua-agent[all]"
32
+ ```
33
+
34
+ ## Quick Start
35
+
36
+ ```python
37
+ import asyncio
38
+ import os
39
+ from agent import ComputerAgent
40
+ from computer import Computer
41
+
42
+ async def main():
43
+ # Set up computer instance
44
+ async with Computer(
45
+ os_type="linux",
46
+ provider_type="cloud",
47
+ name=os.getenv("CUA_CONTAINER_NAME"),
48
+ api_key=os.getenv("CUA_API_KEY")
49
+ ) as computer:
50
+
51
+ # Create agent
52
+ agent = ComputerAgent(
53
+ model="anthropic/claude-3-5-sonnet-20241022",
54
+ tools=[computer],
55
+ only_n_most_recent_images=3,
56
+ trajectory_dir="trajectories",
57
+ max_trajectory_budget=5.0 # $5 budget limit
58
+ )
59
+
60
+ # Run agent
61
+ messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
62
+
63
+ async for result in agent.run(messages):
64
+ for item in result["output"]:
65
+ if item["type"] == "message":
66
+ print(item["content"][0]["text"])
67
+
68
+ if __name__ == "__main__":
69
+ asyncio.run(main())
70
+ ```
71
+
72
+ ## Docs
73
+
74
+ - [Agent Loops](https://trycua.com/docs/agent-sdk/agent-loops)
75
+ - [Supported Agents](https://trycua.com/docs/agent-sdk/supported-agents)
76
+ - [Supported Models](https://trycua.com/docs/agent-sdk/supported-models)
77
+ - [Chat History](https://trycua.com/docs/agent-sdk/chat-history)
78
+ - [Callbacks](https://trycua.com/docs/agent-sdk/callbacks)
79
+ - [Custom Tools](https://trycua.com/docs/agent-sdk/custom-tools)
80
+ - [Custom Computer Handlers](https://trycua.com/docs/agent-sdk/custom-computer-handlers)
81
+ - [Prompt Caching](https://trycua.com/docs/agent-sdk/prompt-caching)
82
+ - [Usage Tracking](https://trycua.com/docs/agent-sdk/usage-tracking)
83
+ - [Benchmarks](https://trycua.com/docs/agent-sdk/benchmarks)
84
+
85
+ ## License
86
+
87
+ MIT License - see LICENSE file for details.
@@ -0,0 +1,90 @@
1
+ """
2
+ Image retention callback handler that limits the number of recent images in message history.
3
+ """
4
+
5
+ from typing import List, Dict, Any, Optional
6
+ from .base import AsyncCallbackHandler
7
+
8
+
9
+ class ImageRetentionCallback(AsyncCallbackHandler):
10
+ """
11
+ Callback handler that applies image retention policy to limit the number
12
+ of recent images in message history to prevent context window overflow.
13
+ """
14
+
15
+ def __init__(self, only_n_most_recent_images: Optional[int] = None):
16
+ """
17
+ Initialize the image retention callback.
18
+
19
+ Args:
20
+ only_n_most_recent_images: If set, only keep the N most recent images in message history
21
+ """
22
+ self.only_n_most_recent_images = only_n_most_recent_images
23
+
24
+ async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
25
+ """
26
+ Apply image retention policy to messages before sending to agent loop.
27
+
28
+ Args:
29
+ messages: List of message dictionaries
30
+
31
+ Returns:
32
+ List of messages with image retention policy applied
33
+ """
34
+ if self.only_n_most_recent_images is None:
35
+ return messages
36
+
37
+ return self._apply_image_retention(messages)
38
+
39
+ def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
40
+ """Apply image retention policy to keep only the N most recent images.
41
+
42
+ Removes computer_call_output items with image_url and their corresponding computer_call items,
43
+ keeping only the most recent N image pairs based on only_n_most_recent_images setting.
44
+
45
+ Args:
46
+ messages: List of message dictionaries
47
+
48
+ Returns:
49
+ Filtered list of messages with image retention applied
50
+ """
51
+ if self.only_n_most_recent_images is None:
52
+ return messages
53
+
54
+ # Gather indices of all computer_call_output messages that contain an image_url
55
+ output_indices: List[int] = []
56
+ for idx, msg in enumerate(messages):
57
+ if msg.get("type") == "computer_call_output":
58
+ out = msg.get("output")
59
+ if isinstance(out, dict) and ("image_url" in out):
60
+ output_indices.append(idx)
61
+
62
+ # Nothing to trim
63
+ if len(output_indices) <= self.only_n_most_recent_images:
64
+ return messages
65
+
66
+ # Determine which outputs to keep (most recent N)
67
+ keep_output_indices = set(output_indices[-self.only_n_most_recent_images :])
68
+
69
+ # Build set of indices to remove in one pass
70
+ to_remove: set[int] = set()
71
+
72
+ for idx in output_indices:
73
+ if idx in keep_output_indices:
74
+ continue # keep this screenshot and its context
75
+
76
+ to_remove.add(idx) # remove the computer_call_output itself
77
+
78
+ # Remove the immediately preceding computer_call with matching call_id (if present)
79
+ call_id = messages[idx].get("call_id")
80
+ prev_idx = idx - 1
81
+ if prev_idx >= 0 and messages[prev_idx].get("type") == "computer_call" and messages[prev_idx].get("call_id") == call_id:
82
+ to_remove.add(prev_idx)
83
+ # Check a single reasoning immediately before that computer_call
84
+ r_idx = prev_idx - 1
85
+ if r_idx >= 0 and messages[r_idx].get("type") == "reasoning":
86
+ to_remove.add(r_idx)
87
+
88
+ # Construct filtered list
89
+ filtered = [m for i, m in enumerate(messages) if i not in to_remove]
90
+ return filtered
@@ -102,37 +102,37 @@ class OperatorNormalizerCallback(AsyncCallbackHandler):
102
102
  _keep_keys(action, keep)
103
103
 
104
104
 
105
- # Second pass: if an assistant message is immediately followed by a computer_call,
106
- # replace the assistant message itself with a reasoning message with summary text.
107
- if isinstance(output, list):
108
- for i, item in enumerate(output):
109
- # AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] }
110
- if item.get("type") == "message" and item.get("role") == "assistant":
111
- next_idx = i + 1
112
- if next_idx >= len(output):
113
- continue
114
- next_item = output[next_idx]
115
- if not isinstance(next_item, dict):
116
- continue
117
- if next_item.get("type") != "computer_call":
118
- continue
119
- contents = item.get("content") or []
120
- # Extract text from OutputContent[]
121
- text_parts: List[str] = []
122
- if isinstance(contents, list):
123
- for c in contents:
124
- if isinstance(c, dict) and c.get("type") == "output_text" and isinstance(c.get("text"), str):
125
- text_parts.append(c["text"])
126
- text_content = "\n".join(text_parts).strip()
127
- # Replace assistant message with reasoning message
128
- output[i] = {
129
- "type": "reasoning",
130
- "summary": [
131
- {
132
- "type": "summary_text",
133
- "text": text_content,
134
- }
135
- ],
136
- }
105
+ # # Second pass: if an assistant message is immediately followed by a computer_call,
106
+ # # replace the assistant message itself with a reasoning message with summary text.
107
+ # if isinstance(output, list):
108
+ # for i, item in enumerate(output):
109
+ # # AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] }
110
+ # if item.get("type") == "message" and item.get("role") == "assistant":
111
+ # next_idx = i + 1
112
+ # if next_idx >= len(output):
113
+ # continue
114
+ # next_item = output[next_idx]
115
+ # if not isinstance(next_item, dict):
116
+ # continue
117
+ # if next_item.get("type") != "computer_call":
118
+ # continue
119
+ # contents = item.get("content") or []
120
+ # # Extract text from OutputContent[]
121
+ # text_parts: List[str] = []
122
+ # if isinstance(contents, list):
123
+ # for c in contents:
124
+ # if isinstance(c, dict) and c.get("type") == "output_text" and isinstance(c.get("text"), str):
125
+ # text_parts.append(c["text"])
126
+ # text_content = "\n".join(text_parts).strip()
127
+ # # Replace assistant message with reasoning message
128
+ # output[i] = {
129
+ # "type": "reasoning",
130
+ # "summary": [
131
+ # {
132
+ # "type": "summary_text",
133
+ # "text": text_content,
134
+ # }
135
+ # ],
136
+ # }
137
137
 
138
138
  return output
@@ -15,6 +15,11 @@ class HumanCompletionUI:
15
15
  self.current_call_id: Optional[str] = None
16
16
  self.refresh_interval = 2.0 # seconds
17
17
  self.last_image = None # Store the last image for display
18
+ # Track current interactive action controls
19
+ self.current_action_type: str = "click"
20
+ self.current_button: str = "left"
21
+ self.current_scroll_x: int = 0
22
+ self.current_scroll_y: int = -120
18
23
 
19
24
  def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
20
25
  """Format messages for display in gr.Chatbot with type='messages'."""
@@ -440,8 +445,8 @@ def create_ui():
440
445
  with gr.Group(visible=False) as click_actions_group:
441
446
  with gr.Row():
442
447
  action_type_radio = gr.Dropdown(
443
- label="Action",
444
- choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
448
+ label="Interactive Action",
449
+ choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down", "scroll"],
445
450
  value="click",
446
451
  scale=2
447
452
  )
@@ -452,6 +457,18 @@ def create_ui():
452
457
  visible=True,
453
458
  scale=1
454
459
  )
460
+ scroll_x_input = gr.Number(
461
+ label="scroll_x",
462
+ value=0,
463
+ visible=False,
464
+ scale=1
465
+ )
466
+ scroll_y_input = gr.Number(
467
+ label="scroll_y",
468
+ value=-120,
469
+ visible=False,
470
+ scale=1
471
+ )
455
472
 
456
473
  conversation_chatbot = gr.Chatbot(
457
474
  label="Conversation",
@@ -545,9 +562,15 @@ def create_ui():
545
562
  def handle_image_click(evt: gr.SelectData):
546
563
  if evt.index is not None:
547
564
  x, y = evt.index
548
- action_type = action_type_radio.value or "click"
549
- button = action_button_radio.value or "left"
550
- result = ui_handler.submit_click_action(x, y, action_type, button)
565
+ action_type = ui_handler.current_action_type or "click"
566
+ button = ui_handler.current_button or "left"
567
+ if action_type == "scroll":
568
+ sx_i = int(ui_handler.current_scroll_x or 0)
569
+ sy_i = int(ui_handler.current_scroll_y or 0)
570
+ # Submit a scroll action with x,y position and scroll deltas
571
+ result = ui_handler.submit_action("scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i)
572
+ else:
573
+ result = ui_handler.submit_click_action(x, y, action_type, button)
551
574
  ui_handler.wait_for_pending_calls()
552
575
  return result
553
576
  return "No coordinates selected"
@@ -570,14 +593,49 @@ def create_ui():
570
593
  outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
571
594
  )
572
595
 
573
- # Toggle button radio visibility based on action type
574
- def toggle_button_visibility(action_type):
575
- return gr.update(visible=(action_type == "click"))
596
+ # Toggle visibility of controls based on action type
597
+ def toggle_action_controls(action_type):
598
+ # Button visible only for click
599
+ button_vis = gr.update(visible=(action_type == "click"))
600
+ # Scroll inputs visible only for scroll
601
+ scroll_x_vis = gr.update(visible=(action_type == "scroll"))
602
+ scroll_y_vis = gr.update(visible=(action_type == "scroll"))
603
+ # Update state
604
+ ui_handler.current_action_type = action_type or "click"
605
+ return button_vis, scroll_x_vis, scroll_y_vis
576
606
 
577
607
  action_type_radio.change(
578
- fn=toggle_button_visibility,
608
+ fn=toggle_action_controls,
579
609
  inputs=[action_type_radio],
580
- outputs=[action_button_radio]
610
+ outputs=[action_button_radio, scroll_x_input, scroll_y_input]
611
+ )
612
+
613
+ # Keep other control values in ui_handler state
614
+ def on_button_change(val):
615
+ ui_handler.current_button = (val or "left")
616
+ action_button_radio.change(
617
+ fn=on_button_change,
618
+ inputs=[action_button_radio]
619
+ )
620
+
621
+ def on_scroll_x_change(val):
622
+ try:
623
+ ui_handler.current_scroll_x = int(val) if val is not None else 0
624
+ except Exception:
625
+ ui_handler.current_scroll_x = 0
626
+ scroll_x_input.change(
627
+ fn=on_scroll_x_change,
628
+ inputs=[scroll_x_input]
629
+ )
630
+
631
+ def on_scroll_y_change(val):
632
+ try:
633
+ ui_handler.current_scroll_y = int(val) if val is not None else 0
634
+ except Exception:
635
+ ui_handler.current_scroll_y = 0
636
+ scroll_y_input.change(
637
+ fn=on_scroll_y_change,
638
+ inputs=[scroll_y_input]
581
639
  )
582
640
 
583
641
  type_submit_btn.click(
@@ -132,23 +132,22 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
132
132
  converted_content = []
133
133
  for item in content:
134
134
  if isinstance(item, dict) and item.get("type") == "input_image":
135
- # Convert input_image to Anthropic image format
135
+ # Convert input_image to OpenAI image format
136
136
  image_url = item.get("image_url", "")
137
137
  if image_url and image_url != "[omitted]":
138
- # Extract base64 data from data URL
139
- if "," in image_url:
140
- base64_data = image_url.split(",")[-1]
141
- else:
142
- base64_data = image_url
143
-
144
138
  converted_content.append({
145
- "type": "image",
146
- "source": {
147
- "type": "base64",
148
- "media_type": "image/png",
149
- "data": base64_data
139
+ "type": "image_url",
140
+ "image_url": {
141
+ "url": image_url
150
142
  }
151
143
  })
144
+ elif isinstance(item, dict) and item.get("type") == "input_text":
145
+ # Convert input_text to OpenAI text format
146
+ text = item.get("text", "")
147
+ converted_content.append({
148
+ "type": "text",
149
+ "text": text
150
+ })
152
151
  else:
153
152
  # Keep other content types as-is
154
153
  converted_content.append(item)
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
6
6
 
7
7
  [project]
8
8
  name = "cua-agent"
9
- version = "0.4.23"
9
+ version = "0.4.25"
10
10
  description = "CUA (Computer Use) Agent for AI-driven computer interaction"
11
11
  readme = "README.md"
12
12
  authors = [
@@ -32,7 +32,6 @@ requires-python = ">=3.12"
32
32
  openai = []
33
33
  anthropic = []
34
34
  omni = [
35
- "ultralytics>=8.0.0",
36
35
  "cua-som>=0.1.0,<0.2.0",
37
36
  ]
38
37
  uitars = []
@@ -60,8 +59,6 @@ hud = [
60
59
  "hud-python>=0.4.12,<0.5.0",
61
60
  ]
62
61
  all = [
63
- "ultralytics>=8.0.0",
64
- "cua-som>=0.1.0,<0.2.0",
65
62
  "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
66
63
  "accelerate",
67
64
  "torch",