cua-agent 0.4.24__py3-none-any.whl → 0.4.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. See the release details below for more information.

agent/human_tool/ui.py CHANGED
@@ -15,6 +15,11 @@ class HumanCompletionUI:
15
15
  self.current_call_id: Optional[str] = None
16
16
  self.refresh_interval = 2.0 # seconds
17
17
  self.last_image = None # Store the last image for display
18
+ # Track current interactive action controls
19
+ self.current_action_type: str = "click"
20
+ self.current_button: str = "left"
21
+ self.current_scroll_x: int = 0
22
+ self.current_scroll_y: int = -120
18
23
 
19
24
  def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
20
25
  """Format messages for display in gr.Chatbot with type='messages'."""
@@ -440,8 +445,8 @@ def create_ui():
440
445
  with gr.Group(visible=False) as click_actions_group:
441
446
  with gr.Row():
442
447
  action_type_radio = gr.Dropdown(
443
- label="Action",
444
- choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
448
+ label="Interactive Action",
449
+ choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down", "scroll"],
445
450
  value="click",
446
451
  scale=2
447
452
  )
@@ -452,6 +457,18 @@ def create_ui():
452
457
  visible=True,
453
458
  scale=1
454
459
  )
460
+ scroll_x_input = gr.Number(
461
+ label="scroll_x",
462
+ value=0,
463
+ visible=False,
464
+ scale=1
465
+ )
466
+ scroll_y_input = gr.Number(
467
+ label="scroll_y",
468
+ value=-120,
469
+ visible=False,
470
+ scale=1
471
+ )
455
472
 
456
473
  conversation_chatbot = gr.Chatbot(
457
474
  label="Conversation",
@@ -545,9 +562,15 @@ def create_ui():
545
562
  def handle_image_click(evt: gr.SelectData):
546
563
  if evt.index is not None:
547
564
  x, y = evt.index
548
- action_type = action_type_radio.value or "click"
549
- button = action_button_radio.value or "left"
550
- result = ui_handler.submit_click_action(x, y, action_type, button)
565
+ action_type = ui_handler.current_action_type or "click"
566
+ button = ui_handler.current_button or "left"
567
+ if action_type == "scroll":
568
+ sx_i = int(ui_handler.current_scroll_x or 0)
569
+ sy_i = int(ui_handler.current_scroll_y or 0)
570
+ # Submit a scroll action with x,y position and scroll deltas
571
+ result = ui_handler.submit_action("scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i)
572
+ else:
573
+ result = ui_handler.submit_click_action(x, y, action_type, button)
551
574
  ui_handler.wait_for_pending_calls()
552
575
  return result
553
576
  return "No coordinates selected"
@@ -570,14 +593,49 @@ def create_ui():
570
593
  outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
571
594
  )
572
595
 
573
- # Toggle button radio visibility based on action type
574
- def toggle_button_visibility(action_type):
575
- return gr.update(visible=(action_type == "click"))
596
+ # Toggle visibility of controls based on action type
597
+ def toggle_action_controls(action_type):
598
+ # Button visible only for click
599
+ button_vis = gr.update(visible=(action_type == "click"))
600
+ # Scroll inputs visible only for scroll
601
+ scroll_x_vis = gr.update(visible=(action_type == "scroll"))
602
+ scroll_y_vis = gr.update(visible=(action_type == "scroll"))
603
+ # Update state
604
+ ui_handler.current_action_type = action_type or "click"
605
+ return button_vis, scroll_x_vis, scroll_y_vis
576
606
 
577
607
  action_type_radio.change(
578
- fn=toggle_button_visibility,
608
+ fn=toggle_action_controls,
579
609
  inputs=[action_type_radio],
580
- outputs=[action_button_radio]
610
+ outputs=[action_button_radio, scroll_x_input, scroll_y_input]
611
+ )
612
+
613
+ # Keep other control values in ui_handler state
614
+ def on_button_change(val):
615
+ ui_handler.current_button = (val or "left")
616
+ action_button_radio.change(
617
+ fn=on_button_change,
618
+ inputs=[action_button_radio]
619
+ )
620
+
621
+ def on_scroll_x_change(val):
622
+ try:
623
+ ui_handler.current_scroll_x = int(val) if val is not None else 0
624
+ except Exception:
625
+ ui_handler.current_scroll_x = 0
626
+ scroll_x_input.change(
627
+ fn=on_scroll_x_change,
628
+ inputs=[scroll_x_input]
629
+ )
630
+
631
+ def on_scroll_y_change(val):
632
+ try:
633
+ ui_handler.current_scroll_y = int(val) if val is not None else 0
634
+ except Exception:
635
+ ui_handler.current_scroll_y = 0
636
+ scroll_y_input.change(
637
+ fn=on_scroll_y_change,
638
+ inputs=[scroll_y_input]
581
639
  )
582
640
 
583
641
  type_submit_btn.click(
agent/loops/anthropic.py CHANGED
@@ -132,23 +132,22 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
132
132
  converted_content = []
133
133
  for item in content:
134
134
  if isinstance(item, dict) and item.get("type") == "input_image":
135
- # Convert input_image to Anthropic image format
135
+ # Convert input_image to OpenAI image format
136
136
  image_url = item.get("image_url", "")
137
137
  if image_url and image_url != "[omitted]":
138
- # Extract base64 data from data URL
139
- if "," in image_url:
140
- base64_data = image_url.split(",")[-1]
141
- else:
142
- base64_data = image_url
143
-
144
138
  converted_content.append({
145
- "type": "image",
146
- "source": {
147
- "type": "base64",
148
- "media_type": "image/png",
149
- "data": base64_data
139
+ "type": "image_url",
140
+ "image_url": {
141
+ "url": image_url
150
142
  }
151
143
  })
144
+ elif isinstance(item, dict) and item.get("type") == "input_text":
145
+ # Convert input_text to OpenAI text format
146
+ text = item.get("text", "")
147
+ converted_content.append({
148
+ "type": "text",
149
+ "text": text
150
+ })
152
151
  else:
153
152
  # Keep other content types as-is
154
153
  converted_content.append(item)
@@ -0,0 +1,138 @@
1
+ Metadata-Version: 2.1
2
+ Name: cua-agent
3
+ Version: 0.4.25
4
+ Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
+ Author-Email: TryCua <gh@trycua.com>
6
+ Requires-Python: >=3.12
7
+ Requires-Dist: httpx>=0.27.0
8
+ Requires-Dist: aiohttp>=3.9.3
9
+ Requires-Dist: asyncio
10
+ Requires-Dist: anyio>=4.4.1
11
+ Requires-Dist: typing-extensions>=4.12.2
12
+ Requires-Dist: pydantic>=2.6.4
13
+ Requires-Dist: rich>=13.7.1
14
+ Requires-Dist: python-dotenv>=1.0.1
15
+ Requires-Dist: cua-computer<0.5.0,>=0.4.0
16
+ Requires-Dist: cua-core<0.2.0,>=0.1.8
17
+ Requires-Dist: certifi>=2024.2.2
18
+ Requires-Dist: litellm>=1.74.12
19
+ Provides-Extra: openai
20
+ Provides-Extra: anthropic
21
+ Provides-Extra: omni
22
+ Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "omni"
23
+ Provides-Extra: uitars
24
+ Provides-Extra: uitars-mlx
25
+ Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "uitars-mlx"
26
+ Provides-Extra: uitars-hf
27
+ Requires-Dist: accelerate; extra == "uitars-hf"
28
+ Requires-Dist: torch; extra == "uitars-hf"
29
+ Requires-Dist: transformers>=4.54.0; extra == "uitars-hf"
30
+ Provides-Extra: glm45v-hf
31
+ Requires-Dist: accelerate; extra == "glm45v-hf"
32
+ Requires-Dist: torch; extra == "glm45v-hf"
33
+ Requires-Dist: transformers-v4.55.0-GLM-4.5V-preview; extra == "glm45v-hf"
34
+ Provides-Extra: ui
35
+ Requires-Dist: gradio>=5.23.3; extra == "ui"
36
+ Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
37
+ Provides-Extra: cli
38
+ Requires-Dist: yaspin>=3.1.0; extra == "cli"
39
+ Provides-Extra: hud
40
+ Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "hud"
41
+ Provides-Extra: all
42
+ Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
43
+ Requires-Dist: accelerate; extra == "all"
44
+ Requires-Dist: torch; extra == "all"
45
+ Requires-Dist: transformers>=4.54.0; extra == "all"
46
+ Requires-Dist: gradio>=5.23.3; extra == "all"
47
+ Requires-Dist: python-dotenv>=1.0.1; extra == "all"
48
+ Requires-Dist: yaspin>=3.1.0; extra == "all"
49
+ Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "all"
50
+ Description-Content-Type: text/markdown
51
+
52
+ <div align="center">
53
+ <h1>
54
+ <div class="image-wrapper" style="display: inline-block;">
55
+ <picture>
56
+ <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_white.png" style="display: block; margin: auto;">
57
+ <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_black.png" style="display: block; margin: auto;">
58
+ <img alt="Shows my svg">
59
+ </picture>
60
+ </div>
61
+
62
+ [![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
63
+ [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
64
+ [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
65
+ [![PyPI](https://img.shields.io/pypi/v/cua-computer?color=333333)](https://pypi.org/project/cua-computer/)
66
+ </h1>
67
+ </div>
68
+
69
+ **cua-agent** is a general Computer-Use framework with liteLLM integration for running agentic workflows on macOS, Windows, and Linux sandboxes. It provides a unified interface for computer-use agents across multiple LLM providers with advanced callback system for extensibility.
70
+
71
+ ## Features
72
+
73
+ - **Safe Computer-Use/Tool-Use**: Using Computer SDK for sandboxed desktops
74
+ - **Multi-Agent Support**: Anthropic Claude, OpenAI computer-use-preview, UI-TARS, Omniparser + any LLM
75
+ - **Multi-API Support**: Take advantage of liteLLM supporting 100+ LLMs / model APIs, including local models (`huggingface-local/`, `ollama_chat/`, `mlx/`)
76
+ - **Cross-Platform**: Works on Windows, macOS, and Linux with cloud and local computer instances
77
+ - **Extensible Callbacks**: Built-in support for image retention, cache control, PII anonymization, budget limits, and trajectory tracking
78
+
79
+ ## Install
80
+
81
+ ```bash
82
+ pip install "cua-agent[all]"
83
+ ```
84
+
85
+ ## Quick Start
86
+
87
+ ```python
88
+ import asyncio
89
+ import os
90
+ from agent import ComputerAgent
91
+ from computer import Computer
92
+
93
+ async def main():
94
+ # Set up computer instance
95
+ async with Computer(
96
+ os_type="linux",
97
+ provider_type="cloud",
98
+ name=os.getenv("CUA_CONTAINER_NAME"),
99
+ api_key=os.getenv("CUA_API_KEY")
100
+ ) as computer:
101
+
102
+ # Create agent
103
+ agent = ComputerAgent(
104
+ model="anthropic/claude-3-5-sonnet-20241022",
105
+ tools=[computer],
106
+ only_n_most_recent_images=3,
107
+ trajectory_dir="trajectories",
108
+ max_trajectory_budget=5.0 # $5 budget limit
109
+ )
110
+
111
+ # Run agent
112
+ messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
113
+
114
+ async for result in agent.run(messages):
115
+ for item in result["output"]:
116
+ if item["type"] == "message":
117
+ print(item["content"][0]["text"])
118
+
119
+ if __name__ == "__main__":
120
+ asyncio.run(main())
121
+ ```
122
+
123
+ ## Docs
124
+
125
+ - [Agent Loops](https://trycua.com/docs/agent-sdk/agent-loops)
126
+ - [Supported Agents](https://trycua.com/docs/agent-sdk/supported-agents)
127
+ - [Supported Models](https://trycua.com/docs/agent-sdk/supported-models)
128
+ - [Chat History](https://trycua.com/docs/agent-sdk/chat-history)
129
+ - [Callbacks](https://trycua.com/docs/agent-sdk/callbacks)
130
+ - [Custom Tools](https://trycua.com/docs/agent-sdk/custom-tools)
131
+ - [Custom Computer Handlers](https://trycua.com/docs/agent-sdk/custom-computer-handlers)
132
+ - [Prompt Caching](https://trycua.com/docs/agent-sdk/prompt-caching)
133
+ - [Usage Tracking](https://trycua.com/docs/agent-sdk/usage-tracking)
134
+ - [Benchmarks](https://trycua.com/docs/agent-sdk/benchmarks)
135
+
136
+ ## License
137
+
138
+ MIT License - see LICENSE file for details.
@@ -23,11 +23,11 @@ agent/decorators.py,sha256=n8VvMsififWkmuk75Q7HIpo0xAA2yAeQ6J-OOiwbAKc,1836
23
23
  agent/human_tool/__init__.py,sha256=3m5_g-Fo_0yX5vi7eg-A92oTqO0N3aY929Ajp78HKsE,771
24
24
  agent/human_tool/__main__.py,sha256=VsW2BAghlonOuqZbP_xuCsaec9bemA1I_ibnDcED9D4,1068
25
25
  agent/human_tool/server.py,sha256=ceuL5kw_RjgAi8fueLU3nTjyzOLE25Shv1oTJnSHsoQ,7964
26
- agent/human_tool/ui.py,sha256=c5IbzVbj6dTtrswK3KKo6svjatQhQHHwzFA848U2Cw0,28130
26
+ agent/human_tool/ui.py,sha256=wu9eZorhxCkyPTlBSZjYaVzutoHMlucAz8UGNpAT4bM,30644
27
27
  agent/integrations/hud/__init__.py,sha256=q0QEyJZSrcjiN2sRi_hoX-ePmLyYm9CpAIvA0xMxGJI,8360
28
28
  agent/integrations/hud/proxy.py,sha256=yA7C2jeXnrpI5HS0VgCvn0BflVbAORZynIfyE27rvBg,7782
29
29
  agent/loops/__init__.py,sha256=Ef8aj07l3osibwDk-DTo80PrpL4_GdKRTP1ikl_b-BQ,328
30
- agent/loops/anthropic.py,sha256=Th3dNv8FULvyDXx7aPVGSzbrqBiDNFXRso3DSa88d_w,70301
30
+ agent/loops/anthropic.py,sha256=q7lr1PjI6VPtlozoweluY2c3hCGqa_2s-whzxa37iKE,70250
31
31
  agent/loops/base.py,sha256=LK7kSTnc2CB88LI7qr2VP7LMq0eS5r2bSEnrxO6IN5U,2345
32
32
  agent/loops/composed_grounded.py,sha256=8oJoqaRzKWbI9I4VoFuAoUzQ11_CFnYT-EdPOy-NVEQ,12349
33
33
  agent/loops/glm45v.py,sha256=V1f-5vAifbYcY-qTc7fW2KXVRkAfApQI_EjavH3X2ak,35110
@@ -45,7 +45,7 @@ agent/ui/__main__.py,sha256=vudWXYvGM0aNT5aZ94HPtGW8YXOZ4cLXepHyhUM_k1g,73
45
45
  agent/ui/gradio/__init__.py,sha256=yv4Mrfo-Sj2U5sVn_UJHAuwYCezo-5O4ItR2C9jzNko,145
46
46
  agent/ui/gradio/app.py,sha256=Ol97YEbwREZZQ9_PMjVHlfOcu9BGsawxgAGAm79hT80,9117
47
47
  agent/ui/gradio/ui_components.py,sha256=dJUvKDmc1oSejtoR_gU_oWWYwxaOOQyPloSYRGMrUCQ,36068
48
- cua_agent-0.4.24.dist-info/METADATA,sha256=-yvFHUziugRMdDqtf_NDVnQfcNbHKut_rr-yswIDYkM,12712
49
- cua_agent-0.4.24.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
50
- cua_agent-0.4.24.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
51
- cua_agent-0.4.24.dist-info/RECORD,,
48
+ cua_agent-0.4.25.dist-info/METADATA,sha256=RddHOGfOJVdXhPQMXCj1c7RLBJcmH2yZMwS36dbnB5Q,5624
49
+ cua_agent-0.4.25.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
50
+ cua_agent-0.4.25.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
51
+ cua_agent-0.4.25.dist-info/RECORD,,
@@ -1,436 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: cua-agent
3
- Version: 0.4.24
4
- Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
- Author-Email: TryCua <gh@trycua.com>
6
- Requires-Python: >=3.12
7
- Requires-Dist: httpx>=0.27.0
8
- Requires-Dist: aiohttp>=3.9.3
9
- Requires-Dist: asyncio
10
- Requires-Dist: anyio>=4.4.1
11
- Requires-Dist: typing-extensions>=4.12.2
12
- Requires-Dist: pydantic>=2.6.4
13
- Requires-Dist: rich>=13.7.1
14
- Requires-Dist: python-dotenv>=1.0.1
15
- Requires-Dist: cua-computer<0.5.0,>=0.4.0
16
- Requires-Dist: cua-core<0.2.0,>=0.1.8
17
- Requires-Dist: certifi>=2024.2.2
18
- Requires-Dist: litellm>=1.74.12
19
- Provides-Extra: openai
20
- Provides-Extra: anthropic
21
- Provides-Extra: omni
22
- Requires-Dist: ultralytics>=8.0.0; extra == "omni"
23
- Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "omni"
24
- Provides-Extra: uitars
25
- Provides-Extra: uitars-mlx
26
- Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "uitars-mlx"
27
- Provides-Extra: uitars-hf
28
- Requires-Dist: accelerate; extra == "uitars-hf"
29
- Requires-Dist: torch; extra == "uitars-hf"
30
- Requires-Dist: transformers>=4.54.0; extra == "uitars-hf"
31
- Provides-Extra: glm45v-hf
32
- Requires-Dist: accelerate; extra == "glm45v-hf"
33
- Requires-Dist: torch; extra == "glm45v-hf"
34
- Requires-Dist: transformers-v4.55.0-GLM-4.5V-preview; extra == "glm45v-hf"
35
- Provides-Extra: ui
36
- Requires-Dist: gradio>=5.23.3; extra == "ui"
37
- Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
38
- Provides-Extra: cli
39
- Requires-Dist: yaspin>=3.1.0; extra == "cli"
40
- Provides-Extra: hud
41
- Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "hud"
42
- Provides-Extra: all
43
- Requires-Dist: ultralytics>=8.0.0; extra == "all"
44
- Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "all"
45
- Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
46
- Requires-Dist: accelerate; extra == "all"
47
- Requires-Dist: torch; extra == "all"
48
- Requires-Dist: transformers>=4.54.0; extra == "all"
49
- Requires-Dist: gradio>=5.23.3; extra == "all"
50
- Requires-Dist: python-dotenv>=1.0.1; extra == "all"
51
- Requires-Dist: yaspin>=3.1.0; extra == "all"
52
- Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "all"
53
- Description-Content-Type: text/markdown
54
-
55
- <div align="center">
56
- <h1>
57
- <div class="image-wrapper" style="display: inline-block;">
58
- <picture>
59
- <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_white.png" style="display: block; margin: auto;">
60
- <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_black.png" style="display: block; margin: auto;">
61
- <img alt="Shows my svg">
62
- </picture>
63
- </div>
64
-
65
- [![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
66
- [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
67
- [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
68
- [![PyPI](https://img.shields.io/pypi/v/cua-computer?color=333333)](https://pypi.org/project/cua-computer/)
69
- </h1>
70
- </div>
71
-
72
- **cua-agent** is a general Computer-Use framework with liteLLM integration for running agentic workflows on macOS, Windows, and Linux sandboxes. It provides a unified interface for computer-use agents across multiple LLM providers with advanced callback system for extensibility.
73
-
74
- ## Features
75
-
76
- - **Safe Computer-Use/Tool-Use**: Using Computer SDK for sandboxed desktops
77
- - **Multi-Agent Support**: Anthropic Claude, OpenAI computer-use-preview, UI-TARS, Omniparser + any LLM
78
- - **Multi-API Support**: Take advantage of liteLLM supporting 100+ LLMs / model APIs, including local models (`huggingface-local/`, `ollama_chat/`, `mlx/`)
79
- - **Cross-Platform**: Works on Windows, macOS, and Linux with cloud and local computer instances
80
- - **Extensible Callbacks**: Built-in support for image retention, cache control, PII anonymization, budget limits, and trajectory tracking
81
-
82
- ## Install
83
-
84
- ```bash
85
- pip install "cua-agent[all]"
86
-
87
- # or install specific providers
88
- pip install "cua-agent[openai]" # OpenAI computer-use-preview support
89
- pip install "cua-agent[anthropic]" # Anthropic Claude support
90
- pip install "cua-agent[omni]" # Omniparser + any LLM support
91
- pip install "cua-agent[uitars]" # UI-TARS
92
- pip install "cua-agent[uitars-mlx]" # UI-TARS + MLX support
93
- pip install "cua-agent[uitars-hf]" # UI-TARS + Huggingface support
94
- pip install "cua-agent[glm45v-hf]" # GLM-4.5V + Huggingface support
95
- pip install "cua-agent[ui]" # Gradio UI support
96
- ```
97
-
98
- ## Quick Start
99
-
100
- ```python
101
- import asyncio
102
- import os
103
- from agent import ComputerAgent
104
- from computer import Computer
105
-
106
- async def main():
107
- # Set up computer instance
108
- async with Computer(
109
- os_type="linux",
110
- provider_type="cloud",
111
- name=os.getenv("CUA_CONTAINER_NAME"),
112
- api_key=os.getenv("CUA_API_KEY")
113
- ) as computer:
114
-
115
- # Create agent
116
- agent = ComputerAgent(
117
- model="anthropic/claude-3-5-sonnet-20241022",
118
- tools=[computer],
119
- only_n_most_recent_images=3,
120
- trajectory_dir="trajectories",
121
- max_trajectory_budget=5.0 # $5 budget limit
122
- )
123
-
124
- # Run agent
125
- messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
126
-
127
- async for result in agent.run(messages):
128
- for item in result["output"]:
129
- if item["type"] == "message":
130
- print(item["content"][0]["text"])
131
-
132
- if __name__ == "__main__":
133
- asyncio.run(main())
134
- ```
135
-
136
- ## Supported Models
137
-
138
- ### Anthropic Claude (Computer Use API)
139
- ```python
140
- model="anthropic/claude-3-5-sonnet-20241022"
141
- model="anthropic/claude-3-7-sonnet-20250219"
142
- model="anthropic/claude-opus-4-20250514"
143
- model="anthropic/claude-sonnet-4-20250514"
144
- ```
145
-
146
- ### OpenAI Computer Use Preview
147
- ```python
148
- model="openai/computer-use-preview"
149
- ```
150
-
151
- ### UI-TARS (Local or Huggingface Inference)
152
- ```python
153
- model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
154
- model="ollama_chat/0000/ui-tars-1.5-7b"
155
- ```
156
-
157
- ### Omniparser + Any LLM
158
- ```python
159
- model="omniparser+ollama_chat/mistral-small3.2"
160
- model="omniparser+vertex_ai/gemini-pro"
161
- model="omniparser+anthropic/claude-3-5-sonnet-20241022"
162
- model="omniparser+openai/gpt-4o"
163
- ```
164
-
165
- ## Custom Tools
166
-
167
- Define custom tools using decorated functions:
168
-
169
- ```python
170
- from computer.helpers import sandboxed
171
-
172
- @sandboxed()
173
- def read_file(location: str) -> str:
174
- """Read contents of a file
175
-
176
- Parameters
177
- ----------
178
- location : str
179
- Path to the file to read
180
-
181
- Returns
182
- -------
183
- str
184
- Contents of the file or error message
185
- """
186
- try:
187
- with open(location, 'r') as f:
188
- return f.read()
189
- except Exception as e:
190
- return f"Error reading file: {str(e)}"
191
-
192
- def calculate(a: int, b: int) -> int:
193
- """Calculate the sum of two integers"""
194
- return a + b
195
-
196
- # Use with agent
197
- agent = ComputerAgent(
198
- model="anthropic/claude-3-5-sonnet-20241022",
199
- tools=[computer, read_file, calculate]
200
- )
201
- ```
202
-
203
- ## Callbacks System
204
-
205
- agent provides a comprehensive callback system for extending functionality:
206
-
207
- ### Built-in Callbacks
208
-
209
- ```python
210
- from agent.callbacks import (
211
- ImageRetentionCallback,
212
- TrajectorySaverCallback,
213
- BudgetManagerCallback,
214
- LoggingCallback
215
- )
216
-
217
- agent = ComputerAgent(
218
- model="anthropic/claude-3-5-sonnet-20241022",
219
- tools=[computer],
220
- callbacks=[
221
- ImageRetentionCallback(only_n_most_recent_images=3),
222
- TrajectorySaverCallback(trajectory_dir="trajectories"),
223
- BudgetManagerCallback(max_budget=10.0, raise_error=True),
224
- LoggingCallback(level=logging.INFO)
225
- ]
226
- )
227
- ```
228
-
229
- ### Custom Callbacks
230
-
231
- ```python
232
- from agent.callbacks.base import AsyncCallbackHandler
233
-
234
- class CustomCallback(AsyncCallbackHandler):
235
- async def on_llm_start(self, messages):
236
- """Preprocess messages before LLM call"""
237
- # Add custom preprocessing logic
238
- return messages
239
-
240
- async def on_llm_end(self, messages):
241
- """Postprocess messages after LLM call"""
242
- # Add custom postprocessing logic
243
- return messages
244
-
245
- async def on_usage(self, usage):
246
- """Track usage information"""
247
- print(f"Tokens used: {usage.total_tokens}")
248
- ```
249
-
250
- ## Budget Management
251
-
252
- Control costs with built-in budget management:
253
-
254
- ```python
255
- # Simple budget limit
256
- agent = ComputerAgent(
257
- model="anthropic/claude-3-5-sonnet-20241022",
258
- max_trajectory_budget=5.0 # $5 limit
259
- )
260
-
261
- # Advanced budget configuration
262
- agent = ComputerAgent(
263
- model="anthropic/claude-3-5-sonnet-20241022",
264
- max_trajectory_budget={
265
- "max_budget": 10.0,
266
- "raise_error": True, # Raise error when exceeded
267
- "reset_after_each_run": False # Persistent across runs
268
- }
269
- )
270
- ```
271
-
272
- ## Trajectory Management
273
-
274
- Save and replay agent conversations:
275
-
276
- ```python
277
- agent = ComputerAgent(
278
- model="anthropic/claude-3-5-sonnet-20241022",
279
- trajectory_dir="trajectories", # Auto-save trajectories
280
- tools=[computer]
281
- )
282
-
283
- # Trajectories are saved with:
284
- # - Complete conversation history
285
- # - Usage statistics and costs
286
- # - Timestamps and metadata
287
- # - Screenshots and computer actions
288
- ```
289
-
290
- ## Configuration Options
291
-
292
- ### ComputerAgent Parameters
293
-
294
- - `model`: Model identifier (required)
295
- - `tools`: List of computer objects and decorated functions
296
- - `callbacks`: List of callback handlers for extensibility
297
- - `only_n_most_recent_images`: Limit recent images to prevent context overflow
298
- - `verbosity`: Logging level (logging.INFO, logging.DEBUG, etc.)
299
- - `trajectory_dir`: Directory to save conversation trajectories
300
- - `max_retries`: Maximum API call retries (default: 3)
301
- - `screenshot_delay`: Delay between actions and screenshots (default: 0.5s)
302
- - `use_prompt_caching`: Enable prompt caching for supported models
303
- - `max_trajectory_budget`: Budget limit configuration
304
-
305
- ### Environment Variables
306
-
307
- ```bash
308
- # Computer instance (cloud)
309
- export CUA_CONTAINER_NAME="your-container-name"
310
- export CUA_API_KEY="your-cua-api-key"
311
-
312
- # LLM API keys
313
- export ANTHROPIC_API_KEY="your-anthropic-key"
314
- export OPENAI_API_KEY="your-openai-key"
315
- ```
316
-
317
- ## Advanced Usage
318
-
319
- ### Streaming Responses
320
-
321
- ```python
322
- async for result in agent.run(messages, stream=True):
323
- # Process streaming chunks
324
- for item in result["output"]:
325
- if item["type"] == "message":
326
- print(item["content"][0]["text"], end="", flush=True)
327
- elif item["type"] == "computer_call":
328
- action = item["action"]
329
- print(f"\n[Action: {action['type']}]")
330
- ```
331
-
332
- ### Interactive Chat Loop
333
-
334
- ```python
335
- history = []
336
- while True:
337
- user_input = input("> ")
338
- if user_input.lower() in ['quit', 'exit']:
339
- break
340
-
341
- history.append({"role": "user", "content": user_input})
342
-
343
- async for result in agent.run(history):
344
- history += result["output"]
345
-
346
- # Display assistant responses
347
- for item in result["output"]:
348
- if item["type"] == "message":
349
- print(item["content"][0]["text"])
350
- ```
351
-
352
- ### Error Handling
353
-
354
- ```python
355
- try:
356
- async for result in agent.run(messages):
357
- # Process results
358
- pass
359
- except BudgetExceededException:
360
- print("Budget limit exceeded")
361
- except Exception as e:
362
- print(f"Agent error: {e}")
363
- ```
364
-
365
- ## API Reference
366
-
367
- ### ComputerAgent.run()
368
-
369
- ```python
370
- async def run(
371
- self,
372
- messages: Messages,
373
- stream: bool = False,
374
- **kwargs
375
- ) -> AsyncGenerator[Dict[str, Any], None]:
376
- """
377
- Run the agent with the given messages.
378
-
379
- Args:
380
- messages: List of message dictionaries
381
- stream: Whether to stream the response
382
- **kwargs: Additional arguments
383
-
384
- Returns:
385
- AsyncGenerator that yields response chunks
386
- """
387
- ```
388
-
389
- ### Message Format
390
-
391
- ```python
392
- messages = [
393
- {
394
- "role": "user",
395
- "content": "Take a screenshot and describe what you see"
396
- },
397
- {
398
- "role": "assistant",
399
- "content": "I'll take a screenshot for you."
400
- }
401
- ]
402
- ```
403
-
404
- ### Response Format
405
-
406
- ```python
407
- {
408
- "output": [
409
- {
410
- "type": "message",
411
- "role": "assistant",
412
- "content": [{"type": "output_text", "text": "I can see..."}]
413
- },
414
- {
415
- "type": "computer_call",
416
- "action": {"type": "screenshot"},
417
- "call_id": "call_123"
418
- },
419
- {
420
- "type": "computer_call_output",
421
- "call_id": "call_123",
422
- "output": {"image_url": "data:image/png;base64,..."}
423
- }
424
- ],
425
- "usage": {
426
- "prompt_tokens": 150,
427
- "completion_tokens": 75,
428
- "total_tokens": 225,
429
- "response_cost": 0.01,
430
- }
431
- }
432
- ```
433
-
434
- ## License
435
-
436
- MIT License - see LICENSE file for details.