cua-agent 0.4.23__py3-none-any.whl → 0.4.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/callbacks/image_retention.py +38 -87
- agent/callbacks/operator_validator.py +32 -32
- agent/human_tool/ui.py +68 -10
- agent/loops/anthropic.py +11 -12
- cua_agent-0.4.25.dist-info/METADATA +138 -0
- {cua_agent-0.4.23.dist-info → cua_agent-0.4.25.dist-info}/RECORD +8 -8
- cua_agent-0.4.23.dist-info/METADATA +0 -436
- {cua_agent-0.4.23.dist-info → cua_agent-0.4.25.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.23.dist-info → cua_agent-0.4.25.dist-info}/entry_points.txt +0 -0
|
@@ -50,90 +50,41 @@ class ImageRetentionCallback(AsyncCallbackHandler):
|
|
|
50
50
|
"""
|
|
51
51
|
if self.only_n_most_recent_images is None:
|
|
52
52
|
return messages
|
|
53
|
-
|
|
54
|
-
#
|
|
55
|
-
|
|
56
|
-
for
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
for
|
|
90
|
-
|
|
91
|
-
call_id = msg.get("call_id")
|
|
92
|
-
|
|
93
|
-
# Remove old computer_call items
|
|
94
|
-
if msg_type == "computer_call" and call_id not in keep_call_ids:
|
|
95
|
-
# Check if this call_id corresponds to an image call
|
|
96
|
-
has_image_output = any(
|
|
97
|
-
m.get("type") == "computer_call_output" and
|
|
98
|
-
m.get("call_id") == call_id and
|
|
99
|
-
isinstance(m.get("output"), dict) and
|
|
100
|
-
"image_url" in m.get("output", {})
|
|
101
|
-
for m in messages_with_call_ids
|
|
102
|
-
)
|
|
103
|
-
if has_image_output:
|
|
104
|
-
continue # Skip this computer_call
|
|
105
|
-
|
|
106
|
-
# Remove old computer_call_output items with images
|
|
107
|
-
if (msg_type == "computer_call_output" and
|
|
108
|
-
call_id not in keep_call_ids and
|
|
109
|
-
isinstance(msg.get("output"), dict) and
|
|
110
|
-
"image_url" in msg.get("output", {})):
|
|
111
|
-
continue # Skip this computer_call_output
|
|
112
|
-
|
|
113
|
-
# Remove old reasoning items that are paired with removed computer calls
|
|
114
|
-
if (msg_type == "reasoning" and
|
|
115
|
-
call_id and call_id not in keep_call_ids):
|
|
116
|
-
# Check if this call_id corresponds to an image call that's being removed
|
|
117
|
-
has_image_output = any(
|
|
118
|
-
m.get("type") == "computer_call_output" and
|
|
119
|
-
m.get("call_id") == call_id and
|
|
120
|
-
isinstance(m.get("output"), dict) and
|
|
121
|
-
"image_url" in m.get("output", {})
|
|
122
|
-
for m in messages_with_call_ids
|
|
123
|
-
)
|
|
124
|
-
if has_image_output:
|
|
125
|
-
continue # Skip this reasoning item
|
|
126
|
-
|
|
127
|
-
filtered_messages.append(msg)
|
|
128
|
-
|
|
129
|
-
# Clean up: Remove call_id from reasoning items before returning
|
|
130
|
-
final_messages = []
|
|
131
|
-
for msg in filtered_messages:
|
|
132
|
-
if msg.get("type") == "reasoning" and "call_id" in msg:
|
|
133
|
-
# Create a copy without call_id for reasoning items
|
|
134
|
-
cleaned_msg = {k: v for k, v in msg.items() if k != "call_id"}
|
|
135
|
-
final_messages.append(cleaned_msg)
|
|
136
|
-
else:
|
|
137
|
-
final_messages.append(msg)
|
|
138
|
-
|
|
139
|
-
return final_messages
|
|
53
|
+
|
|
54
|
+
# Gather indices of all computer_call_output messages that contain an image_url
|
|
55
|
+
output_indices: List[int] = []
|
|
56
|
+
for idx, msg in enumerate(messages):
|
|
57
|
+
if msg.get("type") == "computer_call_output":
|
|
58
|
+
out = msg.get("output")
|
|
59
|
+
if isinstance(out, dict) and ("image_url" in out):
|
|
60
|
+
output_indices.append(idx)
|
|
61
|
+
|
|
62
|
+
# Nothing to trim
|
|
63
|
+
if len(output_indices) <= self.only_n_most_recent_images:
|
|
64
|
+
return messages
|
|
65
|
+
|
|
66
|
+
# Determine which outputs to keep (most recent N)
|
|
67
|
+
keep_output_indices = set(output_indices[-self.only_n_most_recent_images :])
|
|
68
|
+
|
|
69
|
+
# Build set of indices to remove in one pass
|
|
70
|
+
to_remove: set[int] = set()
|
|
71
|
+
|
|
72
|
+
for idx in output_indices:
|
|
73
|
+
if idx in keep_output_indices:
|
|
74
|
+
continue # keep this screenshot and its context
|
|
75
|
+
|
|
76
|
+
to_remove.add(idx) # remove the computer_call_output itself
|
|
77
|
+
|
|
78
|
+
# Remove the immediately preceding computer_call with matching call_id (if present)
|
|
79
|
+
call_id = messages[idx].get("call_id")
|
|
80
|
+
prev_idx = idx - 1
|
|
81
|
+
if prev_idx >= 0 and messages[prev_idx].get("type") == "computer_call" and messages[prev_idx].get("call_id") == call_id:
|
|
82
|
+
to_remove.add(prev_idx)
|
|
83
|
+
# Check a single reasoning immediately before that computer_call
|
|
84
|
+
r_idx = prev_idx - 1
|
|
85
|
+
if r_idx >= 0 and messages[r_idx].get("type") == "reasoning":
|
|
86
|
+
to_remove.add(r_idx)
|
|
87
|
+
|
|
88
|
+
# Construct filtered list
|
|
89
|
+
filtered = [m for i, m in enumerate(messages) if i not in to_remove]
|
|
90
|
+
return filtered
|
|
@@ -102,37 +102,37 @@ class OperatorNormalizerCallback(AsyncCallbackHandler):
|
|
|
102
102
|
_keep_keys(action, keep)
|
|
103
103
|
|
|
104
104
|
|
|
105
|
-
# Second pass: if an assistant message is immediately followed by a computer_call,
|
|
106
|
-
# replace the assistant message itself with a reasoning message with summary text.
|
|
107
|
-
if isinstance(output, list):
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
105
|
+
# # Second pass: if an assistant message is immediately followed by a computer_call,
|
|
106
|
+
# # replace the assistant message itself with a reasoning message with summary text.
|
|
107
|
+
# if isinstance(output, list):
|
|
108
|
+
# for i, item in enumerate(output):
|
|
109
|
+
# # AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] }
|
|
110
|
+
# if item.get("type") == "message" and item.get("role") == "assistant":
|
|
111
|
+
# next_idx = i + 1
|
|
112
|
+
# if next_idx >= len(output):
|
|
113
|
+
# continue
|
|
114
|
+
# next_item = output[next_idx]
|
|
115
|
+
# if not isinstance(next_item, dict):
|
|
116
|
+
# continue
|
|
117
|
+
# if next_item.get("type") != "computer_call":
|
|
118
|
+
# continue
|
|
119
|
+
# contents = item.get("content") or []
|
|
120
|
+
# # Extract text from OutputContent[]
|
|
121
|
+
# text_parts: List[str] = []
|
|
122
|
+
# if isinstance(contents, list):
|
|
123
|
+
# for c in contents:
|
|
124
|
+
# if isinstance(c, dict) and c.get("type") == "output_text" and isinstance(c.get("text"), str):
|
|
125
|
+
# text_parts.append(c["text"])
|
|
126
|
+
# text_content = "\n".join(text_parts).strip()
|
|
127
|
+
# # Replace assistant message with reasoning message
|
|
128
|
+
# output[i] = {
|
|
129
|
+
# "type": "reasoning",
|
|
130
|
+
# "summary": [
|
|
131
|
+
# {
|
|
132
|
+
# "type": "summary_text",
|
|
133
|
+
# "text": text_content,
|
|
134
|
+
# }
|
|
135
|
+
# ],
|
|
136
|
+
# }
|
|
137
137
|
|
|
138
138
|
return output
|
agent/human_tool/ui.py
CHANGED
|
@@ -15,6 +15,11 @@ class HumanCompletionUI:
|
|
|
15
15
|
self.current_call_id: Optional[str] = None
|
|
16
16
|
self.refresh_interval = 2.0 # seconds
|
|
17
17
|
self.last_image = None # Store the last image for display
|
|
18
|
+
# Track current interactive action controls
|
|
19
|
+
self.current_action_type: str = "click"
|
|
20
|
+
self.current_button: str = "left"
|
|
21
|
+
self.current_scroll_x: int = 0
|
|
22
|
+
self.current_scroll_y: int = -120
|
|
18
23
|
|
|
19
24
|
def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
20
25
|
"""Format messages for display in gr.Chatbot with type='messages'."""
|
|
@@ -440,8 +445,8 @@ def create_ui():
|
|
|
440
445
|
with gr.Group(visible=False) as click_actions_group:
|
|
441
446
|
with gr.Row():
|
|
442
447
|
action_type_radio = gr.Dropdown(
|
|
443
|
-
label="Action",
|
|
444
|
-
choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
|
|
448
|
+
label="Interactive Action",
|
|
449
|
+
choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down", "scroll"],
|
|
445
450
|
value="click",
|
|
446
451
|
scale=2
|
|
447
452
|
)
|
|
@@ -452,6 +457,18 @@ def create_ui():
|
|
|
452
457
|
visible=True,
|
|
453
458
|
scale=1
|
|
454
459
|
)
|
|
460
|
+
scroll_x_input = gr.Number(
|
|
461
|
+
label="scroll_x",
|
|
462
|
+
value=0,
|
|
463
|
+
visible=False,
|
|
464
|
+
scale=1
|
|
465
|
+
)
|
|
466
|
+
scroll_y_input = gr.Number(
|
|
467
|
+
label="scroll_y",
|
|
468
|
+
value=-120,
|
|
469
|
+
visible=False,
|
|
470
|
+
scale=1
|
|
471
|
+
)
|
|
455
472
|
|
|
456
473
|
conversation_chatbot = gr.Chatbot(
|
|
457
474
|
label="Conversation",
|
|
@@ -545,9 +562,15 @@ def create_ui():
|
|
|
545
562
|
def handle_image_click(evt: gr.SelectData):
|
|
546
563
|
if evt.index is not None:
|
|
547
564
|
x, y = evt.index
|
|
548
|
-
action_type =
|
|
549
|
-
button =
|
|
550
|
-
|
|
565
|
+
action_type = ui_handler.current_action_type or "click"
|
|
566
|
+
button = ui_handler.current_button or "left"
|
|
567
|
+
if action_type == "scroll":
|
|
568
|
+
sx_i = int(ui_handler.current_scroll_x or 0)
|
|
569
|
+
sy_i = int(ui_handler.current_scroll_y or 0)
|
|
570
|
+
# Submit a scroll action with x,y position and scroll deltas
|
|
571
|
+
result = ui_handler.submit_action("scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i)
|
|
572
|
+
else:
|
|
573
|
+
result = ui_handler.submit_click_action(x, y, action_type, button)
|
|
551
574
|
ui_handler.wait_for_pending_calls()
|
|
552
575
|
return result
|
|
553
576
|
return "No coordinates selected"
|
|
@@ -570,14 +593,49 @@ def create_ui():
|
|
|
570
593
|
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
|
|
571
594
|
)
|
|
572
595
|
|
|
573
|
-
# Toggle
|
|
574
|
-
def
|
|
575
|
-
|
|
596
|
+
# Toggle visibility of controls based on action type
|
|
597
|
+
def toggle_action_controls(action_type):
|
|
598
|
+
# Button visible only for click
|
|
599
|
+
button_vis = gr.update(visible=(action_type == "click"))
|
|
600
|
+
# Scroll inputs visible only for scroll
|
|
601
|
+
scroll_x_vis = gr.update(visible=(action_type == "scroll"))
|
|
602
|
+
scroll_y_vis = gr.update(visible=(action_type == "scroll"))
|
|
603
|
+
# Update state
|
|
604
|
+
ui_handler.current_action_type = action_type or "click"
|
|
605
|
+
return button_vis, scroll_x_vis, scroll_y_vis
|
|
576
606
|
|
|
577
607
|
action_type_radio.change(
|
|
578
|
-
fn=
|
|
608
|
+
fn=toggle_action_controls,
|
|
579
609
|
inputs=[action_type_radio],
|
|
580
|
-
outputs=[action_button_radio]
|
|
610
|
+
outputs=[action_button_radio, scroll_x_input, scroll_y_input]
|
|
611
|
+
)
|
|
612
|
+
|
|
613
|
+
# Keep other control values in ui_handler state
|
|
614
|
+
def on_button_change(val):
|
|
615
|
+
ui_handler.current_button = (val or "left")
|
|
616
|
+
action_button_radio.change(
|
|
617
|
+
fn=on_button_change,
|
|
618
|
+
inputs=[action_button_radio]
|
|
619
|
+
)
|
|
620
|
+
|
|
621
|
+
def on_scroll_x_change(val):
|
|
622
|
+
try:
|
|
623
|
+
ui_handler.current_scroll_x = int(val) if val is not None else 0
|
|
624
|
+
except Exception:
|
|
625
|
+
ui_handler.current_scroll_x = 0
|
|
626
|
+
scroll_x_input.change(
|
|
627
|
+
fn=on_scroll_x_change,
|
|
628
|
+
inputs=[scroll_x_input]
|
|
629
|
+
)
|
|
630
|
+
|
|
631
|
+
def on_scroll_y_change(val):
|
|
632
|
+
try:
|
|
633
|
+
ui_handler.current_scroll_y = int(val) if val is not None else 0
|
|
634
|
+
except Exception:
|
|
635
|
+
ui_handler.current_scroll_y = 0
|
|
636
|
+
scroll_y_input.change(
|
|
637
|
+
fn=on_scroll_y_change,
|
|
638
|
+
inputs=[scroll_y_input]
|
|
581
639
|
)
|
|
582
640
|
|
|
583
641
|
type_submit_btn.click(
|
agent/loops/anthropic.py
CHANGED
|
@@ -132,23 +132,22 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
|
|
|
132
132
|
converted_content = []
|
|
133
133
|
for item in content:
|
|
134
134
|
if isinstance(item, dict) and item.get("type") == "input_image":
|
|
135
|
-
# Convert input_image to
|
|
135
|
+
# Convert input_image to OpenAI image format
|
|
136
136
|
image_url = item.get("image_url", "")
|
|
137
137
|
if image_url and image_url != "[omitted]":
|
|
138
|
-
# Extract base64 data from data URL
|
|
139
|
-
if "," in image_url:
|
|
140
|
-
base64_data = image_url.split(",")[-1]
|
|
141
|
-
else:
|
|
142
|
-
base64_data = image_url
|
|
143
|
-
|
|
144
138
|
converted_content.append({
|
|
145
|
-
"type": "
|
|
146
|
-
"
|
|
147
|
-
"
|
|
148
|
-
"media_type": "image/png",
|
|
149
|
-
"data": base64_data
|
|
139
|
+
"type": "image_url",
|
|
140
|
+
"image_url": {
|
|
141
|
+
"url": image_url
|
|
150
142
|
}
|
|
151
143
|
})
|
|
144
|
+
elif isinstance(item, dict) and item.get("type") == "input_text":
|
|
145
|
+
# Convert input_text to OpenAI text format
|
|
146
|
+
text = item.get("text", "")
|
|
147
|
+
converted_content.append({
|
|
148
|
+
"type": "text",
|
|
149
|
+
"text": text
|
|
150
|
+
})
|
|
152
151
|
else:
|
|
153
152
|
# Keep other content types as-is
|
|
154
153
|
converted_content.append(item)
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: cua-agent
|
|
3
|
+
Version: 0.4.25
|
|
4
|
+
Summary: CUA (Computer Use) Agent for AI-driven computer interaction
|
|
5
|
+
Author-Email: TryCua <gh@trycua.com>
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Requires-Dist: httpx>=0.27.0
|
|
8
|
+
Requires-Dist: aiohttp>=3.9.3
|
|
9
|
+
Requires-Dist: asyncio
|
|
10
|
+
Requires-Dist: anyio>=4.4.1
|
|
11
|
+
Requires-Dist: typing-extensions>=4.12.2
|
|
12
|
+
Requires-Dist: pydantic>=2.6.4
|
|
13
|
+
Requires-Dist: rich>=13.7.1
|
|
14
|
+
Requires-Dist: python-dotenv>=1.0.1
|
|
15
|
+
Requires-Dist: cua-computer<0.5.0,>=0.4.0
|
|
16
|
+
Requires-Dist: cua-core<0.2.0,>=0.1.8
|
|
17
|
+
Requires-Dist: certifi>=2024.2.2
|
|
18
|
+
Requires-Dist: litellm>=1.74.12
|
|
19
|
+
Provides-Extra: openai
|
|
20
|
+
Provides-Extra: anthropic
|
|
21
|
+
Provides-Extra: omni
|
|
22
|
+
Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "omni"
|
|
23
|
+
Provides-Extra: uitars
|
|
24
|
+
Provides-Extra: uitars-mlx
|
|
25
|
+
Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "uitars-mlx"
|
|
26
|
+
Provides-Extra: uitars-hf
|
|
27
|
+
Requires-Dist: accelerate; extra == "uitars-hf"
|
|
28
|
+
Requires-Dist: torch; extra == "uitars-hf"
|
|
29
|
+
Requires-Dist: transformers>=4.54.0; extra == "uitars-hf"
|
|
30
|
+
Provides-Extra: glm45v-hf
|
|
31
|
+
Requires-Dist: accelerate; extra == "glm45v-hf"
|
|
32
|
+
Requires-Dist: torch; extra == "glm45v-hf"
|
|
33
|
+
Requires-Dist: transformers-v4.55.0-GLM-4.5V-preview; extra == "glm45v-hf"
|
|
34
|
+
Provides-Extra: ui
|
|
35
|
+
Requires-Dist: gradio>=5.23.3; extra == "ui"
|
|
36
|
+
Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
|
|
37
|
+
Provides-Extra: cli
|
|
38
|
+
Requires-Dist: yaspin>=3.1.0; extra == "cli"
|
|
39
|
+
Provides-Extra: hud
|
|
40
|
+
Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "hud"
|
|
41
|
+
Provides-Extra: all
|
|
42
|
+
Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
|
|
43
|
+
Requires-Dist: accelerate; extra == "all"
|
|
44
|
+
Requires-Dist: torch; extra == "all"
|
|
45
|
+
Requires-Dist: transformers>=4.54.0; extra == "all"
|
|
46
|
+
Requires-Dist: gradio>=5.23.3; extra == "all"
|
|
47
|
+
Requires-Dist: python-dotenv>=1.0.1; extra == "all"
|
|
48
|
+
Requires-Dist: yaspin>=3.1.0; extra == "all"
|
|
49
|
+
Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "all"
|
|
50
|
+
Description-Content-Type: text/markdown
|
|
51
|
+
|
|
52
|
+
<div align="center">
|
|
53
|
+
<h1>
|
|
54
|
+
<div class="image-wrapper" style="display: inline-block;">
|
|
55
|
+
<picture>
|
|
56
|
+
<source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_white.png" style="display: block; margin: auto;">
|
|
57
|
+
<source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_black.png" style="display: block; margin: auto;">
|
|
58
|
+
<img alt="Shows my svg">
|
|
59
|
+
</picture>
|
|
60
|
+
</div>
|
|
61
|
+
|
|
62
|
+
[](#)
|
|
63
|
+
[](#)
|
|
64
|
+
[](https://discord.com/invite/mVnXXpdE85)
|
|
65
|
+
[](https://pypi.org/project/cua-computer/)
|
|
66
|
+
</h1>
|
|
67
|
+
</div>
|
|
68
|
+
|
|
69
|
+
**cua-agent** is a general Computer-Use framework with liteLLM integration for running agentic workflows on macOS, Windows, and Linux sandboxes. It provides a unified interface for computer-use agents across multiple LLM providers with advanced callback system for extensibility.
|
|
70
|
+
|
|
71
|
+
## Features
|
|
72
|
+
|
|
73
|
+
- **Safe Computer-Use/Tool-Use**: Using Computer SDK for sandboxed desktops
|
|
74
|
+
- **Multi-Agent Support**: Anthropic Claude, OpenAI computer-use-preview, UI-TARS, Omniparser + any LLM
|
|
75
|
+
- **Multi-API Support**: Take advantage of liteLLM supporting 100+ LLMs / model APIs, including local models (`huggingface-local/`, `ollama_chat/`, `mlx/`)
|
|
76
|
+
- **Cross-Platform**: Works on Windows, macOS, and Linux with cloud and local computer instances
|
|
77
|
+
- **Extensible Callbacks**: Built-in support for image retention, cache control, PII anonymization, budget limits, and trajectory tracking
|
|
78
|
+
|
|
79
|
+
## Install
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install "cua-agent[all]"
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Quick Start
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
import asyncio
|
|
89
|
+
import os
|
|
90
|
+
from agent import ComputerAgent
|
|
91
|
+
from computer import Computer
|
|
92
|
+
|
|
93
|
+
async def main():
|
|
94
|
+
# Set up computer instance
|
|
95
|
+
async with Computer(
|
|
96
|
+
os_type="linux",
|
|
97
|
+
provider_type="cloud",
|
|
98
|
+
name=os.getenv("CUA_CONTAINER_NAME"),
|
|
99
|
+
api_key=os.getenv("CUA_API_KEY")
|
|
100
|
+
) as computer:
|
|
101
|
+
|
|
102
|
+
# Create agent
|
|
103
|
+
agent = ComputerAgent(
|
|
104
|
+
model="anthropic/claude-3-5-sonnet-20241022",
|
|
105
|
+
tools=[computer],
|
|
106
|
+
only_n_most_recent_images=3,
|
|
107
|
+
trajectory_dir="trajectories",
|
|
108
|
+
max_trajectory_budget=5.0 # $5 budget limit
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
# Run agent
|
|
112
|
+
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
|
|
113
|
+
|
|
114
|
+
async for result in agent.run(messages):
|
|
115
|
+
for item in result["output"]:
|
|
116
|
+
if item["type"] == "message":
|
|
117
|
+
print(item["content"][0]["text"])
|
|
118
|
+
|
|
119
|
+
if __name__ == "__main__":
|
|
120
|
+
asyncio.run(main())
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Docs
|
|
124
|
+
|
|
125
|
+
- [Agent Loops](https://trycua.com/docs/agent-sdk/agent-loops)
|
|
126
|
+
- [Supported Agents](https://trycua.com/docs/agent-sdk/supported-agents)
|
|
127
|
+
- [Supported Models](https://trycua.com/docs/agent-sdk/supported-models)
|
|
128
|
+
- [Chat History](https://trycua.com/docs/agent-sdk/chat-history)
|
|
129
|
+
- [Callbacks](https://trycua.com/docs/agent-sdk/callbacks)
|
|
130
|
+
- [Custom Tools](https://trycua.com/docs/agent-sdk/custom-tools)
|
|
131
|
+
- [Custom Computer Handlers](https://trycua.com/docs/agent-sdk/custom-computer-handlers)
|
|
132
|
+
- [Prompt Caching](https://trycua.com/docs/agent-sdk/prompt-caching)
|
|
133
|
+
- [Usage Tracking](https://trycua.com/docs/agent-sdk/usage-tracking)
|
|
134
|
+
- [Benchmarks](https://trycua.com/docs/agent-sdk/benchmarks)
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
|
|
138
|
+
MIT License - see LICENSE file for details.
|
|
@@ -8,9 +8,9 @@ agent/agent.py,sha256=ao3SKnZoX5_P6mTzEg0hgOUam6bNRwpbitzlyvwI3bg,29826
|
|
|
8
8
|
agent/callbacks/__init__.py,sha256=et6pNfX_AiJqhVzUfCvcjzFbDhfLoHazKCXN5sqwxaM,631
|
|
9
9
|
agent/callbacks/base.py,sha256=UnnnYlh6XCm6HKZZsAPaT_Eyo9LUYLyjyNwF-QRm6Ns,4691
|
|
10
10
|
agent/callbacks/budget_manager.py,sha256=RyKM-7iXQcDotYvrw3eURzeEHEXvQjID-NobtvQWE7k,1832
|
|
11
|
-
agent/callbacks/image_retention.py,sha256=
|
|
11
|
+
agent/callbacks/image_retention.py,sha256=8MeLo5-Y7cACpsNk2p_bvnZIYKpW6XgyukmdYGX23rE,3588
|
|
12
12
|
agent/callbacks/logging.py,sha256=OOxU97EzrxlnUAtiEnvy9FB7SwCUK90-rdpDFA2Ae4E,10921
|
|
13
|
-
agent/callbacks/operator_validator.py,sha256=
|
|
13
|
+
agent/callbacks/operator_validator.py,sha256=T5tp62pkShkcdHu2rgREUGdk8fryL_ziJsItXsfgYUQ,6494
|
|
14
14
|
agent/callbacks/pii_anonymization.py,sha256=NEkUTUjQBi82nqus7kT-1E4RaeQ2hQrY7YCnKndLhP8,3272
|
|
15
15
|
agent/callbacks/telemetry.py,sha256=RbUDhE41mTi8g9hNre0EpltK_NUZkLj8buJLWBzs0Ek,7363
|
|
16
16
|
agent/callbacks/trajectory_saver.py,sha256=rslgg4Ak7JHSNmmJgANRQ5TsUYWGuUJDZ6amureaz_o,15963
|
|
@@ -23,11 +23,11 @@ agent/decorators.py,sha256=n8VvMsififWkmuk75Q7HIpo0xAA2yAeQ6J-OOiwbAKc,1836
|
|
|
23
23
|
agent/human_tool/__init__.py,sha256=3m5_g-Fo_0yX5vi7eg-A92oTqO0N3aY929Ajp78HKsE,771
|
|
24
24
|
agent/human_tool/__main__.py,sha256=VsW2BAghlonOuqZbP_xuCsaec9bemA1I_ibnDcED9D4,1068
|
|
25
25
|
agent/human_tool/server.py,sha256=ceuL5kw_RjgAi8fueLU3nTjyzOLE25Shv1oTJnSHsoQ,7964
|
|
26
|
-
agent/human_tool/ui.py,sha256=
|
|
26
|
+
agent/human_tool/ui.py,sha256=wu9eZorhxCkyPTlBSZjYaVzutoHMlucAz8UGNpAT4bM,30644
|
|
27
27
|
agent/integrations/hud/__init__.py,sha256=q0QEyJZSrcjiN2sRi_hoX-ePmLyYm9CpAIvA0xMxGJI,8360
|
|
28
28
|
agent/integrations/hud/proxy.py,sha256=yA7C2jeXnrpI5HS0VgCvn0BflVbAORZynIfyE27rvBg,7782
|
|
29
29
|
agent/loops/__init__.py,sha256=Ef8aj07l3osibwDk-DTo80PrpL4_GdKRTP1ikl_b-BQ,328
|
|
30
|
-
agent/loops/anthropic.py,sha256=
|
|
30
|
+
agent/loops/anthropic.py,sha256=q7lr1PjI6VPtlozoweluY2c3hCGqa_2s-whzxa37iKE,70250
|
|
31
31
|
agent/loops/base.py,sha256=LK7kSTnc2CB88LI7qr2VP7LMq0eS5r2bSEnrxO6IN5U,2345
|
|
32
32
|
agent/loops/composed_grounded.py,sha256=8oJoqaRzKWbI9I4VoFuAoUzQ11_CFnYT-EdPOy-NVEQ,12349
|
|
33
33
|
agent/loops/glm45v.py,sha256=V1f-5vAifbYcY-qTc7fW2KXVRkAfApQI_EjavH3X2ak,35110
|
|
@@ -45,7 +45,7 @@ agent/ui/__main__.py,sha256=vudWXYvGM0aNT5aZ94HPtGW8YXOZ4cLXepHyhUM_k1g,73
|
|
|
45
45
|
agent/ui/gradio/__init__.py,sha256=yv4Mrfo-Sj2U5sVn_UJHAuwYCezo-5O4ItR2C9jzNko,145
|
|
46
46
|
agent/ui/gradio/app.py,sha256=Ol97YEbwREZZQ9_PMjVHlfOcu9BGsawxgAGAm79hT80,9117
|
|
47
47
|
agent/ui/gradio/ui_components.py,sha256=dJUvKDmc1oSejtoR_gU_oWWYwxaOOQyPloSYRGMrUCQ,36068
|
|
48
|
-
cua_agent-0.4.
|
|
49
|
-
cua_agent-0.4.
|
|
50
|
-
cua_agent-0.4.
|
|
51
|
-
cua_agent-0.4.
|
|
48
|
+
cua_agent-0.4.25.dist-info/METADATA,sha256=RddHOGfOJVdXhPQMXCj1c7RLBJcmH2yZMwS36dbnB5Q,5624
|
|
49
|
+
cua_agent-0.4.25.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
|
|
50
|
+
cua_agent-0.4.25.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
|
|
51
|
+
cua_agent-0.4.25.dist-info/RECORD,,
|
|
@@ -1,436 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: cua-agent
|
|
3
|
-
Version: 0.4.23
|
|
4
|
-
Summary: CUA (Computer Use) Agent for AI-driven computer interaction
|
|
5
|
-
Author-Email: TryCua <gh@trycua.com>
|
|
6
|
-
Requires-Python: >=3.12
|
|
7
|
-
Requires-Dist: httpx>=0.27.0
|
|
8
|
-
Requires-Dist: aiohttp>=3.9.3
|
|
9
|
-
Requires-Dist: asyncio
|
|
10
|
-
Requires-Dist: anyio>=4.4.1
|
|
11
|
-
Requires-Dist: typing-extensions>=4.12.2
|
|
12
|
-
Requires-Dist: pydantic>=2.6.4
|
|
13
|
-
Requires-Dist: rich>=13.7.1
|
|
14
|
-
Requires-Dist: python-dotenv>=1.0.1
|
|
15
|
-
Requires-Dist: cua-computer<0.5.0,>=0.4.0
|
|
16
|
-
Requires-Dist: cua-core<0.2.0,>=0.1.8
|
|
17
|
-
Requires-Dist: certifi>=2024.2.2
|
|
18
|
-
Requires-Dist: litellm>=1.74.12
|
|
19
|
-
Provides-Extra: openai
|
|
20
|
-
Provides-Extra: anthropic
|
|
21
|
-
Provides-Extra: omni
|
|
22
|
-
Requires-Dist: ultralytics>=8.0.0; extra == "omni"
|
|
23
|
-
Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "omni"
|
|
24
|
-
Provides-Extra: uitars
|
|
25
|
-
Provides-Extra: uitars-mlx
|
|
26
|
-
Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "uitars-mlx"
|
|
27
|
-
Provides-Extra: uitars-hf
|
|
28
|
-
Requires-Dist: accelerate; extra == "uitars-hf"
|
|
29
|
-
Requires-Dist: torch; extra == "uitars-hf"
|
|
30
|
-
Requires-Dist: transformers>=4.54.0; extra == "uitars-hf"
|
|
31
|
-
Provides-Extra: glm45v-hf
|
|
32
|
-
Requires-Dist: accelerate; extra == "glm45v-hf"
|
|
33
|
-
Requires-Dist: torch; extra == "glm45v-hf"
|
|
34
|
-
Requires-Dist: transformers-v4.55.0-GLM-4.5V-preview; extra == "glm45v-hf"
|
|
35
|
-
Provides-Extra: ui
|
|
36
|
-
Requires-Dist: gradio>=5.23.3; extra == "ui"
|
|
37
|
-
Requires-Dist: python-dotenv>=1.0.1; extra == "ui"
|
|
38
|
-
Provides-Extra: cli
|
|
39
|
-
Requires-Dist: yaspin>=3.1.0; extra == "cli"
|
|
40
|
-
Provides-Extra: hud
|
|
41
|
-
Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "hud"
|
|
42
|
-
Provides-Extra: all
|
|
43
|
-
Requires-Dist: ultralytics>=8.0.0; extra == "all"
|
|
44
|
-
Requires-Dist: cua-som<0.2.0,>=0.1.0; extra == "all"
|
|
45
|
-
Requires-Dist: mlx-vlm>=0.1.27; sys_platform == "darwin" and extra == "all"
|
|
46
|
-
Requires-Dist: accelerate; extra == "all"
|
|
47
|
-
Requires-Dist: torch; extra == "all"
|
|
48
|
-
Requires-Dist: transformers>=4.54.0; extra == "all"
|
|
49
|
-
Requires-Dist: gradio>=5.23.3; extra == "all"
|
|
50
|
-
Requires-Dist: python-dotenv>=1.0.1; extra == "all"
|
|
51
|
-
Requires-Dist: yaspin>=3.1.0; extra == "all"
|
|
52
|
-
Requires-Dist: hud-python<0.5.0,>=0.4.12; extra == "all"
|
|
53
|
-
Description-Content-Type: text/markdown
|
|
54
|
-
|
|
55
|
-
<div align="center">
|
|
56
|
-
<h1>
|
|
57
|
-
<div class="image-wrapper" style="display: inline-block;">
|
|
58
|
-
<picture>
|
|
59
|
-
<source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_white.png" style="display: block; margin: auto;">
|
|
60
|
-
<source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_black.png" style="display: block; margin: auto;">
|
|
61
|
-
<img alt="Shows my svg">
|
|
62
|
-
</picture>
|
|
63
|
-
</div>
|
|
64
|
-
|
|
65
|
-
[](#)
|
|
66
|
-
[](#)
|
|
67
|
-
[](https://discord.com/invite/mVnXXpdE85)
|
|
68
|
-
[](https://pypi.org/project/cua-computer/)
|
|
69
|
-
</h1>
|
|
70
|
-
</div>
|
|
71
|
-
|
|
72
|
-
**cua-agent** is a general Computer-Use framework with liteLLM integration for running agentic workflows on macOS, Windows, and Linux sandboxes. It provides a unified interface for computer-use agents across multiple LLM providers with advanced callback system for extensibility.
|
|
73
|
-
|
|
74
|
-
## Features
|
|
75
|
-
|
|
76
|
-
- **Safe Computer-Use/Tool-Use**: Using Computer SDK for sandboxed desktops
|
|
77
|
-
- **Multi-Agent Support**: Anthropic Claude, OpenAI computer-use-preview, UI-TARS, Omniparser + any LLM
|
|
78
|
-
- **Multi-API Support**: Take advantage of liteLLM supporting 100+ LLMs / model APIs, including local models (`huggingface-local/`, `ollama_chat/`, `mlx/`)
|
|
79
|
-
- **Cross-Platform**: Works on Windows, macOS, and Linux with cloud and local computer instances
|
|
80
|
-
- **Extensible Callbacks**: Built-in support for image retention, cache control, PII anonymization, budget limits, and trajectory tracking
|
|
81
|
-
|
|
82
|
-
## Install
|
|
83
|
-
|
|
84
|
-
```bash
|
|
85
|
-
pip install "cua-agent[all]"
|
|
86
|
-
|
|
87
|
-
# or install specific providers
|
|
88
|
-
pip install "cua-agent[openai]" # OpenAI computer-use-preview support
|
|
89
|
-
pip install "cua-agent[anthropic]" # Anthropic Claude support
|
|
90
|
-
pip install "cua-agent[omni]" # Omniparser + any LLM support
|
|
91
|
-
pip install "cua-agent[uitars]" # UI-TARS
|
|
92
|
-
pip install "cua-agent[uitars-mlx]" # UI-TARS + MLX support
|
|
93
|
-
pip install "cua-agent[uitars-hf]" # UI-TARS + Huggingface support
|
|
94
|
-
pip install "cua-agent[glm45v-hf]" # GLM-4.5V + Huggingface support
|
|
95
|
-
pip install "cua-agent[ui]" # Gradio UI support
|
|
96
|
-
```
|
|
97
|
-
|
|
98
|
-
## Quick Start
|
|
99
|
-
|
|
100
|
-
```python
|
|
101
|
-
import asyncio
|
|
102
|
-
import os
|
|
103
|
-
from agent import ComputerAgent
|
|
104
|
-
from computer import Computer
|
|
105
|
-
|
|
106
|
-
async def main():
|
|
107
|
-
# Set up computer instance
|
|
108
|
-
async with Computer(
|
|
109
|
-
os_type="linux",
|
|
110
|
-
provider_type="cloud",
|
|
111
|
-
name=os.getenv("CUA_CONTAINER_NAME"),
|
|
112
|
-
api_key=os.getenv("CUA_API_KEY")
|
|
113
|
-
) as computer:
|
|
114
|
-
|
|
115
|
-
# Create agent
|
|
116
|
-
agent = ComputerAgent(
|
|
117
|
-
model="anthropic/claude-3-5-sonnet-20241022",
|
|
118
|
-
tools=[computer],
|
|
119
|
-
only_n_most_recent_images=3,
|
|
120
|
-
trajectory_dir="trajectories",
|
|
121
|
-
max_trajectory_budget=5.0 # $5 budget limit
|
|
122
|
-
)
|
|
123
|
-
|
|
124
|
-
# Run agent
|
|
125
|
-
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
|
|
126
|
-
|
|
127
|
-
async for result in agent.run(messages):
|
|
128
|
-
for item in result["output"]:
|
|
129
|
-
if item["type"] == "message":
|
|
130
|
-
print(item["content"][0]["text"])
|
|
131
|
-
|
|
132
|
-
if __name__ == "__main__":
|
|
133
|
-
asyncio.run(main())
|
|
134
|
-
```
|
|
135
|
-
|
|
136
|
-
## Supported Models
|
|
137
|
-
|
|
138
|
-
### Anthropic Claude (Computer Use API)
|
|
139
|
-
```python
|
|
140
|
-
model="anthropic/claude-3-5-sonnet-20241022"
|
|
141
|
-
model="anthropic/claude-3-7-sonnet-20250219"
|
|
142
|
-
model="anthropic/claude-opus-4-20250514"
|
|
143
|
-
model="anthropic/claude-sonnet-4-20250514"
|
|
144
|
-
```
|
|
145
|
-
|
|
146
|
-
### OpenAI Computer Use Preview
|
|
147
|
-
```python
|
|
148
|
-
model="openai/computer-use-preview"
|
|
149
|
-
```
|
|
150
|
-
|
|
151
|
-
### UI-TARS (Local or Huggingface Inference)
|
|
152
|
-
```python
|
|
153
|
-
model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
|
|
154
|
-
model="ollama_chat/0000/ui-tars-1.5-7b"
|
|
155
|
-
```
|
|
156
|
-
|
|
157
|
-
### Omniparser + Any LLM
|
|
158
|
-
```python
|
|
159
|
-
model="omniparser+ollama_chat/mistral-small3.2"
|
|
160
|
-
model="omniparser+vertex_ai/gemini-pro"
|
|
161
|
-
model="omniparser+anthropic/claude-3-5-sonnet-20241022"
|
|
162
|
-
model="omniparser+openai/gpt-4o"
|
|
163
|
-
```
|
|
164
|
-
|
|
165
|
-
## Custom Tools
|
|
166
|
-
|
|
167
|
-
Define custom tools using decorated functions:
|
|
168
|
-
|
|
169
|
-
```python
|
|
170
|
-
from computer.helpers import sandboxed
|
|
171
|
-
|
|
172
|
-
@sandboxed()
|
|
173
|
-
def read_file(location: str) -> str:
|
|
174
|
-
"""Read contents of a file
|
|
175
|
-
|
|
176
|
-
Parameters
|
|
177
|
-
----------
|
|
178
|
-
location : str
|
|
179
|
-
Path to the file to read
|
|
180
|
-
|
|
181
|
-
Returns
|
|
182
|
-
-------
|
|
183
|
-
str
|
|
184
|
-
Contents of the file or error message
|
|
185
|
-
"""
|
|
186
|
-
try:
|
|
187
|
-
with open(location, 'r') as f:
|
|
188
|
-
return f.read()
|
|
189
|
-
except Exception as e:
|
|
190
|
-
return f"Error reading file: {str(e)}"
|
|
191
|
-
|
|
192
|
-
def calculate(a: int, b: int) -> int:
|
|
193
|
-
"""Calculate the sum of two integers"""
|
|
194
|
-
return a + b
|
|
195
|
-
|
|
196
|
-
# Use with agent
|
|
197
|
-
agent = ComputerAgent(
|
|
198
|
-
model="anthropic/claude-3-5-sonnet-20241022",
|
|
199
|
-
tools=[computer, read_file, calculate]
|
|
200
|
-
)
|
|
201
|
-
```
|
|
202
|
-
|
|
203
|
-
## Callbacks System
|
|
204
|
-
|
|
205
|
-
The `agent` package provides a comprehensive callback system for extending functionality:
|
|
206
|
-
|
|
207
|
-
### Built-in Callbacks
|
|
208
|
-
|
|
209
|
-
```python
|
|
210
|
-
from agent.callbacks import (
|
|
211
|
-
ImageRetentionCallback,
|
|
212
|
-
TrajectorySaverCallback,
|
|
213
|
-
BudgetManagerCallback,
|
|
214
|
-
LoggingCallback
|
|
215
|
-
)
|
|
216
|
-
|
|
217
|
-
agent = ComputerAgent(
|
|
218
|
-
model="anthropic/claude-3-5-sonnet-20241022",
|
|
219
|
-
tools=[computer],
|
|
220
|
-
callbacks=[
|
|
221
|
-
ImageRetentionCallback(only_n_most_recent_images=3),
|
|
222
|
-
TrajectorySaverCallback(trajectory_dir="trajectories"),
|
|
223
|
-
BudgetManagerCallback(max_budget=10.0, raise_error=True),
|
|
224
|
-
LoggingCallback(level=logging.INFO)
|
|
225
|
-
]
|
|
226
|
-
)
|
|
227
|
-
```
|
|
228
|
-
|
|
229
|
-
### Custom Callbacks
|
|
230
|
-
|
|
231
|
-
```python
|
|
232
|
-
from agent.callbacks.base import AsyncCallbackHandler
|
|
233
|
-
|
|
234
|
-
class CustomCallback(AsyncCallbackHandler):
|
|
235
|
-
async def on_llm_start(self, messages):
|
|
236
|
-
"""Preprocess messages before LLM call"""
|
|
237
|
-
# Add custom preprocessing logic
|
|
238
|
-
return messages
|
|
239
|
-
|
|
240
|
-
async def on_llm_end(self, messages):
|
|
241
|
-
"""Postprocess messages after LLM call"""
|
|
242
|
-
# Add custom postprocessing logic
|
|
243
|
-
return messages
|
|
244
|
-
|
|
245
|
-
async def on_usage(self, usage):
|
|
246
|
-
"""Track usage information"""
|
|
247
|
-
print(f"Tokens used: {usage.total_tokens}")
|
|
248
|
-
```
|
|
249
|
-
|
|
250
|
-
## Budget Management
|
|
251
|
-
|
|
252
|
-
Control costs with built-in budget management:
|
|
253
|
-
|
|
254
|
-
```python
|
|
255
|
-
# Simple budget limit
|
|
256
|
-
agent = ComputerAgent(
|
|
257
|
-
model="anthropic/claude-3-5-sonnet-20241022",
|
|
258
|
-
max_trajectory_budget=5.0 # $5 limit
|
|
259
|
-
)
|
|
260
|
-
|
|
261
|
-
# Advanced budget configuration
|
|
262
|
-
agent = ComputerAgent(
|
|
263
|
-
model="anthropic/claude-3-5-sonnet-20241022",
|
|
264
|
-
max_trajectory_budget={
|
|
265
|
-
"max_budget": 10.0,
|
|
266
|
-
"raise_error": True, # Raise error when exceeded
|
|
267
|
-
"reset_after_each_run": False # Persistent across runs
|
|
268
|
-
}
|
|
269
|
-
)
|
|
270
|
-
```
|
|
271
|
-
|
|
272
|
-
## Trajectory Management
|
|
273
|
-
|
|
274
|
-
Save and replay agent conversations:
|
|
275
|
-
|
|
276
|
-
```python
|
|
277
|
-
agent = ComputerAgent(
|
|
278
|
-
model="anthropic/claude-3-5-sonnet-20241022",
|
|
279
|
-
trajectory_dir="trajectories", # Auto-save trajectories
|
|
280
|
-
tools=[computer]
|
|
281
|
-
)
|
|
282
|
-
|
|
283
|
-
# Trajectories are saved with:
|
|
284
|
-
# - Complete conversation history
|
|
285
|
-
# - Usage statistics and costs
|
|
286
|
-
# - Timestamps and metadata
|
|
287
|
-
# - Screenshots and computer actions
|
|
288
|
-
```
|
|
289
|
-
|
|
290
|
-
## Configuration Options
|
|
291
|
-
|
|
292
|
-
### ComputerAgent Parameters
|
|
293
|
-
|
|
294
|
-
- `model`: Model identifier (required)
|
|
295
|
-
- `tools`: List of computer objects and decorated functions
|
|
296
|
-
- `callbacks`: List of callback handlers for extensibility
|
|
297
|
-
- `only_n_most_recent_images`: Keep only the N most recent images in the message history to prevent context overflow
|
|
298
|
-
- `verbosity`: Logging level (logging.INFO, logging.DEBUG, etc.)
|
|
299
|
-
- `trajectory_dir`: Directory to save conversation trajectories
|
|
300
|
-
- `max_retries`: Maximum API call retries (default: 3)
|
|
301
|
-
- `screenshot_delay`: Delay between actions and screenshots (default: 0.5s)
|
|
302
|
-
- `use_prompt_caching`: Enable prompt caching for supported models
|
|
303
|
-
- `max_trajectory_budget`: Budget limit configuration
|
|
304
|
-
|
|
305
|
-
### Environment Variables
|
|
306
|
-
|
|
307
|
-
```bash
|
|
308
|
-
# Computer instance (cloud)
|
|
309
|
-
export CUA_CONTAINER_NAME="your-container-name"
|
|
310
|
-
export CUA_API_KEY="your-cua-api-key"
|
|
311
|
-
|
|
312
|
-
# LLM API keys
|
|
313
|
-
export ANTHROPIC_API_KEY="your-anthropic-key"
|
|
314
|
-
export OPENAI_API_KEY="your-openai-key"
|
|
315
|
-
```
|
|
316
|
-
|
|
317
|
-
## Advanced Usage
|
|
318
|
-
|
|
319
|
-
### Streaming Responses
|
|
320
|
-
|
|
321
|
-
```python
|
|
322
|
-
async for result in agent.run(messages, stream=True):
|
|
323
|
-
# Process streaming chunks
|
|
324
|
-
for item in result["output"]:
|
|
325
|
-
if item["type"] == "message":
|
|
326
|
-
print(item["content"][0]["text"], end="", flush=True)
|
|
327
|
-
elif item["type"] == "computer_call":
|
|
328
|
-
action = item["action"]
|
|
329
|
-
print(f"\n[Action: {action['type']}]")
|
|
330
|
-
```
|
|
331
|
-
|
|
332
|
-
### Interactive Chat Loop
|
|
333
|
-
|
|
334
|
-
```python
|
|
335
|
-
history = []
|
|
336
|
-
while True:
|
|
337
|
-
user_input = input("> ")
|
|
338
|
-
if user_input.lower() in ['quit', 'exit']:
|
|
339
|
-
break
|
|
340
|
-
|
|
341
|
-
history.append({"role": "user", "content": user_input})
|
|
342
|
-
|
|
343
|
-
async for result in agent.run(history):
|
|
344
|
-
history += result["output"]
|
|
345
|
-
|
|
346
|
-
# Display assistant responses
|
|
347
|
-
for item in result["output"]:
|
|
348
|
-
if item["type"] == "message":
|
|
349
|
-
print(item["content"][0]["text"])
|
|
350
|
-
```
|
|
351
|
-
|
|
352
|
-
### Error Handling
|
|
353
|
-
|
|
354
|
-
```python
|
|
355
|
-
try:
|
|
356
|
-
async for result in agent.run(messages):
|
|
357
|
-
# Process results
|
|
358
|
-
pass
|
|
359
|
-
except BudgetExceededException:
|
|
360
|
-
print("Budget limit exceeded")
|
|
361
|
-
except Exception as e:
|
|
362
|
-
print(f"Agent error: {e}")
|
|
363
|
-
```
|
|
364
|
-
|
|
365
|
-
## API Reference
|
|
366
|
-
|
|
367
|
-
### ComputerAgent.run()
|
|
368
|
-
|
|
369
|
-
```python
|
|
370
|
-
async def run(
|
|
371
|
-
self,
|
|
372
|
-
messages: Messages,
|
|
373
|
-
stream: bool = False,
|
|
374
|
-
**kwargs
|
|
375
|
-
) -> AsyncGenerator[Dict[str, Any], None]:
|
|
376
|
-
"""
|
|
377
|
-
Run the agent with the given messages.
|
|
378
|
-
|
|
379
|
-
Args:
|
|
380
|
-
messages: List of message dictionaries
|
|
381
|
-
stream: Whether to stream the response
|
|
382
|
-
**kwargs: Additional arguments
|
|
383
|
-
|
|
384
|
-
Returns:
|
|
385
|
-
AsyncGenerator that yields response chunks
|
|
386
|
-
"""
|
|
387
|
-
```
|
|
388
|
-
|
|
389
|
-
### Message Format
|
|
390
|
-
|
|
391
|
-
```python
|
|
392
|
-
messages = [
|
|
393
|
-
{
|
|
394
|
-
"role": "user",
|
|
395
|
-
"content": "Take a screenshot and describe what you see"
|
|
396
|
-
},
|
|
397
|
-
{
|
|
398
|
-
"role": "assistant",
|
|
399
|
-
"content": "I'll take a screenshot for you."
|
|
400
|
-
}
|
|
401
|
-
]
|
|
402
|
-
```
|
|
403
|
-
|
|
404
|
-
### Response Format
|
|
405
|
-
|
|
406
|
-
```python
|
|
407
|
-
{
|
|
408
|
-
"output": [
|
|
409
|
-
{
|
|
410
|
-
"type": "message",
|
|
411
|
-
"role": "assistant",
|
|
412
|
-
"content": [{"type": "output_text", "text": "I can see..."}]
|
|
413
|
-
},
|
|
414
|
-
{
|
|
415
|
-
"type": "computer_call",
|
|
416
|
-
"action": {"type": "screenshot"},
|
|
417
|
-
"call_id": "call_123"
|
|
418
|
-
},
|
|
419
|
-
{
|
|
420
|
-
"type": "computer_call_output",
|
|
421
|
-
"call_id": "call_123",
|
|
422
|
-
"output": {"image_url": "data:image/png;base64,..."}
|
|
423
|
-
}
|
|
424
|
-
],
|
|
425
|
-
"usage": {
|
|
426
|
-
"prompt_tokens": 150,
|
|
427
|
-
"completion_tokens": 75,
|
|
428
|
-
"total_tokens": 225,
|
|
429
|
-
"response_cost": 0.01,
|
|
430
|
-
}
|
|
431
|
-
}
|
|
432
|
-
```
|
|
433
|
-
|
|
434
|
-
## License
|
|
435
|
-
|
|
436
|
-
MIT License - see LICENSE file for details.
|
|
File without changes
|
|
File without changes
|