cua-agent 0.4.34__py3-none-any.whl → 0.4.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +32 -19
- agent/computers/cua.py +33 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +215 -210
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +510 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
- cua_agent-0.4.36.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
agent/cli.py
CHANGED
|
@@ -3,7 +3,7 @@ CLI chat interface for agent - Computer Use Agent
|
|
|
3
3
|
|
|
4
4
|
Usage:
|
|
5
5
|
python -m agent.cli <model_string>
|
|
6
|
-
|
|
6
|
+
|
|
7
7
|
Examples:
|
|
8
8
|
python -m agent.cli openai/computer-use-preview
|
|
9
9
|
python -m agent.cli anthropic/claude-3-5-sonnet-20241022
|
|
@@ -11,19 +11,22 @@ Examples:
|
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
try:
|
|
14
|
-
import asyncio
|
|
15
14
|
import argparse
|
|
15
|
+
import asyncio
|
|
16
|
+
import base64
|
|
17
|
+
import json
|
|
16
18
|
import os
|
|
19
|
+
import platform
|
|
17
20
|
import sys
|
|
18
|
-
import json
|
|
19
|
-
from typing import List, Dict, Any
|
|
20
|
-
import dotenv
|
|
21
|
-
import base64
|
|
22
21
|
import time
|
|
23
|
-
import platform
|
|
24
22
|
from pathlib import Path
|
|
23
|
+
from typing import Any, Dict, List
|
|
24
|
+
|
|
25
|
+
import dotenv
|
|
26
|
+
|
|
25
27
|
try:
|
|
26
28
|
from PIL import Image, ImageDraw
|
|
29
|
+
|
|
27
30
|
PIL_AVAILABLE = True
|
|
28
31
|
except Exception:
|
|
29
32
|
PIL_AVAILABLE = False
|
|
@@ -31,36 +34,44 @@ try:
|
|
|
31
34
|
except ImportError:
|
|
32
35
|
if __name__ == "__main__":
|
|
33
36
|
raise ImportError(
|
|
34
|
-
"CLI dependencies not found. "
|
|
35
|
-
"Please install with: pip install \"cua-agent[cli]\""
|
|
37
|
+
"CLI dependencies not found. " 'Please install with: pip install "cua-agent[cli]"'
|
|
36
38
|
)
|
|
37
39
|
|
|
38
40
|
# Load environment variables
|
|
39
41
|
dotenv.load_dotenv()
|
|
40
42
|
|
|
43
|
+
|
|
41
44
|
# Color codes for terminal output
|
|
42
45
|
class Colors:
|
|
43
|
-
RESET = '\033[0m'
|
|
44
|
-
BOLD = '\033[1m'
|
|
45
|
-
DIM = '\033[2m'
|
|
46
|
-
|
|
46
|
+
RESET = "\033[0m"
|
|
47
|
+
BOLD = "\033[1m"
|
|
48
|
+
DIM = "\033[2m"
|
|
49
|
+
|
|
47
50
|
# Text colors
|
|
48
|
-
RED = '\033[31m'
|
|
49
|
-
GREEN = '\033[32m'
|
|
50
|
-
YELLOW = '\033[33m'
|
|
51
|
-
BLUE = '\033[34m'
|
|
52
|
-
MAGENTA = '\033[35m'
|
|
53
|
-
CYAN = '\033[36m'
|
|
54
|
-
WHITE = '\033[37m'
|
|
55
|
-
GRAY = '\033[90m'
|
|
56
|
-
|
|
57
|
-
# Background colors
|
|
58
|
-
BG_RED = '\033[41m'
|
|
59
|
-
BG_GREEN = '\033[42m'
|
|
60
|
-
BG_YELLOW = '\033[43m'
|
|
61
|
-
BG_BLUE = '\033[44m'
|
|
51
|
+
RED = "\033[31m"
|
|
52
|
+
GREEN = "\033[32m"
|
|
53
|
+
YELLOW = "\033[33m"
|
|
54
|
+
BLUE = "\033[34m"
|
|
55
|
+
MAGENTA = "\033[35m"
|
|
56
|
+
CYAN = "\033[36m"
|
|
57
|
+
WHITE = "\033[37m"
|
|
58
|
+
GRAY = "\033[90m"
|
|
62
59
|
|
|
63
|
-
|
|
60
|
+
# Background colors
|
|
61
|
+
BG_RED = "\033[41m"
|
|
62
|
+
BG_GREEN = "\033[42m"
|
|
63
|
+
BG_YELLOW = "\033[43m"
|
|
64
|
+
BG_BLUE = "\033[44m"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def print_colored(
|
|
68
|
+
text: str,
|
|
69
|
+
color: str = "",
|
|
70
|
+
bold: bool = False,
|
|
71
|
+
dim: bool = False,
|
|
72
|
+
end: str = "\n",
|
|
73
|
+
right: str = "",
|
|
74
|
+
):
|
|
64
75
|
"""Print colored text to terminal with optional right-aligned text."""
|
|
65
76
|
prefix = ""
|
|
66
77
|
if bold:
|
|
@@ -69,24 +80,25 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
|
|
|
69
80
|
prefix += Colors.DIM
|
|
70
81
|
if color:
|
|
71
82
|
prefix += color
|
|
72
|
-
|
|
83
|
+
|
|
73
84
|
if right:
|
|
74
85
|
# Get terminal width (default to 80 if unable to determine)
|
|
75
86
|
try:
|
|
76
87
|
import shutil
|
|
88
|
+
|
|
77
89
|
terminal_width = shutil.get_terminal_size().columns
|
|
78
90
|
except:
|
|
79
91
|
terminal_width = 80
|
|
80
92
|
|
|
81
93
|
# Add right margin
|
|
82
94
|
terminal_width -= 1
|
|
83
|
-
|
|
95
|
+
|
|
84
96
|
# Calculate padding needed
|
|
85
97
|
# Account for ANSI escape codes not taking visual space
|
|
86
98
|
visible_left_len = len(text)
|
|
87
99
|
visible_right_len = len(right)
|
|
88
100
|
padding = terminal_width - visible_left_len - visible_right_len
|
|
89
|
-
|
|
101
|
+
|
|
90
102
|
if padding > 0:
|
|
91
103
|
output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
|
|
92
104
|
else:
|
|
@@ -94,7 +106,7 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
|
|
|
94
106
|
output = f"{prefix}{text} {right}{Colors.RESET}"
|
|
95
107
|
else:
|
|
96
108
|
output = f"{prefix}{text}{Colors.RESET}"
|
|
97
|
-
|
|
109
|
+
|
|
98
110
|
print(output, end=end)
|
|
99
111
|
|
|
100
112
|
|
|
@@ -113,29 +125,34 @@ def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
|
|
|
113
125
|
args_str = f"('{details['text']}')"
|
|
114
126
|
elif action_type == "scroll" and "x" in details and "y" in details:
|
|
115
127
|
args_str = f"({details['x']}, {details['y']})"
|
|
116
|
-
|
|
128
|
+
|
|
117
129
|
if total_cost > 0:
|
|
118
130
|
print_colored(f"🛠️ {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
|
|
119
131
|
else:
|
|
120
132
|
print_colored(f"🛠️ {action_type}{args_str}", dim=True)
|
|
121
133
|
|
|
134
|
+
|
|
122
135
|
def print_welcome(model: str, agent_loop: str, container_name: str):
|
|
123
136
|
"""Print welcome message."""
|
|
124
137
|
print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
|
|
125
138
|
print_colored("Type 'exit' to quit.", dim=True)
|
|
126
139
|
|
|
140
|
+
|
|
127
141
|
async def ainput(prompt: str = ""):
|
|
128
142
|
return await asyncio.to_thread(input, prompt)
|
|
129
143
|
|
|
130
|
-
|
|
144
|
+
|
|
145
|
+
async def chat_loop(
|
|
146
|
+
agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True
|
|
147
|
+
):
|
|
131
148
|
"""Main chat loop with the agent."""
|
|
132
149
|
print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
|
|
133
|
-
|
|
150
|
+
|
|
134
151
|
history = []
|
|
135
|
-
|
|
152
|
+
|
|
136
153
|
if initial_prompt:
|
|
137
154
|
history.append({"role": "user", "content": initial_prompt})
|
|
138
|
-
|
|
155
|
+
|
|
139
156
|
total_cost = 0
|
|
140
157
|
|
|
141
158
|
while True:
|
|
@@ -143,28 +160,28 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
|
|
|
143
160
|
# Get user input with prompt
|
|
144
161
|
print_colored("> ", end="")
|
|
145
162
|
user_input = await ainput()
|
|
146
|
-
|
|
147
|
-
if user_input.lower() in ['exit', 'quit', 'q']:
|
|
163
|
+
|
|
164
|
+
if user_input.lower() in ["exit", "quit", "q"]:
|
|
148
165
|
print_colored("\n👋 Goodbye!")
|
|
149
166
|
break
|
|
150
|
-
|
|
167
|
+
|
|
151
168
|
if not user_input:
|
|
152
169
|
continue
|
|
153
|
-
|
|
170
|
+
|
|
154
171
|
# Add user message to history
|
|
155
172
|
history.append({"role": "user", "content": user_input})
|
|
156
|
-
|
|
173
|
+
|
|
157
174
|
# Stream responses from the agent with spinner
|
|
158
175
|
with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
|
|
159
176
|
spinner.hide()
|
|
160
|
-
|
|
177
|
+
|
|
161
178
|
async for result in agent.run(history):
|
|
162
179
|
# Add agent responses to history
|
|
163
180
|
history.extend(result.get("output", []))
|
|
164
181
|
|
|
165
182
|
if show_usage:
|
|
166
183
|
total_cost += result.get("usage", {}).get("response_cost", 0)
|
|
167
|
-
|
|
184
|
+
|
|
168
185
|
# Process and display the output
|
|
169
186
|
for item in result.get("output", []):
|
|
170
187
|
if item.get("type") == "message" and item.get("role") == "assistant":
|
|
@@ -176,7 +193,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
|
|
|
176
193
|
if text:
|
|
177
194
|
spinner.hide()
|
|
178
195
|
print_colored(text)
|
|
179
|
-
|
|
196
|
+
|
|
180
197
|
elif item.get("type") == "computer_call":
|
|
181
198
|
# Display computer action
|
|
182
199
|
action = item.get("action", {})
|
|
@@ -186,7 +203,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
|
|
|
186
203
|
print_action(action_type, action, total_cost)
|
|
187
204
|
spinner.text = f"Performing {action_type}..."
|
|
188
205
|
spinner.show()
|
|
189
|
-
|
|
206
|
+
|
|
190
207
|
elif item.get("type") == "function_call":
|
|
191
208
|
# Display function call
|
|
192
209
|
function_name = item.get("name", "")
|
|
@@ -194,18 +211,18 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
|
|
|
194
211
|
print_colored(f"🔧 Calling function: {function_name}", dim=True)
|
|
195
212
|
spinner.text = f"Calling {function_name}..."
|
|
196
213
|
spinner.show()
|
|
197
|
-
|
|
214
|
+
|
|
198
215
|
elif item.get("type") == "function_call_output":
|
|
199
216
|
# Display function output (dimmed)
|
|
200
217
|
output = item.get("output", "")
|
|
201
218
|
if output and len(output.strip()) > 0:
|
|
202
219
|
spinner.hide()
|
|
203
220
|
print_colored(f"📤 {output}", dim=True)
|
|
204
|
-
|
|
221
|
+
|
|
205
222
|
spinner.hide()
|
|
206
223
|
if show_usage and total_cost > 0:
|
|
207
224
|
print_colored(f"Total cost: ${total_cost:.2f}", dim=True)
|
|
208
|
-
|
|
225
|
+
|
|
209
226
|
|
|
210
227
|
async def main():
|
|
211
228
|
"""Main CLI function."""
|
|
@@ -218,90 +235,74 @@ Examples:
|
|
|
218
235
|
python -m agent.cli anthropic/claude-3-5-sonnet-20241022
|
|
219
236
|
python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
|
|
220
237
|
python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
|
|
221
|
-
"""
|
|
238
|
+
""",
|
|
222
239
|
)
|
|
223
|
-
|
|
240
|
+
|
|
224
241
|
parser.add_argument(
|
|
225
242
|
"model",
|
|
226
|
-
help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
|
|
243
|
+
help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')",
|
|
227
244
|
)
|
|
228
|
-
|
|
245
|
+
|
|
229
246
|
parser.add_argument(
|
|
230
247
|
"--provider",
|
|
231
248
|
choices=["cloud", "lume", "winsandbox", "docker"],
|
|
232
249
|
default="cloud",
|
|
233
|
-
help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
|
|
250
|
+
help="Computer provider to use: cloud (default), lume, winsandbox, or docker",
|
|
234
251
|
)
|
|
235
|
-
|
|
252
|
+
|
|
236
253
|
parser.add_argument(
|
|
237
254
|
"--images",
|
|
238
255
|
type=int,
|
|
239
256
|
default=3,
|
|
240
|
-
help="Number of recent images to keep in context (default: 3)"
|
|
241
|
-
)
|
|
242
|
-
|
|
243
|
-
parser.add_argument(
|
|
244
|
-
"--trajectory",
|
|
245
|
-
action="store_true",
|
|
246
|
-
help="Save trajectory for debugging"
|
|
247
|
-
)
|
|
248
|
-
|
|
249
|
-
parser.add_argument(
|
|
250
|
-
"--budget",
|
|
251
|
-
type=float,
|
|
252
|
-
help="Maximum budget for the session (in dollars)"
|
|
253
|
-
)
|
|
254
|
-
|
|
255
|
-
parser.add_argument(
|
|
256
|
-
"--verbose",
|
|
257
|
-
action="store_true",
|
|
258
|
-
help="Enable verbose logging"
|
|
257
|
+
help="Number of recent images to keep in context (default: 3)",
|
|
259
258
|
)
|
|
260
259
|
|
|
260
|
+
parser.add_argument("--trajectory", action="store_true", help="Save trajectory for debugging")
|
|
261
|
+
|
|
262
|
+
parser.add_argument("--budget", type=float, help="Maximum budget for the session (in dollars)")
|
|
263
|
+
|
|
264
|
+
parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
|
|
265
|
+
|
|
261
266
|
parser.add_argument(
|
|
262
|
-
"-p",
|
|
267
|
+
"-p",
|
|
268
|
+
"--prompt",
|
|
263
269
|
type=str,
|
|
264
|
-
help="Initial prompt to send to the agent. Leave blank for interactive mode."
|
|
270
|
+
help="Initial prompt to send to the agent. Leave blank for interactive mode.",
|
|
265
271
|
)
|
|
266
272
|
|
|
267
273
|
parser.add_argument(
|
|
268
274
|
"--prompt-file",
|
|
269
275
|
type=Path,
|
|
270
|
-
help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt."
|
|
276
|
+
help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt.",
|
|
271
277
|
)
|
|
272
278
|
|
|
273
279
|
parser.add_argument(
|
|
274
280
|
"--predict-click",
|
|
275
281
|
dest="predict_click",
|
|
276
282
|
type=str,
|
|
277
|
-
help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it."
|
|
283
|
+
help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it.",
|
|
278
284
|
)
|
|
279
285
|
|
|
280
|
-
parser.add_argument(
|
|
281
|
-
"-c", "--cache",
|
|
282
|
-
action="store_true",
|
|
283
|
-
help="Tell the API to enable caching"
|
|
284
|
-
)
|
|
286
|
+
parser.add_argument("-c", "--cache", action="store_true", help="Tell the API to enable caching")
|
|
285
287
|
|
|
286
288
|
parser.add_argument(
|
|
287
|
-
"-u", "--usage",
|
|
288
|
-
action="store_true",
|
|
289
|
-
help="Show total cost of the agent runs"
|
|
289
|
+
"-u", "--usage", action="store_true", help="Show total cost of the agent runs"
|
|
290
290
|
)
|
|
291
291
|
|
|
292
292
|
parser.add_argument(
|
|
293
|
-
"-r",
|
|
293
|
+
"-r",
|
|
294
|
+
"--max-retries",
|
|
294
295
|
type=int,
|
|
295
296
|
default=3,
|
|
296
|
-
help="Maximum number of retries for the LLM API calls"
|
|
297
|
+
help="Maximum number of retries for the LLM API calls",
|
|
297
298
|
)
|
|
298
|
-
|
|
299
|
+
|
|
299
300
|
args = parser.parse_args()
|
|
300
|
-
|
|
301
|
+
|
|
301
302
|
# Check for required environment variables
|
|
302
303
|
container_name = os.getenv("CUA_CONTAINER_NAME")
|
|
303
304
|
cua_api_key = os.getenv("CUA_API_KEY")
|
|
304
|
-
|
|
305
|
+
|
|
305
306
|
# Prompt for missing environment variables (container name always required)
|
|
306
307
|
if not container_name:
|
|
307
308
|
if args.provider == "cloud":
|
|
@@ -321,13 +322,13 @@ Examples:
|
|
|
321
322
|
if not cua_api_key:
|
|
322
323
|
print_colored("❌ API key is required for cloud provider.")
|
|
323
324
|
sys.exit(1)
|
|
324
|
-
|
|
325
|
+
|
|
325
326
|
# Check for provider-specific API keys based on model
|
|
326
327
|
provider_api_keys = {
|
|
327
328
|
"openai/": "OPENAI_API_KEY",
|
|
328
329
|
"anthropic/": "ANTHROPIC_API_KEY",
|
|
329
330
|
}
|
|
330
|
-
|
|
331
|
+
|
|
331
332
|
# Find matching provider and check for API key
|
|
332
333
|
for prefix, env_var in provider_api_keys.items():
|
|
333
334
|
if prefix in args.model:
|
|
@@ -340,7 +341,7 @@ Examples:
|
|
|
340
341
|
# Set the environment variable for the session
|
|
341
342
|
os.environ[env_var] = api_key
|
|
342
343
|
break
|
|
343
|
-
|
|
344
|
+
|
|
344
345
|
# Import here to avoid import errors if dependencies are missing
|
|
345
346
|
try:
|
|
346
347
|
from agent import ComputerAgent
|
|
@@ -349,7 +350,7 @@ Examples:
|
|
|
349
350
|
print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
|
|
350
351
|
print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
|
|
351
352
|
sys.exit(1)
|
|
352
|
-
|
|
353
|
+
|
|
353
354
|
# Resolve provider -> os_type, provider_type, api key requirement
|
|
354
355
|
provider_map = {
|
|
355
356
|
"cloud": ("linux", "cloud", True),
|
|
@@ -365,42 +366,46 @@ Examples:
|
|
|
365
366
|
"name": container_name,
|
|
366
367
|
}
|
|
367
368
|
if needs_api_key:
|
|
368
|
-
computer_kwargs["api_key"] = cua_api_key
|
|
369
|
+
computer_kwargs["api_key"] = cua_api_key # type: ignore
|
|
369
370
|
|
|
370
371
|
# Create computer instance
|
|
371
|
-
async with Computer(**computer_kwargs) as computer:
|
|
372
|
-
|
|
372
|
+
async with Computer(**computer_kwargs) as computer: # type: ignore
|
|
373
|
+
|
|
373
374
|
# Create agent
|
|
374
375
|
agent_kwargs = {
|
|
375
376
|
"model": args.model,
|
|
376
377
|
"tools": [computer],
|
|
377
|
-
"trust_remote_code": True,
|
|
378
|
+
"trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
|
|
378
379
|
"verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
|
|
379
|
-
"max_retries": args.max_retries
|
|
380
|
+
"max_retries": args.max_retries,
|
|
380
381
|
}
|
|
381
382
|
|
|
382
383
|
if args.images > 0:
|
|
383
384
|
agent_kwargs["only_n_most_recent_images"] = args.images
|
|
384
|
-
|
|
385
|
+
|
|
385
386
|
if args.trajectory:
|
|
386
387
|
agent_kwargs["trajectory_dir"] = "trajectories"
|
|
387
|
-
|
|
388
|
+
|
|
388
389
|
if args.budget:
|
|
389
390
|
agent_kwargs["max_trajectory_budget"] = {
|
|
390
391
|
"max_budget": args.budget,
|
|
391
392
|
"raise_error": True,
|
|
392
|
-
"reset_after_each_run": False
|
|
393
|
+
"reset_after_each_run": False,
|
|
393
394
|
}
|
|
394
395
|
|
|
395
396
|
if args.cache:
|
|
396
397
|
agent_kwargs["use_prompt_caching"] = True
|
|
397
|
-
|
|
398
|
+
|
|
398
399
|
agent = ComputerAgent(**agent_kwargs)
|
|
399
|
-
|
|
400
|
+
|
|
400
401
|
# If predict-click mode is requested, run once and exit
|
|
401
402
|
if args.predict_click:
|
|
402
403
|
if not PIL_AVAILABLE:
|
|
403
|
-
print_colored(
|
|
404
|
+
print_colored(
|
|
405
|
+
"❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow",
|
|
406
|
+
Colors.RED,
|
|
407
|
+
bold=True,
|
|
408
|
+
)
|
|
404
409
|
sys.exit(1)
|
|
405
410
|
|
|
406
411
|
instruction = args.predict_click
|
|
@@ -435,6 +440,7 @@ Examples:
|
|
|
435
440
|
|
|
436
441
|
try:
|
|
437
442
|
from io import BytesIO
|
|
443
|
+
|
|
438
444
|
with Image.open(BytesIO(img_bytes)) as img:
|
|
439
445
|
img = img.convert("RGB")
|
|
440
446
|
draw = ImageDraw.Draw(img)
|
|
@@ -457,9 +463,9 @@ Examples:
|
|
|
457
463
|
if system == "windows":
|
|
458
464
|
os.startfile(str(out_path)) # type: ignore[attr-defined]
|
|
459
465
|
elif system == "darwin":
|
|
460
|
-
os.system(f
|
|
466
|
+
os.system(f'open "{out_path}"')
|
|
461
467
|
else:
|
|
462
|
-
os.system(f
|
|
468
|
+
os.system(f'xdg-open "{out_path}"')
|
|
463
469
|
except Exception:
|
|
464
470
|
pass
|
|
465
471
|
except Exception as e:
|
|
@@ -482,9 +488,8 @@ Examples:
|
|
|
482
488
|
await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
|
|
483
489
|
|
|
484
490
|
|
|
485
|
-
|
|
486
491
|
if __name__ == "__main__":
|
|
487
492
|
try:
|
|
488
493
|
asyncio.run(main())
|
|
489
494
|
except (KeyboardInterrupt, EOFError) as _:
|
|
490
|
-
print_colored("\n\n👋 Goodbye!")
|
|
495
|
+
print_colored("\n\n👋 Goodbye!")
|
agent/computers/__init__.py
CHANGED
|
@@ -6,27 +6,32 @@ computer interface types, supporting both the ComputerHandler protocol and the
|
|
|
6
6
|
Computer library interface.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
+
from computer import Computer as cuaComputer
|
|
10
|
+
|
|
9
11
|
from .base import AsyncComputerHandler
|
|
10
12
|
from .cua import cuaComputerHandler
|
|
11
13
|
from .custom import CustomComputerHandler
|
|
12
|
-
|
|
14
|
+
|
|
13
15
|
|
|
14
16
|
def is_agent_computer(computer):
|
|
15
17
|
"""Check if the given computer is a ComputerHandler or CUA Computer."""
|
|
16
|
-
return
|
|
17
|
-
isinstance(computer,
|
|
18
|
-
|
|
18
|
+
return (
|
|
19
|
+
isinstance(computer, AsyncComputerHandler)
|
|
20
|
+
or isinstance(computer, cuaComputer)
|
|
21
|
+
or (isinstance(computer, dict))
|
|
22
|
+
) # and "screenshot" in computer)
|
|
23
|
+
|
|
19
24
|
|
|
20
25
|
async def make_computer_handler(computer):
|
|
21
26
|
"""
|
|
22
27
|
Create a computer handler from a computer interface.
|
|
23
|
-
|
|
28
|
+
|
|
24
29
|
Args:
|
|
25
30
|
computer: Either a ComputerHandler instance, Computer instance, or dict of functions
|
|
26
|
-
|
|
31
|
+
|
|
27
32
|
Returns:
|
|
28
33
|
ComputerHandler: A computer handler instance
|
|
29
|
-
|
|
34
|
+
|
|
30
35
|
Raises:
|
|
31
36
|
ValueError: If the computer type is not supported
|
|
32
37
|
"""
|
|
@@ -38,4 +43,4 @@ async def make_computer_handler(computer):
|
|
|
38
43
|
return computer_handler
|
|
39
44
|
if isinstance(computer, dict):
|
|
40
45
|
return CustomComputerHandler(computer)
|
|
41
|
-
raise ValueError(f"Unsupported computer type: {type(computer)}")
|
|
46
|
+
raise ValueError(f"Unsupported computer type: {type(computer)}")
|
agent/computers/base.py
CHANGED
|
@@ -2,69 +2,82 @@
|
|
|
2
2
|
Base computer interface protocol for agent interactions.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from typing import
|
|
5
|
+
from typing import (
|
|
6
|
+
Any,
|
|
7
|
+
Dict,
|
|
8
|
+
List,
|
|
9
|
+
Literal,
|
|
10
|
+
Optional,
|
|
11
|
+
Protocol,
|
|
12
|
+
Union,
|
|
13
|
+
runtime_checkable,
|
|
14
|
+
)
|
|
6
15
|
|
|
7
16
|
|
|
8
17
|
@runtime_checkable
|
|
9
18
|
class AsyncComputerHandler(Protocol):
|
|
10
19
|
"""Protocol defining the interface for computer interactions."""
|
|
11
|
-
|
|
12
|
-
# ==== Computer-Use-Preview Action Space ====
|
|
20
|
+
|
|
21
|
+
# ==== Computer-Use-Preview Action Space ====
|
|
13
22
|
|
|
14
23
|
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
|
|
15
24
|
"""Get the current environment type."""
|
|
16
25
|
...
|
|
17
|
-
|
|
26
|
+
|
|
18
27
|
async def get_dimensions(self) -> tuple[int, int]:
|
|
19
28
|
"""Get screen dimensions as (width, height)."""
|
|
20
29
|
...
|
|
21
|
-
|
|
22
|
-
async def screenshot(self) -> str:
|
|
23
|
-
"""Take a screenshot and return as base64 string.
|
|
30
|
+
|
|
31
|
+
async def screenshot(self, text: Optional[str] = None) -> str:
|
|
32
|
+
"""Take a screenshot and return as base64 string.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
|
|
36
|
+
"""
|
|
24
37
|
...
|
|
25
|
-
|
|
38
|
+
|
|
26
39
|
async def click(self, x: int, y: int, button: str = "left") -> None:
|
|
27
40
|
"""Click at coordinates with specified button."""
|
|
28
41
|
...
|
|
29
|
-
|
|
42
|
+
|
|
30
43
|
async def double_click(self, x: int, y: int) -> None:
|
|
31
44
|
"""Double click at coordinates."""
|
|
32
45
|
...
|
|
33
|
-
|
|
46
|
+
|
|
34
47
|
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
|
|
35
48
|
"""Scroll at coordinates with specified scroll amounts."""
|
|
36
49
|
...
|
|
37
|
-
|
|
50
|
+
|
|
38
51
|
async def type(self, text: str) -> None:
|
|
39
52
|
"""Type text."""
|
|
40
53
|
...
|
|
41
|
-
|
|
54
|
+
|
|
42
55
|
async def wait(self, ms: int = 1000) -> None:
|
|
43
56
|
"""Wait for specified milliseconds."""
|
|
44
57
|
...
|
|
45
|
-
|
|
58
|
+
|
|
46
59
|
async def move(self, x: int, y: int) -> None:
|
|
47
60
|
"""Move cursor to coordinates."""
|
|
48
61
|
...
|
|
49
|
-
|
|
62
|
+
|
|
50
63
|
async def keypress(self, keys: Union[List[str], str]) -> None:
|
|
51
64
|
"""Press key combination."""
|
|
52
65
|
...
|
|
53
|
-
|
|
66
|
+
|
|
54
67
|
async def drag(self, path: List[Dict[str, int]]) -> None:
|
|
55
68
|
"""Drag along specified path."""
|
|
56
69
|
...
|
|
57
|
-
|
|
70
|
+
|
|
58
71
|
async def get_current_url(self) -> str:
|
|
59
72
|
"""Get current URL (for browser environments)."""
|
|
60
73
|
...
|
|
61
|
-
|
|
62
|
-
# ==== Anthropic Action Space ====
|
|
74
|
+
|
|
75
|
+
# ==== Anthropic Action Space ====
|
|
63
76
|
|
|
64
77
|
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
|
|
65
78
|
"""Left mouse down at coordinates."""
|
|
66
79
|
...
|
|
67
|
-
|
|
80
|
+
|
|
68
81
|
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
|
|
69
82
|
"""Left mouse up at coordinates."""
|
|
70
83
|
...
|