cua-agent: cua_agent-0.4.14-py3-none-any.whl → cua_agent-0.7.16-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -19
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +6 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +370 -0
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +431 -241
- agent/callbacks/__init__.py +10 -3
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +140 -0
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +106 -69
- agent/callbacks/trajectory_saver.py +178 -70
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +164 -74
- agent/integrations/hud/agent.py +338 -342
- agent/integrations/hud/proxy.py +297 -0
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +590 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +142 -144
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +63 -56
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +262 -212
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +196 -0
- agent/proxy/handlers.py +255 -0
- agent/responses.py +486 -339
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +20 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/computer_handler.py +0 -187
- agent/telemetry.py +0 -142
- cua_agent-0.4.14.dist-info/METADATA +0 -436
- cua_agent-0.4.14.dist-info/RECORD +0 -50
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
agent/cli.py
CHANGED
|
@@ -3,55 +3,75 @@ CLI chat interface for agent - Computer Use Agent
|
|
|
3
3
|
|
|
4
4
|
Usage:
|
|
5
5
|
python -m agent.cli <model_string>
|
|
6
|
-
|
|
6
|
+
|
|
7
7
|
Examples:
|
|
8
8
|
python -m agent.cli openai/computer-use-preview
|
|
9
|
-
python -m agent.cli anthropic/claude-
|
|
10
|
-
python -m agent.cli omniparser+anthropic/claude-
|
|
9
|
+
python -m agent.cli anthropic/claude-sonnet-4-5-20250929
|
|
10
|
+
python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
try:
|
|
14
|
-
import asyncio
|
|
15
14
|
import argparse
|
|
15
|
+
import asyncio
|
|
16
|
+
import base64
|
|
17
|
+
import json
|
|
16
18
|
import os
|
|
19
|
+
import platform
|
|
17
20
|
import sys
|
|
18
|
-
import
|
|
19
|
-
from
|
|
21
|
+
import time
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any, Dict, List
|
|
24
|
+
|
|
20
25
|
import dotenv
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
from PIL import Image, ImageDraw
|
|
29
|
+
|
|
30
|
+
PIL_AVAILABLE = True
|
|
31
|
+
except Exception:
|
|
32
|
+
PIL_AVAILABLE = False
|
|
21
33
|
from yaspin import yaspin
|
|
22
34
|
except ImportError:
|
|
23
35
|
if __name__ == "__main__":
|
|
24
36
|
raise ImportError(
|
|
25
|
-
"CLI dependencies not found. "
|
|
26
|
-
"Please install with: pip install \"cua-agent[cli]\""
|
|
37
|
+
"CLI dependencies not found. " 'Please install with: pip install "cua-agent[cli]"'
|
|
27
38
|
)
|
|
28
39
|
|
|
29
40
|
# Load environment variables
|
|
30
41
|
dotenv.load_dotenv()
|
|
31
42
|
|
|
43
|
+
|
|
32
44
|
# Color codes for terminal output
|
|
33
45
|
class Colors:
|
|
34
|
-
RESET =
|
|
35
|
-
BOLD =
|
|
36
|
-
DIM =
|
|
37
|
-
|
|
46
|
+
RESET = "\033[0m"
|
|
47
|
+
BOLD = "\033[1m"
|
|
48
|
+
DIM = "\033[2m"
|
|
49
|
+
|
|
38
50
|
# Text colors
|
|
39
|
-
RED =
|
|
40
|
-
GREEN =
|
|
41
|
-
YELLOW =
|
|
42
|
-
BLUE =
|
|
43
|
-
MAGENTA =
|
|
44
|
-
CYAN =
|
|
45
|
-
WHITE =
|
|
46
|
-
GRAY =
|
|
47
|
-
|
|
51
|
+
RED = "\033[31m"
|
|
52
|
+
GREEN = "\033[32m"
|
|
53
|
+
YELLOW = "\033[33m"
|
|
54
|
+
BLUE = "\033[34m"
|
|
55
|
+
MAGENTA = "\033[35m"
|
|
56
|
+
CYAN = "\033[36m"
|
|
57
|
+
WHITE = "\033[37m"
|
|
58
|
+
GRAY = "\033[90m"
|
|
59
|
+
|
|
48
60
|
# Background colors
|
|
49
|
-
BG_RED =
|
|
50
|
-
BG_GREEN =
|
|
51
|
-
BG_YELLOW =
|
|
52
|
-
BG_BLUE =
|
|
61
|
+
BG_RED = "\033[41m"
|
|
62
|
+
BG_GREEN = "\033[42m"
|
|
63
|
+
BG_YELLOW = "\033[43m"
|
|
64
|
+
BG_BLUE = "\033[44m"
|
|
53
65
|
|
|
54
|
-
|
|
66
|
+
|
|
67
|
+
def print_colored(
|
|
68
|
+
text: str,
|
|
69
|
+
color: str = "",
|
|
70
|
+
bold: bool = False,
|
|
71
|
+
dim: bool = False,
|
|
72
|
+
end: str = "\n",
|
|
73
|
+
right: str = "",
|
|
74
|
+
):
|
|
55
75
|
"""Print colored text to terminal with optional right-aligned text."""
|
|
56
76
|
prefix = ""
|
|
57
77
|
if bold:
|
|
@@ -60,24 +80,25 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
|
|
|
60
80
|
prefix += Colors.DIM
|
|
61
81
|
if color:
|
|
62
82
|
prefix += color
|
|
63
|
-
|
|
83
|
+
|
|
64
84
|
if right:
|
|
65
85
|
# Get terminal width (default to 80 if unable to determine)
|
|
66
86
|
try:
|
|
67
87
|
import shutil
|
|
88
|
+
|
|
68
89
|
terminal_width = shutil.get_terminal_size().columns
|
|
69
90
|
except:
|
|
70
91
|
terminal_width = 80
|
|
71
92
|
|
|
72
93
|
# Add right margin
|
|
73
94
|
terminal_width -= 1
|
|
74
|
-
|
|
95
|
+
|
|
75
96
|
# Calculate padding needed
|
|
76
97
|
# Account for ANSI escape codes not taking visual space
|
|
77
98
|
visible_left_len = len(text)
|
|
78
99
|
visible_right_len = len(right)
|
|
79
100
|
padding = terminal_width - visible_left_len - visible_right_len
|
|
80
|
-
|
|
101
|
+
|
|
81
102
|
if padding > 0:
|
|
82
103
|
output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
|
|
83
104
|
else:
|
|
@@ -85,7 +106,7 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
|
|
|
85
106
|
output = f"{prefix}{text} {right}{Colors.RESET}"
|
|
86
107
|
else:
|
|
87
108
|
output = f"{prefix}{text}{Colors.RESET}"
|
|
88
|
-
|
|
109
|
+
|
|
89
110
|
print(output, end=end)
|
|
90
111
|
|
|
91
112
|
|
|
@@ -104,29 +125,34 @@ def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
|
|
|
104
125
|
args_str = f"('{details['text']}')"
|
|
105
126
|
elif action_type == "scroll" and "x" in details and "y" in details:
|
|
106
127
|
args_str = f"({details['x']}, {details['y']})"
|
|
107
|
-
|
|
128
|
+
|
|
108
129
|
if total_cost > 0:
|
|
109
130
|
print_colored(f"🛠️ {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
|
|
110
131
|
else:
|
|
111
132
|
print_colored(f"🛠️ {action_type}{args_str}", dim=True)
|
|
112
133
|
|
|
134
|
+
|
|
113
135
|
def print_welcome(model: str, agent_loop: str, container_name: str):
|
|
114
136
|
"""Print welcome message."""
|
|
115
137
|
print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
|
|
116
138
|
print_colored("Type 'exit' to quit.", dim=True)
|
|
117
139
|
|
|
140
|
+
|
|
118
141
|
async def ainput(prompt: str = ""):
|
|
119
142
|
return await asyncio.to_thread(input, prompt)
|
|
120
143
|
|
|
121
|
-
|
|
144
|
+
|
|
145
|
+
async def chat_loop(
|
|
146
|
+
agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True
|
|
147
|
+
):
|
|
122
148
|
"""Main chat loop with the agent."""
|
|
123
149
|
print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
|
|
124
|
-
|
|
150
|
+
|
|
125
151
|
history = []
|
|
126
|
-
|
|
152
|
+
|
|
127
153
|
if initial_prompt:
|
|
128
154
|
history.append({"role": "user", "content": initial_prompt})
|
|
129
|
-
|
|
155
|
+
|
|
130
156
|
total_cost = 0
|
|
131
157
|
|
|
132
158
|
while True:
|
|
@@ -134,31 +160,31 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
|
|
|
134
160
|
# Get user input with prompt
|
|
135
161
|
print_colored("> ", end="")
|
|
136
162
|
user_input = await ainput()
|
|
137
|
-
|
|
138
|
-
if user_input.lower() in [
|
|
163
|
+
|
|
164
|
+
if user_input.lower() in ["exit", "quit", "q"]:
|
|
139
165
|
print_colored("\n👋 Goodbye!")
|
|
140
166
|
break
|
|
141
|
-
|
|
167
|
+
|
|
142
168
|
if not user_input:
|
|
143
169
|
continue
|
|
144
|
-
|
|
170
|
+
|
|
145
171
|
# Add user message to history
|
|
146
172
|
history.append({"role": "user", "content": user_input})
|
|
147
|
-
|
|
173
|
+
|
|
148
174
|
# Stream responses from the agent with spinner
|
|
149
175
|
with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
|
|
150
176
|
spinner.hide()
|
|
151
|
-
|
|
177
|
+
|
|
152
178
|
async for result in agent.run(history):
|
|
153
179
|
# Add agent responses to history
|
|
154
180
|
history.extend(result.get("output", []))
|
|
155
181
|
|
|
156
182
|
if show_usage:
|
|
157
183
|
total_cost += result.get("usage", {}).get("response_cost", 0)
|
|
158
|
-
|
|
184
|
+
|
|
159
185
|
# Process and display the output
|
|
160
186
|
for item in result.get("output", []):
|
|
161
|
-
if item.get("type") == "message":
|
|
187
|
+
if item.get("type") == "message" and item.get("role") == "assistant":
|
|
162
188
|
# Display agent text response
|
|
163
189
|
content = item.get("content", [])
|
|
164
190
|
for content_part in content:
|
|
@@ -167,7 +193,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
|
|
|
167
193
|
if text:
|
|
168
194
|
spinner.hide()
|
|
169
195
|
print_colored(text)
|
|
170
|
-
|
|
196
|
+
|
|
171
197
|
elif item.get("type") == "computer_call":
|
|
172
198
|
# Display computer action
|
|
173
199
|
action = item.get("action", {})
|
|
@@ -177,7 +203,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
|
|
|
177
203
|
print_action(action_type, action, total_cost)
|
|
178
204
|
spinner.text = f"Performing {action_type}..."
|
|
179
205
|
spinner.show()
|
|
180
|
-
|
|
206
|
+
|
|
181
207
|
elif item.get("type") == "function_call":
|
|
182
208
|
# Display function call
|
|
183
209
|
function_name = item.get("name", "")
|
|
@@ -185,121 +211,141 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
|
|
|
185
211
|
print_colored(f"🔧 Calling function: {function_name}", dim=True)
|
|
186
212
|
spinner.text = f"Calling {function_name}..."
|
|
187
213
|
spinner.show()
|
|
188
|
-
|
|
214
|
+
|
|
189
215
|
elif item.get("type") == "function_call_output":
|
|
190
216
|
# Display function output (dimmed)
|
|
191
217
|
output = item.get("output", "")
|
|
192
218
|
if output and len(output.strip()) > 0:
|
|
193
219
|
spinner.hide()
|
|
194
220
|
print_colored(f"📤 {output}", dim=True)
|
|
195
|
-
|
|
221
|
+
|
|
196
222
|
spinner.hide()
|
|
197
223
|
if show_usage and total_cost > 0:
|
|
198
224
|
print_colored(f"Total cost: ${total_cost:.2f}", dim=True)
|
|
199
|
-
|
|
225
|
+
|
|
200
226
|
|
|
201
227
|
async def main():
|
|
202
228
|
"""Main CLI function."""
|
|
203
229
|
parser = argparse.ArgumentParser(
|
|
204
|
-
description="
|
|
230
|
+
description="Cua Agent CLI - Interactive computer use assistant",
|
|
205
231
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
206
232
|
epilog="""
|
|
207
233
|
Examples:
|
|
208
234
|
python -m agent.cli openai/computer-use-preview
|
|
209
|
-
python -m agent.cli anthropic/claude-
|
|
210
|
-
python -m agent.cli omniparser+anthropic/claude-
|
|
235
|
+
python -m agent.cli anthropic/claude-sonnet-4-5-20250929
|
|
236
|
+
python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
|
|
211
237
|
python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
|
|
212
|
-
"""
|
|
238
|
+
""",
|
|
213
239
|
)
|
|
214
|
-
|
|
240
|
+
|
|
215
241
|
parser.add_argument(
|
|
216
242
|
"model",
|
|
217
|
-
help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-
|
|
243
|
+
help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-sonnet-4-5-20250929')",
|
|
218
244
|
)
|
|
219
|
-
|
|
245
|
+
|
|
246
|
+
parser.add_argument(
|
|
247
|
+
"--provider",
|
|
248
|
+
choices=["cloud", "lume", "winsandbox", "docker"],
|
|
249
|
+
default="cloud",
|
|
250
|
+
help="Computer provider to use: cloud (default), lume, winsandbox, or docker",
|
|
251
|
+
)
|
|
252
|
+
|
|
220
253
|
parser.add_argument(
|
|
221
254
|
"--images",
|
|
222
255
|
type=int,
|
|
223
256
|
default=3,
|
|
224
|
-
help="Number of recent images to keep in context (default: 3)"
|
|
225
|
-
)
|
|
226
|
-
|
|
227
|
-
parser.add_argument(
|
|
228
|
-
"--trajectory",
|
|
229
|
-
action="store_true",
|
|
230
|
-
help="Save trajectory for debugging"
|
|
257
|
+
help="Number of recent images to keep in context (default: 3)",
|
|
231
258
|
)
|
|
232
|
-
|
|
259
|
+
|
|
260
|
+
parser.add_argument("--trajectory", action="store_true", help="Save trajectory for debugging")
|
|
261
|
+
|
|
262
|
+
parser.add_argument("--budget", type=float, help="Maximum budget for the session (in dollars)")
|
|
263
|
+
|
|
264
|
+
parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
|
|
265
|
+
|
|
233
266
|
parser.add_argument(
|
|
234
|
-
"
|
|
235
|
-
|
|
236
|
-
|
|
267
|
+
"-p",
|
|
268
|
+
"--prompt",
|
|
269
|
+
type=str,
|
|
270
|
+
help="Initial prompt to send to the agent. Leave blank for interactive mode.",
|
|
237
271
|
)
|
|
238
|
-
|
|
272
|
+
|
|
239
273
|
parser.add_argument(
|
|
240
|
-
"--
|
|
241
|
-
|
|
242
|
-
help="
|
|
274
|
+
"--prompt-file",
|
|
275
|
+
type=Path,
|
|
276
|
+
help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt.",
|
|
243
277
|
)
|
|
244
278
|
|
|
245
279
|
parser.add_argument(
|
|
246
|
-
"-
|
|
280
|
+
"--predict-click",
|
|
281
|
+
dest="predict_click",
|
|
247
282
|
type=str,
|
|
248
|
-
help="
|
|
283
|
+
help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it.",
|
|
249
284
|
)
|
|
250
285
|
|
|
251
|
-
parser.add_argument(
|
|
252
|
-
"-c", "--cache",
|
|
253
|
-
action="store_true",
|
|
254
|
-
help="Tell the API to enable caching"
|
|
255
|
-
)
|
|
286
|
+
parser.add_argument("-c", "--cache", action="store_true", help="Tell the API to enable caching")
|
|
256
287
|
|
|
257
288
|
parser.add_argument(
|
|
258
|
-
"-u", "--usage",
|
|
259
|
-
action="store_true",
|
|
260
|
-
help="Show total cost of the agent runs"
|
|
289
|
+
"-u", "--usage", action="store_true", help="Show total cost of the agent runs"
|
|
261
290
|
)
|
|
262
291
|
|
|
263
292
|
parser.add_argument(
|
|
264
|
-
"-r",
|
|
293
|
+
"-r",
|
|
294
|
+
"--max-retries",
|
|
265
295
|
type=int,
|
|
266
296
|
default=3,
|
|
267
|
-
help="Maximum number of retries for the LLM API calls"
|
|
297
|
+
help="Maximum number of retries for the LLM API calls",
|
|
268
298
|
)
|
|
269
|
-
|
|
299
|
+
|
|
300
|
+
# Provider override credentials
|
|
301
|
+
parser.add_argument(
|
|
302
|
+
"--api-key",
|
|
303
|
+
dest="api_key",
|
|
304
|
+
type=str,
|
|
305
|
+
help="API key override for the model provider (passed to ComputerAgent)",
|
|
306
|
+
)
|
|
307
|
+
parser.add_argument(
|
|
308
|
+
"--api-base",
|
|
309
|
+
dest="api_base",
|
|
310
|
+
type=str,
|
|
311
|
+
help="API base URL override for the model provider (passed to ComputerAgent)",
|
|
312
|
+
)
|
|
313
|
+
|
|
270
314
|
args = parser.parse_args()
|
|
271
|
-
|
|
315
|
+
|
|
272
316
|
# Check for required environment variables
|
|
273
317
|
container_name = os.getenv("CUA_CONTAINER_NAME")
|
|
274
318
|
cua_api_key = os.getenv("CUA_API_KEY")
|
|
275
|
-
|
|
276
|
-
# Prompt for missing environment variables
|
|
319
|
+
|
|
320
|
+
# Prompt for missing environment variables (container name always required)
|
|
277
321
|
if not container_name:
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
322
|
+
if args.provider == "cloud":
|
|
323
|
+
print_colored("CUA_CONTAINER_NAME not set.", dim=True)
|
|
324
|
+
print_colored("You can get a Cua container at https://cua.ai/", dim=True)
|
|
325
|
+
container_name = input("Enter your Cua container name: ").strip()
|
|
326
|
+
if not container_name:
|
|
327
|
+
print_colored("❌ Container name is required.")
|
|
328
|
+
sys.exit(1)
|
|
329
|
+
else:
|
|
330
|
+
container_name = "cli-sandbox"
|
|
331
|
+
|
|
332
|
+
# Only require API key for cloud provider
|
|
333
|
+
if args.provider == "cloud" and not cua_api_key:
|
|
286
334
|
print_colored("CUA_API_KEY not set.", dim=True)
|
|
287
|
-
cua_api_key = input("Enter your
|
|
335
|
+
cua_api_key = input("Enter your Cua API key: ").strip()
|
|
288
336
|
if not cua_api_key:
|
|
289
|
-
print_colored("❌ API key is required.")
|
|
337
|
+
print_colored("❌ API key is required for cloud provider.")
|
|
290
338
|
sys.exit(1)
|
|
291
|
-
|
|
339
|
+
|
|
292
340
|
# Check for provider-specific API keys based on model
|
|
293
341
|
provider_api_keys = {
|
|
294
342
|
"openai/": "OPENAI_API_KEY",
|
|
295
343
|
"anthropic/": "ANTHROPIC_API_KEY",
|
|
296
|
-
"omniparser+": "OPENAI_API_KEY",
|
|
297
|
-
"omniparser+": "ANTHROPIC_API_KEY",
|
|
298
344
|
}
|
|
299
|
-
|
|
345
|
+
|
|
300
346
|
# Find matching provider and check for API key
|
|
301
347
|
for prefix, env_var in provider_api_keys.items():
|
|
302
|
-
if args.model
|
|
348
|
+
if prefix in args.model:
|
|
303
349
|
if not os.getenv(env_var):
|
|
304
350
|
print_colored(f"{env_var} not set.", dim=True)
|
|
305
351
|
api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
|
|
@@ -309,7 +355,7 @@ Examples:
|
|
|
309
355
|
# Set the environment variable for the session
|
|
310
356
|
os.environ[env_var] = api_key
|
|
311
357
|
break
|
|
312
|
-
|
|
358
|
+
|
|
313
359
|
# Import here to avoid import errors if dependencies are missing
|
|
314
360
|
try:
|
|
315
361
|
from agent import ComputerAgent
|
|
@@ -318,48 +364,152 @@ Examples:
|
|
|
318
364
|
print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
|
|
319
365
|
print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
|
|
320
366
|
sys.exit(1)
|
|
321
|
-
|
|
367
|
+
|
|
368
|
+
# Resolve provider -> os_type, provider_type, api key requirement
|
|
369
|
+
provider_map = {
|
|
370
|
+
"cloud": ("linux", "cloud", True),
|
|
371
|
+
"lume": ("macos", "lume", False),
|
|
372
|
+
"winsandbox": ("windows", "winsandbox", False),
|
|
373
|
+
"docker": ("linux", "docker", False),
|
|
374
|
+
}
|
|
375
|
+
os_type, provider_type, needs_api_key = provider_map[args.provider]
|
|
376
|
+
|
|
377
|
+
computer_kwargs = {
|
|
378
|
+
"os_type": os_type,
|
|
379
|
+
"provider_type": provider_type,
|
|
380
|
+
"name": container_name,
|
|
381
|
+
}
|
|
382
|
+
if needs_api_key:
|
|
383
|
+
computer_kwargs["api_key"] = cua_api_key # type: ignore
|
|
384
|
+
|
|
322
385
|
# Create computer instance
|
|
323
|
-
async with Computer(
|
|
324
|
-
|
|
325
|
-
provider_type="cloud",
|
|
326
|
-
name=container_name,
|
|
327
|
-
api_key=cua_api_key
|
|
328
|
-
) as computer:
|
|
329
|
-
|
|
386
|
+
async with Computer(**computer_kwargs) as computer: # type: ignore
|
|
387
|
+
|
|
330
388
|
# Create agent
|
|
331
389
|
agent_kwargs = {
|
|
332
390
|
"model": args.model,
|
|
333
391
|
"tools": [computer],
|
|
392
|
+
"trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
|
|
334
393
|
"verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
|
|
335
|
-
"max_retries": args.max_retries
|
|
394
|
+
"max_retries": args.max_retries,
|
|
336
395
|
}
|
|
337
396
|
|
|
397
|
+
# Thread API credentials to agent if provided
|
|
398
|
+
if args.api_key:
|
|
399
|
+
agent_kwargs["api_key"] = args.api_key
|
|
400
|
+
if args.api_base:
|
|
401
|
+
agent_kwargs["api_base"] = args.api_base
|
|
402
|
+
|
|
338
403
|
if args.images > 0:
|
|
339
404
|
agent_kwargs["only_n_most_recent_images"] = args.images
|
|
340
|
-
|
|
405
|
+
|
|
341
406
|
if args.trajectory:
|
|
342
407
|
agent_kwargs["trajectory_dir"] = "trajectories"
|
|
343
|
-
|
|
408
|
+
|
|
344
409
|
if args.budget:
|
|
345
410
|
agent_kwargs["max_trajectory_budget"] = {
|
|
346
411
|
"max_budget": args.budget,
|
|
347
412
|
"raise_error": True,
|
|
348
|
-
"reset_after_each_run": False
|
|
413
|
+
"reset_after_each_run": False,
|
|
349
414
|
}
|
|
350
415
|
|
|
351
416
|
if args.cache:
|
|
352
417
|
agent_kwargs["use_prompt_caching"] = True
|
|
353
|
-
|
|
418
|
+
|
|
354
419
|
agent = ComputerAgent(**agent_kwargs)
|
|
355
|
-
|
|
356
|
-
# Start chat loop
|
|
357
|
-
await chat_loop(agent, args.model, container_name, args.prompt, args.usage)
|
|
358
420
|
|
|
421
|
+
# If predict-click mode is requested, run once and exit
|
|
422
|
+
if args.predict_click:
|
|
423
|
+
if not PIL_AVAILABLE:
|
|
424
|
+
print_colored(
|
|
425
|
+
"❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow",
|
|
426
|
+
Colors.RED,
|
|
427
|
+
bold=True,
|
|
428
|
+
)
|
|
429
|
+
sys.exit(1)
|
|
430
|
+
|
|
431
|
+
instruction = args.predict_click
|
|
432
|
+
print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN)
|
|
433
|
+
|
|
434
|
+
# Take a fresh screenshot FIRST
|
|
435
|
+
try:
|
|
436
|
+
img_bytes = await computer.interface.screenshot()
|
|
437
|
+
except Exception as e:
|
|
438
|
+
print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True)
|
|
439
|
+
sys.exit(1)
|
|
440
|
+
|
|
441
|
+
# Encode screenshot to base64 for predict_click
|
|
442
|
+
try:
|
|
443
|
+
image_b64 = base64.b64encode(img_bytes).decode("utf-8")
|
|
444
|
+
except Exception as e:
|
|
445
|
+
print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True)
|
|
446
|
+
sys.exit(1)
|
|
447
|
+
|
|
448
|
+
try:
|
|
449
|
+
coords = await agent.predict_click(instruction, image_b64=image_b64)
|
|
450
|
+
except Exception as e:
|
|
451
|
+
print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True)
|
|
452
|
+
sys.exit(1)
|
|
453
|
+
|
|
454
|
+
if not coords:
|
|
455
|
+
print_colored("⚠️ No coordinates returned.", Colors.YELLOW)
|
|
456
|
+
sys.exit(2)
|
|
457
|
+
|
|
458
|
+
x, y = coords
|
|
459
|
+
print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN)
|
|
460
|
+
|
|
461
|
+
try:
|
|
462
|
+
from io import BytesIO
|
|
463
|
+
|
|
464
|
+
with Image.open(BytesIO(img_bytes)) as img:
|
|
465
|
+
img = img.convert("RGB")
|
|
466
|
+
draw = ImageDraw.Draw(img)
|
|
467
|
+
# Draw crosshair
|
|
468
|
+
size = 12
|
|
469
|
+
color = (255, 0, 0)
|
|
470
|
+
draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
|
|
471
|
+
draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
|
|
472
|
+
# Optional small circle
|
|
473
|
+
r = 6
|
|
474
|
+
draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
|
|
475
|
+
|
|
476
|
+
out_path = Path.cwd() / f"predict_click_{int(time.time())}.png"
|
|
477
|
+
img.save(out_path)
|
|
478
|
+
print_colored(f"🖼️ Saved to {out_path}")
|
|
479
|
+
|
|
480
|
+
# Open the image with default viewer
|
|
481
|
+
try:
|
|
482
|
+
system = platform.system().lower()
|
|
483
|
+
if system == "windows":
|
|
484
|
+
os.startfile(str(out_path)) # type: ignore[attr-defined]
|
|
485
|
+
elif system == "darwin":
|
|
486
|
+
os.system(f'open "{out_path}"')
|
|
487
|
+
else:
|
|
488
|
+
os.system(f'xdg-open "{out_path}"')
|
|
489
|
+
except Exception:
|
|
490
|
+
pass
|
|
491
|
+
except Exception as e:
|
|
492
|
+
print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True)
|
|
493
|
+
sys.exit(1)
|
|
494
|
+
|
|
495
|
+
# Done
|
|
496
|
+
sys.exit(0)
|
|
497
|
+
|
|
498
|
+
# Resolve initial prompt from --prompt-file or --prompt
|
|
499
|
+
initial_prompt = args.prompt or ""
|
|
500
|
+
if args.prompt_file:
|
|
501
|
+
try:
|
|
502
|
+
initial_prompt = args.prompt_file.read_text(encoding="utf-8")
|
|
503
|
+
except Exception as e:
|
|
504
|
+
print_colored(f"❌ Failed to read --prompt-file: {e}", Colors.RED, bold=True)
|
|
505
|
+
sys.exit(1)
|
|
506
|
+
|
|
507
|
+
# Start chat loop (default interactive mode)
|
|
508
|
+
await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
|
|
359
509
|
|
|
360
510
|
|
|
361
511
|
if __name__ == "__main__":
|
|
362
512
|
try:
|
|
363
513
|
asyncio.run(main())
|
|
364
514
|
except (KeyboardInterrupt, EOFError) as _:
|
|
365
|
-
print_colored("\n\n👋 Goodbye!")
|
|
515
|
+
print_colored("\n\n👋 Goodbye!")
|
agent/computers/__init__.py
CHANGED
|
@@ -6,27 +6,32 @@ computer interface types, supporting both the ComputerHandler protocol and the
|
|
|
6
6
|
Computer library interface.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
+
from computer import Computer as cuaComputer
|
|
10
|
+
|
|
9
11
|
from .base import AsyncComputerHandler
|
|
10
12
|
from .cua import cuaComputerHandler
|
|
11
13
|
from .custom import CustomComputerHandler
|
|
12
|
-
|
|
14
|
+
|
|
13
15
|
|
|
14
16
|
def is_agent_computer(computer):
|
|
15
|
-
"""Check if the given computer is a ComputerHandler or
|
|
16
|
-
return
|
|
17
|
-
isinstance(computer,
|
|
18
|
-
|
|
17
|
+
"""Check if the given computer is a ComputerHandler or Cua Computer."""
|
|
18
|
+
return (
|
|
19
|
+
isinstance(computer, AsyncComputerHandler)
|
|
20
|
+
or isinstance(computer, cuaComputer)
|
|
21
|
+
or (isinstance(computer, dict))
|
|
22
|
+
) # and "screenshot" in computer)
|
|
23
|
+
|
|
19
24
|
|
|
20
25
|
async def make_computer_handler(computer):
|
|
21
26
|
"""
|
|
22
27
|
Create a computer handler from a computer interface.
|
|
23
|
-
|
|
28
|
+
|
|
24
29
|
Args:
|
|
25
30
|
computer: Either a ComputerHandler instance, Computer instance, or dict of functions
|
|
26
|
-
|
|
31
|
+
|
|
27
32
|
Returns:
|
|
28
33
|
ComputerHandler: A computer handler instance
|
|
29
|
-
|
|
34
|
+
|
|
30
35
|
Raises:
|
|
31
36
|
ValueError: If the computer type is not supported
|
|
32
37
|
"""
|
|
@@ -38,4 +43,4 @@ async def make_computer_handler(computer):
|
|
|
38
43
|
return computer_handler
|
|
39
44
|
if isinstance(computer, dict):
|
|
40
45
|
return CustomComputerHandler(computer)
|
|
41
|
-
raise ValueError(f"Unsupported computer type: {type(computer)}")
|
|
46
|
+
raise ValueError(f"Unsupported computer type: {type(computer)}")
|