cua-agent 0.4.34__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +36 -12
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/METADATA +18 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.34.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.34.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
agent/cli.py CHANGED
@@ -3,7 +3,7 @@ CLI chat interface for agent - Computer Use Agent
3
3
 
4
4
  Usage:
5
5
  python -m agent.cli <model_string>
6
-
6
+
7
7
  Examples:
8
8
  python -m agent.cli openai/computer-use-preview
9
9
  python -m agent.cli anthropic/claude-3-5-sonnet-20241022
@@ -11,19 +11,22 @@ Examples:
11
11
  """
12
12
 
13
13
  try:
14
- import asyncio
15
14
  import argparse
15
+ import asyncio
16
+ import base64
17
+ import json
16
18
  import os
19
+ import platform
17
20
  import sys
18
- import json
19
- from typing import List, Dict, Any
20
- import dotenv
21
- import base64
22
21
  import time
23
- import platform
24
22
  from pathlib import Path
23
+ from typing import Any, Dict, List
24
+
25
+ import dotenv
26
+
25
27
  try:
26
28
  from PIL import Image, ImageDraw
29
+
27
30
  PIL_AVAILABLE = True
28
31
  except Exception:
29
32
  PIL_AVAILABLE = False
@@ -31,36 +34,44 @@ try:
31
34
  except ImportError:
32
35
  if __name__ == "__main__":
33
36
  raise ImportError(
34
- "CLI dependencies not found. "
35
- "Please install with: pip install \"cua-agent[cli]\""
37
+ "CLI dependencies not found. " 'Please install with: pip install "cua-agent[cli]"'
36
38
  )
37
39
 
38
40
  # Load environment variables
39
41
  dotenv.load_dotenv()
40
42
 
43
+
41
44
  # Color codes for terminal output
42
45
  class Colors:
43
- RESET = '\033[0m'
44
- BOLD = '\033[1m'
45
- DIM = '\033[2m'
46
-
46
+ RESET = "\033[0m"
47
+ BOLD = "\033[1m"
48
+ DIM = "\033[2m"
49
+
47
50
  # Text colors
48
- RED = '\033[31m'
49
- GREEN = '\033[32m'
50
- YELLOW = '\033[33m'
51
- BLUE = '\033[34m'
52
- MAGENTA = '\033[35m'
53
- CYAN = '\033[36m'
54
- WHITE = '\033[37m'
55
- GRAY = '\033[90m'
56
-
57
- # Background colors
58
- BG_RED = '\033[41m'
59
- BG_GREEN = '\033[42m'
60
- BG_YELLOW = '\033[43m'
61
- BG_BLUE = '\033[44m'
51
+ RED = "\033[31m"
52
+ GREEN = "\033[32m"
53
+ YELLOW = "\033[33m"
54
+ BLUE = "\033[34m"
55
+ MAGENTA = "\033[35m"
56
+ CYAN = "\033[36m"
57
+ WHITE = "\033[37m"
58
+ GRAY = "\033[90m"
62
59
 
63
- def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n", right: str = ""):
60
+ # Background colors
61
+ BG_RED = "\033[41m"
62
+ BG_GREEN = "\033[42m"
63
+ BG_YELLOW = "\033[43m"
64
+ BG_BLUE = "\033[44m"
65
+
66
+
67
+ def print_colored(
68
+ text: str,
69
+ color: str = "",
70
+ bold: bool = False,
71
+ dim: bool = False,
72
+ end: str = "\n",
73
+ right: str = "",
74
+ ):
64
75
  """Print colored text to terminal with optional right-aligned text."""
65
76
  prefix = ""
66
77
  if bold:
@@ -69,24 +80,25 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
69
80
  prefix += Colors.DIM
70
81
  if color:
71
82
  prefix += color
72
-
83
+
73
84
  if right:
74
85
  # Get terminal width (default to 80 if unable to determine)
75
86
  try:
76
87
  import shutil
88
+
77
89
  terminal_width = shutil.get_terminal_size().columns
78
90
  except:
79
91
  terminal_width = 80
80
92
 
81
93
  # Add right margin
82
94
  terminal_width -= 1
83
-
95
+
84
96
  # Calculate padding needed
85
97
  # Account for ANSI escape codes not taking visual space
86
98
  visible_left_len = len(text)
87
99
  visible_right_len = len(right)
88
100
  padding = terminal_width - visible_left_len - visible_right_len
89
-
101
+
90
102
  if padding > 0:
91
103
  output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
92
104
  else:
@@ -94,7 +106,7 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
94
106
  output = f"{prefix}{text} {right}{Colors.RESET}"
95
107
  else:
96
108
  output = f"{prefix}{text}{Colors.RESET}"
97
-
109
+
98
110
  print(output, end=end)
99
111
 
100
112
 
@@ -113,29 +125,34 @@ def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
113
125
  args_str = f"('{details['text']}')"
114
126
  elif action_type == "scroll" and "x" in details and "y" in details:
115
127
  args_str = f"({details['x']}, {details['y']})"
116
-
128
+
117
129
  if total_cost > 0:
118
130
  print_colored(f"🛠️ {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
119
131
  else:
120
132
  print_colored(f"🛠️ {action_type}{args_str}", dim=True)
121
133
 
134
+
122
135
  def print_welcome(model: str, agent_loop: str, container_name: str):
123
136
  """Print welcome message."""
124
137
  print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
125
138
  print_colored("Type 'exit' to quit.", dim=True)
126
139
 
140
+
127
141
  async def ainput(prompt: str = ""):
128
142
  return await asyncio.to_thread(input, prompt)
129
143
 
130
- async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):
144
+
145
+ async def chat_loop(
146
+ agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True
147
+ ):
131
148
  """Main chat loop with the agent."""
132
149
  print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
133
-
150
+
134
151
  history = []
135
-
152
+
136
153
  if initial_prompt:
137
154
  history.append({"role": "user", "content": initial_prompt})
138
-
155
+
139
156
  total_cost = 0
140
157
 
141
158
  while True:
@@ -143,28 +160,28 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
143
160
  # Get user input with prompt
144
161
  print_colored("> ", end="")
145
162
  user_input = await ainput()
146
-
147
- if user_input.lower() in ['exit', 'quit', 'q']:
163
+
164
+ if user_input.lower() in ["exit", "quit", "q"]:
148
165
  print_colored("\n👋 Goodbye!")
149
166
  break
150
-
167
+
151
168
  if not user_input:
152
169
  continue
153
-
170
+
154
171
  # Add user message to history
155
172
  history.append({"role": "user", "content": user_input})
156
-
173
+
157
174
  # Stream responses from the agent with spinner
158
175
  with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
159
176
  spinner.hide()
160
-
177
+
161
178
  async for result in agent.run(history):
162
179
  # Add agent responses to history
163
180
  history.extend(result.get("output", []))
164
181
 
165
182
  if show_usage:
166
183
  total_cost += result.get("usage", {}).get("response_cost", 0)
167
-
184
+
168
185
  # Process and display the output
169
186
  for item in result.get("output", []):
170
187
  if item.get("type") == "message" and item.get("role") == "assistant":
@@ -176,7 +193,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
176
193
  if text:
177
194
  spinner.hide()
178
195
  print_colored(text)
179
-
196
+
180
197
  elif item.get("type") == "computer_call":
181
198
  # Display computer action
182
199
  action = item.get("action", {})
@@ -186,7 +203,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
186
203
  print_action(action_type, action, total_cost)
187
204
  spinner.text = f"Performing {action_type}..."
188
205
  spinner.show()
189
-
206
+
190
207
  elif item.get("type") == "function_call":
191
208
  # Display function call
192
209
  function_name = item.get("name", "")
@@ -194,18 +211,18 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
194
211
  print_colored(f"🔧 Calling function: {function_name}", dim=True)
195
212
  spinner.text = f"Calling {function_name}..."
196
213
  spinner.show()
197
-
214
+
198
215
  elif item.get("type") == "function_call_output":
199
216
  # Display function output (dimmed)
200
217
  output = item.get("output", "")
201
218
  if output and len(output.strip()) > 0:
202
219
  spinner.hide()
203
220
  print_colored(f"📤 {output}", dim=True)
204
-
221
+
205
222
  spinner.hide()
206
223
  if show_usage and total_cost > 0:
207
224
  print_colored(f"Total cost: ${total_cost:.2f}", dim=True)
208
-
225
+
209
226
 
210
227
  async def main():
211
228
  """Main CLI function."""
@@ -218,90 +235,74 @@ Examples:
218
235
  python -m agent.cli anthropic/claude-3-5-sonnet-20241022
219
236
  python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
220
237
  python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
221
- """
238
+ """,
222
239
  )
223
-
240
+
224
241
  parser.add_argument(
225
242
  "model",
226
- help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
243
+ help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')",
227
244
  )
228
-
245
+
229
246
  parser.add_argument(
230
247
  "--provider",
231
248
  choices=["cloud", "lume", "winsandbox", "docker"],
232
249
  default="cloud",
233
- help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
250
+ help="Computer provider to use: cloud (default), lume, winsandbox, or docker",
234
251
  )
235
-
252
+
236
253
  parser.add_argument(
237
254
  "--images",
238
255
  type=int,
239
256
  default=3,
240
- help="Number of recent images to keep in context (default: 3)"
241
- )
242
-
243
- parser.add_argument(
244
- "--trajectory",
245
- action="store_true",
246
- help="Save trajectory for debugging"
247
- )
248
-
249
- parser.add_argument(
250
- "--budget",
251
- type=float,
252
- help="Maximum budget for the session (in dollars)"
253
- )
254
-
255
- parser.add_argument(
256
- "--verbose",
257
- action="store_true",
258
- help="Enable verbose logging"
257
+ help="Number of recent images to keep in context (default: 3)",
259
258
  )
260
259
 
260
+ parser.add_argument("--trajectory", action="store_true", help="Save trajectory for debugging")
261
+
262
+ parser.add_argument("--budget", type=float, help="Maximum budget for the session (in dollars)")
263
+
264
+ parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
265
+
261
266
  parser.add_argument(
262
- "-p", "--prompt",
267
+ "-p",
268
+ "--prompt",
263
269
  type=str,
264
- help="Initial prompt to send to the agent. Leave blank for interactive mode."
270
+ help="Initial prompt to send to the agent. Leave blank for interactive mode.",
265
271
  )
266
272
 
267
273
  parser.add_argument(
268
274
  "--prompt-file",
269
275
  type=Path,
270
- help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt."
276
+ help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt.",
271
277
  )
272
278
 
273
279
  parser.add_argument(
274
280
  "--predict-click",
275
281
  dest="predict_click",
276
282
  type=str,
277
- help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it."
283
+ help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it.",
278
284
  )
279
285
 
280
- parser.add_argument(
281
- "-c", "--cache",
282
- action="store_true",
283
- help="Tell the API to enable caching"
284
- )
286
+ parser.add_argument("-c", "--cache", action="store_true", help="Tell the API to enable caching")
285
287
 
286
288
  parser.add_argument(
287
- "-u", "--usage",
288
- action="store_true",
289
- help="Show total cost of the agent runs"
289
+ "-u", "--usage", action="store_true", help="Show total cost of the agent runs"
290
290
  )
291
291
 
292
292
  parser.add_argument(
293
- "-r", "--max-retries",
293
+ "-r",
294
+ "--max-retries",
294
295
  type=int,
295
296
  default=3,
296
- help="Maximum number of retries for the LLM API calls"
297
+ help="Maximum number of retries for the LLM API calls",
297
298
  )
298
-
299
+
299
300
  args = parser.parse_args()
300
-
301
+
301
302
  # Check for required environment variables
302
303
  container_name = os.getenv("CUA_CONTAINER_NAME")
303
304
  cua_api_key = os.getenv("CUA_API_KEY")
304
-
305
+
305
306
  # Prompt for missing environment variables (container name always required)
306
307
  if not container_name:
307
308
  if args.provider == "cloud":
@@ -321,13 +322,13 @@ Examples:
321
322
  if not cua_api_key:
322
323
  print_colored("❌ API key is required for cloud provider.")
323
324
  sys.exit(1)
324
-
325
+
325
326
  # Check for provider-specific API keys based on model
326
327
  provider_api_keys = {
327
328
  "openai/": "OPENAI_API_KEY",
328
329
  "anthropic/": "ANTHROPIC_API_KEY",
329
330
  }
330
-
331
+
331
332
  # Find matching provider and check for API key
332
333
  for prefix, env_var in provider_api_keys.items():
333
334
  if prefix in args.model:
@@ -340,7 +341,7 @@ Examples:
340
341
  # Set the environment variable for the session
341
342
  os.environ[env_var] = api_key
342
343
  break
343
-
344
+
344
345
  # Import here to avoid import errors if dependencies are missing
345
346
  try:
346
347
  from agent import ComputerAgent
@@ -349,7 +350,7 @@ Examples:
349
350
  print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
350
351
  print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
351
352
  sys.exit(1)
352
-
353
+
353
354
  # Resolve provider -> os_type, provider_type, api key requirement
354
355
  provider_map = {
355
356
  "cloud": ("linux", "cloud", True),
@@ -365,42 +366,46 @@ Examples:
365
366
  "name": container_name,
366
367
  }
367
368
  if needs_api_key:
368
- computer_kwargs["api_key"] = cua_api_key # type: ignore
369
+ computer_kwargs["api_key"] = cua_api_key # type: ignore
369
370
 
370
371
  # Create computer instance
371
- async with Computer(**computer_kwargs) as computer: # type: ignore
372
-
372
+ async with Computer(**computer_kwargs) as computer: # type: ignore
373
+
373
374
  # Create agent
374
375
  agent_kwargs = {
375
376
  "model": args.model,
376
377
  "tools": [computer],
377
- "trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
378
+ "trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
378
379
  "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
379
- "max_retries": args.max_retries
380
+ "max_retries": args.max_retries,
380
381
  }
381
382
 
382
383
  if args.images > 0:
383
384
  agent_kwargs["only_n_most_recent_images"] = args.images
384
-
385
+
385
386
  if args.trajectory:
386
387
  agent_kwargs["trajectory_dir"] = "trajectories"
387
-
388
+
388
389
  if args.budget:
389
390
  agent_kwargs["max_trajectory_budget"] = {
390
391
  "max_budget": args.budget,
391
392
  "raise_error": True,
392
- "reset_after_each_run": False
393
+ "reset_after_each_run": False,
393
394
  }
394
395
 
395
396
  if args.cache:
396
397
  agent_kwargs["use_prompt_caching"] = True
397
-
398
+
398
399
  agent = ComputerAgent(**agent_kwargs)
399
-
400
+
400
401
  # If predict-click mode is requested, run once and exit
401
402
  if args.predict_click:
402
403
  if not PIL_AVAILABLE:
403
- print_colored("❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow", Colors.RED, bold=True)
404
+ print_colored(
405
+ "❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow",
406
+ Colors.RED,
407
+ bold=True,
408
+ )
404
409
  sys.exit(1)
405
410
 
406
411
  instruction = args.predict_click
@@ -435,6 +440,7 @@ Examples:
435
440
 
436
441
  try:
437
442
  from io import BytesIO
443
+
438
444
  with Image.open(BytesIO(img_bytes)) as img:
439
445
  img = img.convert("RGB")
440
446
  draw = ImageDraw.Draw(img)
@@ -457,9 +463,9 @@ Examples:
457
463
  if system == "windows":
458
464
  os.startfile(str(out_path)) # type: ignore[attr-defined]
459
465
  elif system == "darwin":
460
- os.system(f"open \"{out_path}\"")
466
+ os.system(f'open "{out_path}"')
461
467
  else:
462
- os.system(f"xdg-open \"{out_path}\"")
468
+ os.system(f'xdg-open "{out_path}"')
463
469
  except Exception:
464
470
  pass
465
471
  except Exception as e:
@@ -482,9 +488,8 @@ Examples:
482
488
  await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
483
489
 
484
490
 
485
-
486
491
  if __name__ == "__main__":
487
492
  try:
488
493
  asyncio.run(main())
489
494
  except (KeyboardInterrupt, EOFError) as _:
490
- print_colored("\n\n👋 Goodbye!")
495
+ print_colored("\n\n👋 Goodbye!")
agent/computers/__init__.py CHANGED
@@ -6,27 +6,32 @@ computer interface types, supporting both the ComputerHandler protocol and the
6
6
  Computer library interface.
7
7
  """
8
8
 
9
+ from computer import Computer as cuaComputer
10
+
9
11
  from .base import AsyncComputerHandler
10
12
  from .cua import cuaComputerHandler
11
13
  from .custom import CustomComputerHandler
12
- from computer import Computer as cuaComputer
14
+
13
15
 
14
16
  def is_agent_computer(computer):
15
17
  """Check if the given computer is a ComputerHandler or CUA Computer."""
16
- return isinstance(computer, AsyncComputerHandler) or \
17
- isinstance(computer, cuaComputer) or \
18
- (isinstance(computer, dict)) #and "screenshot" in computer)
18
+ return (
19
+ isinstance(computer, AsyncComputerHandler)
20
+ or isinstance(computer, cuaComputer)
21
+ or (isinstance(computer, dict))
22
+ ) # and "screenshot" in computer)
23
+
19
24
 
20
25
  async def make_computer_handler(computer):
21
26
  """
22
27
  Create a computer handler from a computer interface.
23
-
28
+
24
29
  Args:
25
30
  computer: Either a ComputerHandler instance, Computer instance, or dict of functions
26
-
31
+
27
32
  Returns:
28
33
  ComputerHandler: A computer handler instance
29
-
34
+
30
35
  Raises:
31
36
  ValueError: If the computer type is not supported
32
37
  """
@@ -38,4 +43,4 @@ async def make_computer_handler(computer):
38
43
  return computer_handler
39
44
  if isinstance(computer, dict):
40
45
  return CustomComputerHandler(computer)
41
- raise ValueError(f"Unsupported computer type: {type(computer)}")
46
+ raise ValueError(f"Unsupported computer type: {type(computer)}")
agent/computers/base.py CHANGED
@@ -2,69 +2,78 @@
2
2
  Base computer interface protocol for agent interactions.
3
3
  """
4
4
 
5
- from typing import Protocol, Literal, List, Dict, Any, Union, Optional, runtime_checkable
5
+ from typing import (
6
+ Any,
7
+ Dict,
8
+ List,
9
+ Literal,
10
+ Optional,
11
+ Protocol,
12
+ Union,
13
+ runtime_checkable,
14
+ )
6
15
 
7
16
 
8
17
  @runtime_checkable
9
18
  class AsyncComputerHandler(Protocol):
10
19
  """Protocol defining the interface for computer interactions."""
11
-
12
- # ==== Computer-Use-Preview Action Space ====
20
+
21
+ # ==== Computer-Use-Preview Action Space ====
13
22
 
14
23
  async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
15
24
  """Get the current environment type."""
16
25
  ...
17
-
26
+
18
27
  async def get_dimensions(self) -> tuple[int, int]:
19
28
  """Get screen dimensions as (width, height)."""
20
29
  ...
21
-
30
+
22
31
  async def screenshot(self) -> str:
23
32
  """Take a screenshot and return as base64 string."""
24
33
  ...
25
-
34
+
26
35
  async def click(self, x: int, y: int, button: str = "left") -> None:
27
36
  """Click at coordinates with specified button."""
28
37
  ...
29
-
38
+
30
39
  async def double_click(self, x: int, y: int) -> None:
31
40
  """Double click at coordinates."""
32
41
  ...
33
-
42
+
34
43
  async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
35
44
  """Scroll at coordinates with specified scroll amounts."""
36
45
  ...
37
-
46
+
38
47
  async def type(self, text: str) -> None:
39
48
  """Type text."""
40
49
  ...
41
-
50
+
42
51
  async def wait(self, ms: int = 1000) -> None:
43
52
  """Wait for specified milliseconds."""
44
53
  ...
45
-
54
+
46
55
  async def move(self, x: int, y: int) -> None:
47
56
  """Move cursor to coordinates."""
48
57
  ...
49
-
58
+
50
59
  async def keypress(self, keys: Union[List[str], str]) -> None:
51
60
  """Press key combination."""
52
61
  ...
53
-
62
+
54
63
  async def drag(self, path: List[Dict[str, int]]) -> None:
55
64
  """Drag along specified path."""
56
65
  ...
57
-
66
+
58
67
  async def get_current_url(self) -> str:
59
68
  """Get current URL (for browser environments)."""
60
69
  ...
61
-
62
- # ==== Anthropic Action Space ====
70
+
71
+ # ==== Anthropic Action Space ====
63
72
 
64
73
  async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
65
74
  """Left mouse down at coordinates."""
66
75
  ...
67
-
76
+
68
77
  async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
69
78
  """Left mouse up at coordinates."""
70
79
  ...