cua-agent: cua_agent-0.4.1-py3-none-any.whl → cua_agent-0.4.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

agent/callbacks/telemetry.py CHANGED
@@ -60,7 +60,7 @@ class TelemetryCallback(AsyncCallbackHandler):
60
60
  """Record agent type/model and session initialization."""
61
61
  agent_info = {
62
62
  "session_id": self.session_id,
63
- "agent_type": self.agent.agent_loop.__name__,
63
+ "agent_type": self.agent.agent_loop.__name__ if hasattr(self.agent, 'agent_loop') else 'unknown',
64
64
  "model": getattr(self.agent, 'model', 'unknown'),
65
65
  **SYSTEM_INFO
66
66
  }
agent/cli.py CHANGED
@@ -51,9 +51,8 @@ class Colors:
51
51
  BG_YELLOW = '\033[43m'
52
52
  BG_BLUE = '\033[44m'
53
53
 
54
-
55
- def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n"):
56
- """Print colored text to terminal."""
54
+ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n", right: str = ""):
55
+ """Print colored text to terminal with optional right-aligned text."""
57
56
  prefix = ""
58
57
  if bold:
59
58
  prefix += Colors.BOLD
@@ -62,10 +61,35 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
62
61
  if color:
63
62
  prefix += color
64
63
 
65
- print(f"{prefix}{text}{Colors.RESET}", end=end)
64
+ if right:
65
+ # Get terminal width (default to 80 if unable to determine)
66
+ try:
67
+ import shutil
68
+ terminal_width = shutil.get_terminal_size().columns
69
+ except:
70
+ terminal_width = 80
71
+
72
+ # Add right margin
73
+ terminal_width -= 1
74
+
75
+ # Calculate padding needed
76
+ # Account for ANSI escape codes not taking visual space
77
+ visible_left_len = len(text)
78
+ visible_right_len = len(right)
79
+ padding = terminal_width - visible_left_len - visible_right_len
80
+
81
+ if padding > 0:
82
+ output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
83
+ else:
84
+ # If not enough space, just put a single space between
85
+ output = f"{prefix}{text} {right}{Colors.RESET}"
86
+ else:
87
+ output = f"{prefix}{text}{Colors.RESET}"
88
+
89
+ print(output, end=end)
66
90
 
67
91
 
68
- def print_action(action_type: str, details: Dict[str, Any]):
92
+ def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
69
93
  """Print computer action with nice formatting."""
70
94
  # Format action details
71
95
  args_str = ""
@@ -81,8 +105,10 @@ def print_action(action_type: str, details: Dict[str, Any]):
81
105
  elif action_type == "scroll" and "x" in details and "y" in details:
82
106
  args_str = f"({details['x']}, {details['y']})"
83
107
 
84
- print_colored(f"🛠️ {action_type}{args_str}", dim=True)
85
-
108
+ if total_cost > 0:
109
+ print_colored(f"🛠️ {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
110
+ else:
111
+ print_colored(f"🛠️ {action_type}{args_str}", dim=True)
86
112
 
87
113
  def print_welcome(model: str, agent_loop: str, container_name: str):
88
114
  """Print welcome message."""
@@ -92,26 +118,32 @@ def print_welcome(model: str, agent_loop: str, container_name: str):
92
118
  async def ainput(prompt: str = ""):
93
119
  return await asyncio.to_thread(input, prompt)
94
120
 
95
- async def chat_loop(agent, model: str, container_name: str):
121
+ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):
96
122
  """Main chat loop with the agent."""
97
123
  print_welcome(model, agent.agent_loop.__name__, container_name)
98
124
 
99
125
  history = []
100
126
 
127
+ if initial_prompt:
128
+ history.append({"role": "user", "content": initial_prompt})
129
+
130
+ total_cost = 0
131
+
101
132
  while True:
102
- # Get user input with prompt
103
- print_colored("> ", end="")
104
- user_input = await ainput()
105
-
106
- if user_input.lower() in ['exit', 'quit', 'q']:
107
- print_colored("\n👋 Goodbye!")
108
- break
133
+ if history[-1].get("role") != "user":
134
+ # Get user input with prompt
135
+ print_colored("> ", end="")
136
+ user_input = await ainput()
109
137
 
110
- if not user_input:
111
- continue
112
-
113
- # Add user message to history
114
- history.append({"role": "user", "content": user_input})
138
+ if user_input.lower() in ['exit', 'quit', 'q']:
139
+ print_colored("\n👋 Goodbye!")
140
+ break
141
+
142
+ if not user_input:
143
+ continue
144
+
145
+ # Add user message to history
146
+ history.append({"role": "user", "content": user_input})
115
147
 
116
148
  # Stream responses from the agent with spinner
117
149
  with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
@@ -120,6 +152,9 @@ async def chat_loop(agent, model: str, container_name: str):
120
152
  async for result in agent.run(history):
121
153
  # Add agent responses to history
122
154
  history.extend(result.get("output", []))
155
+
156
+ if show_usage:
157
+ total_cost += result.get("usage", {}).get("response_cost", 0)
123
158
 
124
159
  # Process and display the output
125
160
  for item in result.get("output", []):
@@ -139,7 +174,7 @@ async def chat_loop(agent, model: str, container_name: str):
139
174
  action_type = action.get("type", "")
140
175
  if action_type:
141
176
  spinner.hide()
142
- print_action(action_type, action)
177
+ print_action(action_type, action, total_cost)
143
178
  spinner.text = f"Performing {action_type}..."
144
179
  spinner.show()
145
180
 
@@ -159,6 +194,8 @@ async def chat_loop(agent, model: str, container_name: str):
159
194
  print_colored(f"📤 {output}", dim=True)
160
195
 
161
196
  spinner.hide()
197
+ if show_usage and total_cost > 0:
198
+ print_colored(f"Total cost: ${total_cost:.2f}", dim=True)
162
199
 
163
200
 
164
201
  async def main():
@@ -204,6 +241,26 @@ Examples:
204
241
  action="store_true",
205
242
  help="Enable verbose logging"
206
243
  )
244
+
245
+ parser.add_argument(
246
+ "-p", "--prompt",
247
+ type=str,
248
+ help="Initial prompt to send to the agent. Leave blank for interactive mode."
249
+ )
250
+
251
+ parser.add_argument(
252
+ "-c", "--cache",
253
+ action="store_true",
254
+ help="Tell the API to enable caching"
255
+ )
256
+
257
+ parser.add_argument(
258
+ "-u", "--usage",
259
+ action="store_true",
260
+ help="Show total cost of the agent runs"
261
+ )
262
+
263
+
207
264
 
208
265
  args = parser.parse_args()
209
266
 
@@ -269,9 +326,11 @@ Examples:
269
326
  agent_kwargs = {
270
327
  "model": args.model,
271
328
  "tools": [computer],
272
- "only_n_most_recent_images": args.images,
273
329
  "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
274
330
  }
331
+
332
+ if args.images > 0:
333
+ agent_kwargs["only_n_most_recent_images"] = args.images
275
334
 
276
335
  if args.trajectory:
277
336
  agent_kwargs["trajectory_dir"] = "trajectories"
@@ -282,11 +341,14 @@ Examples:
282
341
  "raise_error": True,
283
342
  "reset_after_each_run": False
284
343
  }
344
+
345
+ if args.cache:
346
+ agent_kwargs["use_prompt_caching"] = True
285
347
 
286
348
  agent = ComputerAgent(**agent_kwargs)
287
349
 
288
350
  # Start chat loop
289
- await chat_loop(agent, args.model, container_name)
351
+ await chat_loop(agent, args.model, container_name, args.prompt, args.usage)
290
352
 
291
353
 
292
354
 
agent/loops/anthropic.py CHANGED
@@ -193,17 +193,98 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
193
193
 
194
194
  tool_use_content = []
195
195
 
196
+ # Basic actions (all versions)
196
197
  if action_type == "click":
198
+ # Input:
199
+ # {
200
+ # "type": "computer_call",
201
+ # "call_id": "call_1",
202
+ # "action": {
203
+ # "type": "click",
204
+ # "x": 100,
205
+ # "y": 200
206
+ # }
207
+ # }
208
+
209
+ # Output:
210
+ # {
211
+ # "function": {
212
+ # "name": "computer",
213
+ # "arguments": json.dumps({
214
+ # "action": "click",
215
+ # "coordinate": [100, 200]
216
+ # })
217
+ # },
218
+ # "id": "call_1",
219
+ # "type": "function"
220
+ # }
221
+ button = action.get("button", "left")
222
+ action_name = "right_click" if button == "right" else "middle_click" if button == "wheel" else "left_click"
197
223
  tool_use_content.append({
198
224
  "type": "tool_use",
199
225
  "id": call_id,
200
226
  "name": "computer",
201
227
  "input": {
202
- "action": "click",
228
+ "action": action_name,
229
+ "coordinate": [action.get("x", 0), action.get("y", 0)]
230
+ }
231
+ })
232
+ elif action_type == "double_click":
233
+ # Input:
234
+ # {
235
+ # "type": "computer_call",
236
+ # "call_id": "call_1",
237
+ # "action": {
238
+ # "type": "double_click",
239
+ # "x": 160,
240
+ # "y": 240
241
+ # }
242
+ # }
243
+
244
+ # Output:
245
+ # {
246
+ # "function": {
247
+ # "name": "computer",
248
+ # "arguments": json.dumps({
249
+ # "action": "double_click",
250
+ # "coordinate": [160, 240]
251
+ # })
252
+ # },
253
+ # "id": "call_1",
254
+ # "type": "function"
255
+ # }
256
+ tool_use_content.append({
257
+ "type": "tool_use",
258
+ "id": call_id,
259
+ "name": "computer",
260
+ "input": {
261
+ "action": "double_click",
203
262
  "coordinate": [action.get("x", 0), action.get("y", 0)]
204
263
  }
205
264
  })
206
265
  elif action_type == "type":
266
+ # Input:
267
+ # {
268
+ # "type": "computer_call",
269
+ # "call_id": "call_1",
270
+ # "action": {
271
+ # "type": "type",
272
+ # "text": "Hello World"
273
+ # }
274
+ # }
275
+
276
+ # Output:
277
+ # {
278
+ # "function": {
279
+ # "name": "computer",
280
+ # "arguments": json.dumps({
281
+ # "action": "type",
282
+ # "text": "Hello World"
283
+ # })
284
+ # },
285
+ # "id": "call_1",
286
+ # "type": "function"
287
+ # }
207
288
  tool_use_content.append({
208
289
  "type": "tool_use",
209
290
  "id": call_id,
@@ -213,26 +294,223 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
213
294
  "text": action.get("text", "")
214
295
  }
215
296
  })
216
- elif action_type == "key":
297
+ elif action_type == "keypress":
298
+ # Input:
299
+ # {
300
+ # "type": "computer_call",
301
+ # "call_id": "call_1",
302
+ # "action": {
303
+ # "type": "keypress",
304
+ # "keys": ["ctrl", "c"]
305
+ # }
306
+ # }
307
+
308
+ # Output:
309
+ # {
310
+ # "function": {
311
+ # "name": "computer",
312
+ # "arguments": json.dumps({
313
+ # "action": "key",
314
+ # "text": "ctrl+c"
315
+ # })
316
+ # },
317
+ # "id": "call_1",
318
+ # "type": "function"
319
+ # }
217
320
  tool_use_content.append({
218
321
  "type": "tool_use",
219
322
  "id": call_id,
220
323
  "name": "computer",
221
324
  "input": {
222
325
  "action": "key",
223
- "key": action.get("key", "")
326
+ "text": "+".join(action.get("keys", []))
327
+ }
328
+ })
329
+ elif action_type == "mouse_move":
330
+ # Input:
331
+ # {
332
+ # "type": "computer_call",
333
+ # "call_id": "call_1",
334
+ # "action": {
335
+ # "type": "mouse_move",
336
+ # "x": 150,
337
+ # "y": 250
338
+ # }
339
+ # }
340
+
341
+ # Output:
342
+ # {
343
+ # "function": {
344
+ # "name": "computer",
345
+ # "arguments": json.dumps({
346
+ # "action": "mouse_move",
347
+ # "coordinate": [150, 250]
348
+ # })
349
+ # },
350
+ # "id": "call_1",
351
+ # "type": "function"
352
+ # }
353
+ tool_use_content.append({
354
+ "type": "tool_use",
355
+ "id": call_id,
356
+ "name": "computer",
357
+ "input": {
358
+ "action": "mouse_move",
359
+ "coordinate": [action.get("x", 0), action.get("y", 0)]
360
+ }
361
+ })
362
+ elif action_type == "scroll":
363
+ # Input:
364
+ # {
365
+ # "type": "computer_call",
366
+ # "call_id": "call_1",
367
+ # "action": {
368
+ # "type": "scroll",
369
+ # "x": 300,
370
+ # "y": 400,
371
+ # "scroll_x": 0,
372
+ # "scroll_y": -5
373
+ # }
374
+ # }
375
+
376
+ # Output:
377
+ # {
378
+ # "function": {
379
+ # "name": "computer",
380
+ # "arguments": json.dumps({
381
+ # "action": "scroll",
382
+ # "coordinate": [300, 400],
383
+ # "scroll_direction": "down",
384
+ # "scroll_amount": 5
385
+ # })
386
+ # },
387
+ # "id": "call_1",
388
+ # "type": "function"
389
+ # }
390
+ scroll_x = action.get("scroll_x", 0)
391
+ scroll_y = action.get("scroll_y", 0)
392
+ # Determine direction and amount from scroll values
393
+ if scroll_x > 0:
394
+ direction = "left"
395
+ amount = scroll_x
396
+ elif scroll_x < 0:
397
+ direction = "right"
398
+ amount = -scroll_x
399
+ elif scroll_y > 0:
400
+ direction = "up"
401
+ amount = scroll_y
402
+ elif scroll_y < 0:
403
+ direction = "down"
404
+ amount = -scroll_y
405
+ else:
406
+ direction = "down"
407
+ amount = 3
408
+
409
+ tool_use_content.append({
410
+ "type": "tool_use",
411
+ "id": call_id,
412
+ "name": "computer",
413
+ "input": {
414
+ "action": "scroll",
415
+ "coordinate": [action.get("x", 0), action.get("y", 0)],
416
+ "scroll_direction": direction,
417
+ "scroll_amount": amount
418
+ }
419
+ })
420
+ elif action_type == "drag":
421
+ # Input:
422
+ # {
423
+ # "type": "computer_call",
424
+ # "call_id": "call_1",
425
+ # "action": {
426
+ # "type": "drag",
427
+ # "path": [
428
+ # {"x": 100, "y": 150},
429
+ # {"x": 200, "y": 250}
430
+ # ]
431
+ # }
432
+ # }
433
+
434
+ # Output:
435
+ # {
436
+ # "function": {
437
+ # "name": "computer",
438
+ # "arguments": json.dumps({
439
+ # "action": "left_click_drag",
440
+ # "start_coordinate": [100, 150],
441
+ # "end_coordinate": [200, 250]
442
+ # })
443
+ # },
444
+ # "id": "call_1",
445
+ # "type": "function"
446
+ # }
447
+ path = action.get("path", [])
448
+ start_coord = [0, 0]
449
+ end_coord = [0, 0]
450
+ if isinstance(path, list) and len(path) >= 2:
451
+ start_coord = [path[0].get("x", 0), path[0].get("y", 0)]
452
+ end_coord = [path[-1].get("x", 0), path[-1].get("y", 0)]
453
+
454
+ tool_use_content.append({
455
+ "type": "tool_use",
456
+ "id": call_id,
457
+ "name": "computer",
458
+ "input": {
459
+ "action": "left_click_drag",
460
+ "start_coordinate": start_coord,
461
+ "end_coordinate": end_coord
224
462
  }
225
463
  })
226
464
  elif action_type == "wait":
465
+ # Input:
466
+ # {
467
+ # "type": "computer_call",
468
+ # "call_id": "call_1",
469
+ # "action": {
470
+ # "type": "wait"
471
+ # }
472
+ # }
473
+
474
+ # Output:
475
+ # {
476
+ # "function": {
477
+ # "name": "computer",
478
+ # "arguments": json.dumps({
479
+ # "action": "wait"
480
+ # })
481
+ # },
482
+ # "id": "call_1",
483
+ # "type": "function"
484
+ # }
227
485
  tool_use_content.append({
228
486
  "type": "tool_use",
229
487
  "id": call_id,
230
488
  "name": "computer",
231
489
  "input": {
232
- "action": "screenshot"
490
+ "action": "wait"
233
491
  }
234
492
  })
235
493
  elif action_type == "screenshot":
494
+ # Input:
495
+ # {
496
+ # "type": "computer_call",
497
+ # "call_id": "call_1",
498
+ # "action": {
499
+ # "type": "screenshot"
500
+ # }
501
+ # }
502
+
503
+ # Output:
504
+ # {
505
+ # "function": {
506
+ # "name": "computer",
507
+ # "arguments": json.dumps({
508
+ # "action": "screenshot"
509
+ # })
510
+ # },
511
+ # "id": "call_1",
512
+ # "type": "function"
513
+ # }
236
514
  tool_use_content.append({
237
515
  "type": "tool_use",
238
516
  "id": call_id,
@@ -342,7 +620,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
342
620
  ))
343
621
  elif action_type == "key":
344
622
  responses_items.append(make_keypress_item(
345
- key=tool_input.get("key", ""),
623
+ keys=tool_input.get("text", "").replace("+", "-").split("-"),
346
624
  call_id=call_id
347
625
  ))
348
626
  elif action_type == "mouse_move":
@@ -361,21 +639,32 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
361
639
  # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
362
640
  elif action_type == "scroll":
363
641
  coordinate = tool_input.get("coordinate", [0, 0])
642
+ scroll_amount = tool_input.get("scroll_amount", 3)
643
+ scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \
644
+ -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0
645
+ scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \
646
+ -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0
364
647
  responses_items.append(make_scroll_item(
365
648
  x=coordinate[0] if len(coordinate) > 0 else 0,
366
649
  y=coordinate[1] if len(coordinate) > 1 else 0,
367
- direction=tool_input.get("scroll_direction", "down"),
368
- amount=tool_input.get("scroll_amount", 3),
650
+ scroll_x=scroll_x,
651
+ scroll_y=scroll_y,
369
652
  call_id=call_id
370
653
  ))
371
654
  elif action_type == "left_click_drag":
372
655
  start_coord = tool_input.get("start_coordinate", [0, 0])
373
656
  end_coord = tool_input.get("end_coordinate", [0, 0])
374
657
  responses_items.append(make_drag_item(
375
- start_x=start_coord[0] if len(start_coord) > 0 else 0,
376
- start_y=start_coord[1] if len(start_coord) > 1 else 0,
377
- end_x=end_coord[0] if len(end_coord) > 0 else 0,
378
- end_y=end_coord[1] if len(end_coord) > 1 else 0,
658
+ path=[
659
+ {
660
+ "x": start_coord[0] if len(start_coord) > 0 else 0,
661
+ "y": start_coord[1] if len(start_coord) > 1 else 0
662
+ },
663
+ {
664
+ "x": end_coord[0] if len(end_coord) > 0 else 0,
665
+ "y": end_coord[1] if len(end_coord) > 1 else 0
666
+ }
667
+ ],
379
668
  call_id=call_id
380
669
  ))
381
670
  elif action_type == "right_click":
@@ -459,7 +748,6 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
459
748
  # Handle tool calls (alternative format)
460
749
  if hasattr(message, 'tool_calls') and message.tool_calls:
461
750
  for tool_call in message.tool_calls:
462
- print(tool_call)
463
751
  if tool_call.function.name == "computer":
464
752
  try:
465
753
  args = json.loads(tool_call.function.arguments)
@@ -468,10 +756,53 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
468
756
 
469
757
  # Basic actions (all versions)
470
758
  if action_type == "screenshot":
759
+ # Input:
760
+ # {
761
+ # "function": {
762
+ # "name": "computer",
763
+ # "arguments": json.dumps({
764
+ # "action": "screenshot"
765
+ # })
766
+ # },
767
+ # "id": "call_1",
768
+ # "type": "function"
769
+ # }
770
+
771
+ # Output:
772
+ # {
773
+ # "type": "computer_call",
774
+ # "call_id": "call_1",
775
+ # "action": {
776
+ # "type": "screenshot"
777
+ # }
778
+ # }
471
779
  responses_items.append(make_screenshot_item(
472
780
  call_id=call_id
473
781
  ))
474
782
  elif action_type in ["click", "left_click"]:
783
+ # Input:
784
+ # {
785
+ # "function": {
786
+ # "name": "computer",
787
+ # "arguments": json.dumps({
788
+ # "action": "click",
789
+ # "coordinate": [100, 200]
790
+ # })
791
+ # },
792
+ # "id": "call_1",
793
+ # "type": "function"
794
+ # }
795
+
796
+ # Output:
797
+ # {
798
+ # "type": "computer_call",
799
+ # "call_id": "call_1",
800
+ # "action": {
801
+ # "type": "click",
802
+ # "x": 100,
803
+ # "y": 200
804
+ # }
805
+ # }
475
806
  coordinate = args.get("coordinate", [0, 0])
476
807
  responses_items.append(make_click_item(
477
808
  x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -479,16 +810,83 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
479
810
  call_id=call_id
480
811
  ))
481
812
  elif action_type == "type":
813
+ # Input:
814
+ # {
815
+ # "function": {
816
+ # "name": "computer",
817
+ # "arguments": json.dumps({
818
+ # "action": "type",
819
+ # "text": "Hello World"
820
+ # })
821
+ # },
822
+ # "id": "call_1",
823
+ # "type": "function"
824
+ # }
825
+
826
+ # Output:
827
+ # {
828
+ # "type": "computer_call",
829
+ # "call_id": "call_1",
830
+ # "action": {
831
+ # "type": "type",
832
+ # "text": "Hello World"
833
+ # }
834
+ # }
482
835
  responses_items.append(make_type_item(
483
836
  text=args.get("text", ""),
484
837
  call_id=call_id
485
838
  ))
486
839
  elif action_type == "key":
840
+ # Input:
841
+ # {
842
+ # "function": {
843
+ # "name": "computer",
844
+ # "arguments": json.dumps({
845
+ # "action": "key",
846
+ # "text": "ctrl+c"
847
+ # })
848
+ # },
849
+ # "id": "call_1",
850
+ # "type": "function"
851
+ # }
852
+
853
+ # Output:
854
+ # {
855
+ # "type": "computer_call",
856
+ # "call_id": "call_1",
857
+ # "action": {
858
+ # "type": "keypress",
859
+ # "keys": ["ctrl", "c"]
860
+ # }
861
+ # }
487
862
  responses_items.append(make_keypress_item(
488
- key=args.get("key", ""),
863
+ keys=args.get("text", "").replace("+", "-").split("-"),
489
864
  call_id=call_id
490
865
  ))
491
866
  elif action_type == "mouse_move":
867
+ # Input:
868
+ # {
869
+ # "function": {
870
+ # "name": "computer",
871
+ # "arguments": json.dumps({
872
+ # "action": "mouse_move",
873
+ # "coordinate": [150, 250]
874
+ # })
875
+ # },
876
+ # "id": "call_1",
877
+ # "type": "function"
878
+ # }
879
+
880
+ # Output:
881
+ # {
882
+ # "type": "computer_call",
883
+ # "call_id": "call_1",
884
+ # "action": {
885
+ # "type": "mouse_move",
886
+ # "x": 150,
887
+ # "y": 250
888
+ # }
889
+ # }
492
890
  coordinate = args.get("coordinate", [0, 0])
493
891
  responses_items.append(make_move_item(
494
892
  x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -498,6 +896,33 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
498
896
 
499
897
  # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
500
898
  elif action_type == "scroll":
899
+ # Input:
900
+ # {
901
+ # "function": {
902
+ # "name": "computer",
903
+ # "arguments": json.dumps({
904
+ # "action": "scroll",
905
+ # "coordinate": [300, 400],
906
+ # "scroll_direction": "down",
907
+ # "scroll_amount": 5
908
+ # })
909
+ # },
910
+ # "id": "call_1",
911
+ # "type": "function"
912
+ # }
913
+
914
+ # Output:
915
+ # {
916
+ # "type": "computer_call",
917
+ # "call_id": "call_1",
918
+ # "action": {
919
+ # "type": "scroll",
920
+ # "x": 300,
921
+ # "y": 400,
922
+ # "scroll_x": 0,
923
+ # "scroll_y": -5
924
+ # }
925
+ # }
501
926
  coordinate = args.get("coordinate", [0, 0])
502
927
  direction = args.get("scroll_direction", "down")
503
928
  amount = args.get("scroll_amount", 3)
@@ -513,16 +938,72 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
513
938
  call_id=call_id
514
939
  ))
515
940
  elif action_type == "left_click_drag":
941
+ # Input:
942
+ # {
943
+ # "function": {
944
+ # "name": "computer",
945
+ # "arguments": json.dumps({
946
+ # "action": "left_click_drag",
947
+ # "start_coordinate": [100, 150],
948
+ # "end_coordinate": [200, 250]
949
+ # })
950
+ # },
951
+ # "id": "call_1",
952
+ # "type": "function"
953
+ # }
954
+
955
+ # Output:
956
+ # {
957
+ # "type": "computer_call",
958
+ # "call_id": "call_1",
959
+ # "action": {
960
+ # "type": "drag",
961
+ # "path": [
962
+ # {"x": 100, "y": 150},
963
+ # {"x": 200, "y": 250}
964
+ # ]
965
+ # }
966
+ # }
516
967
  start_coord = args.get("start_coordinate", [0, 0])
517
968
  end_coord = args.get("end_coordinate", [0, 0])
518
969
  responses_items.append(make_drag_item(
519
- start_x=start_coord[0] if len(start_coord) > 0 else 0,
520
- start_y=start_coord[1] if len(start_coord) > 1 else 0,
521
- end_x=end_coord[0] if len(end_coord) > 0 else 0,
522
- end_y=end_coord[1] if len(end_coord) > 1 else 0,
970
+ path=[
971
+ {
972
+ "x": start_coord[0] if len(start_coord) > 0 else 0,
973
+ "y": start_coord[1] if len(start_coord) > 1 else 0
974
+ },
975
+ {
976
+ "x": end_coord[0] if len(end_coord) > 0 else 0,
977
+ "y": end_coord[1] if len(end_coord) > 1 else 0
978
+ }
979
+ ],
523
980
  call_id=call_id
524
981
  ))
525
982
  elif action_type == "right_click":
983
+ # Input:
984
+ # {
985
+ # "function": {
986
+ # "name": "computer",
987
+ # "arguments": json.dumps({
988
+ # "action": "right_click",
989
+ # "coordinate": [120, 180]
990
+ # })
991
+ # },
992
+ # "id": "call_1",
993
+ # "type": "function"
994
+ # }
995
+
996
+ # Output:
997
+ # {
998
+ # "type": "computer_call",
999
+ # "call_id": "call_1",
1000
+ # "action": {
1001
+ # "type": "click",
1002
+ # "x": 120,
1003
+ # "y": 180,
1004
+ # "button": "right"
1005
+ # }
1006
+ # }
526
1007
  coordinate = args.get("coordinate", [0, 0])
527
1008
  responses_items.append(make_click_item(
528
1009
  x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -531,14 +1012,61 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
531
1012
  call_id=call_id
532
1013
  ))
533
1014
  elif action_type == "middle_click":
1015
+ # Input:
1016
+ # {
1017
+ # "function": {
1018
+ # "name": "computer",
1019
+ # "arguments": json.dumps({
1020
+ # "action": "middle_click",
1021
+ # "coordinate": [140, 220]
1022
+ # })
1023
+ # },
1024
+ # "id": "call_1",
1025
+ # "type": "function"
1026
+ # }
1027
+
1028
+ # Output:
1029
+ # {
1030
+ # "type": "computer_call",
1031
+ # "call_id": "call_1",
1032
+ # "action": {
1033
+ # "type": "click",
1034
+ # "x": 140,
1035
+ # "y": 220,
1036
+ # "button": "wheel"
1037
+ # }
1038
+ # }
534
1039
  coordinate = args.get("coordinate", [0, 0])
535
1040
  responses_items.append(make_click_item(
536
1041
  x=coordinate[0] if len(coordinate) > 0 else 0,
537
1042
  y=coordinate[1] if len(coordinate) > 1 else 0,
538
- button="scroll",
1043
+ button="wheel",
539
1044
  call_id=call_id
540
1045
  ))
541
1046
  elif action_type == "double_click":
1047
+ # Input:
1048
+ # {
1049
+ # "function": {
1050
+ # "name": "computer",
1051
+ # "arguments": json.dumps({
1052
+ # "action": "double_click",
1053
+ # "coordinate": [160, 240]
1054
+ # })
1055
+ # },
1056
+ # "id": "call_1",
1057
+ # "type": "function"
1058
+ # }
1059
+
1060
+ # Output:
1061
+ # {
1062
+ # "type": "computer_call",
1063
+ # "call_id": "call_1",
1064
+ # "action": {
1065
+ # "type": "double_click",
1066
+ # "x": 160,
1067
+ # "y": 240
1068
+ # }
1069
+ # }
542
1070
  coordinate = args.get("coordinate", [0, 0])
543
1071
  responses_items.append(make_double_click_item(
544
1072
  x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -546,14 +1074,127 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
546
1074
  call_id=call_id
547
1075
  ))
548
1076
  elif action_type == "triple_click":
1077
+ # Input:
1078
+ # {
1079
+ # "function": {
1080
+ # "name": "computer",
1081
+ # "arguments": json.dumps({
1082
+ # "action": "triple_click",
1083
+ # "coordinate": [180, 260]
1084
+ # })
1085
+ # },
1086
+ # "id": "call_1",
1087
+ # "type": "function"
1088
+ # }
1089
+
1090
+ # Output:
1091
+ # {
1092
+ # "type": "computer_call",
1093
+ # "call_id": "call_1",
1094
+ # "action": {
1095
+ # "type": "triple_click",
1096
+ # "x": 180,
1097
+ # "y": 260
1098
+ # }
1099
+ # }
549
1100
  raise NotImplementedError("triple_click")
550
1101
  elif action_type == "left_mouse_down":
1102
+ # Input:
1103
+ # {
1104
+ # "function": {
1105
+ # "name": "computer",
1106
+ # "arguments": json.dumps({
1107
+ # "action": "left_mouse_down",
1108
+ # "coordinate": [200, 280]
1109
+ # })
1110
+ # },
1111
+ # "id": "call_1",
1112
+ # "type": "function"
1113
+ # }
1114
+
1115
+ # Output:
1116
+ # {
1117
+ # "type": "computer_call",
1118
+ # "call_id": "call_1",
1119
+ # "action": {
1120
+ # "type": "mouse_down",
1121
+ # "button": "left",
1122
+ # "x": 200,
1123
+ # "y": 280
1124
+ # }
1125
+ # }
551
1126
  raise NotImplementedError("left_mouse_down")
552
1127
  elif action_type == "left_mouse_up":
1128
+ # Input:
1129
+ # {
1130
+ # "function": {
1131
+ # "name": "computer",
1132
+ # "arguments": json.dumps({
1133
+ # "action": "left_mouse_up",
1134
+ # "coordinate": [220, 300]
1135
+ # })
1136
+ # },
1137
+ # "id": "call_1",
1138
+ # "type": "function"
1139
+ # }
1140
+
1141
+ # Output:
1142
+ # {
1143
+ # "type": "computer_call",
1144
+ # "call_id": "call_1",
1145
+ # "action": {
1146
+ # "type": "mouse_up",
1147
+ # "button": "left",
1148
+ # "x": 220,
1149
+ # "y": 300
1150
+ # }
1151
+ # }
553
1152
  raise NotImplementedError("left_mouse_up")
554
1153
  elif action_type == "hold_key":
1154
+ # Input:
1155
+ # {
1156
+ # "function": {
1157
+ # "name": "computer",
1158
+ # "arguments": json.dumps({
1159
+ # "action": "hold_key",
1160
+ # "key": "shift"
1161
+ # })
1162
+ # },
1163
+ # "id": "call_1",
1164
+ # "type": "function"
1165
+ # }
1166
+
1167
+ # Output:
1168
+ # {
1169
+ # "type": "computer_call",
1170
+ # "call_id": "call_1",
1171
+ # "action": {
1172
+ # "type": "key_hold",
1173
+ # "key": "shift"
1174
+ # }
1175
+ # }
555
1176
  raise NotImplementedError("hold_key")
556
1177
  elif action_type == "wait":
1178
+ # Input:
1179
+ # {
1180
+ # "function": {
1181
+ # "name": "computer",
1182
+ # "arguments": json.dumps({
1183
+ # "action": "wait"
1184
+ # })
1185
+ # },
1186
+ # "id": "call_1",
1187
+ # "type": "function"
1188
+ # }
1189
+
1190
+ # Output:
1191
+ # {
1192
+ # "type": "computer_call",
1193
+ # "call_id": "call_1",
1194
+ # "action": {
1195
+ # "type": "wait"
1196
+ # }
1197
+ # }
557
1198
  responses_items.append(make_wait_item(
558
1199
  call_id=call_id
559
1200
  ))
cua_agent-0.4.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.11
@@ -13,7 +13,7 @@ Requires-Dist: pydantic>=2.6.4
13
13
  Requires-Dist: rich>=13.7.1
14
14
  Requires-Dist: python-dotenv>=1.0.1
15
15
  Requires-Dist: cua-computer<0.5.0,>=0.3.0
16
- Requires-Dist: cua-core<0.2.0,>=0.1.0
16
+ Requires-Dist: cua-core<0.2.0,>=0.1.8
17
17
  Requires-Dist: certifi>=2024.2.2
18
18
  Requires-Dist: litellm>=1.74.8
19
19
  Provides-Extra: openai
cua_agent-0.4.3.dist-info/RECORD CHANGED
@@ -9,13 +9,13 @@ agent/callbacks/budget_manager.py,sha256=RyKM-7iXQcDotYvrw3eURzeEHEXvQjID-NobtvQ
9
9
  agent/callbacks/image_retention.py,sha256=tiuRT5ke9xXTb2eP8Gz-2ITyAMY29LURUH6AbjX3RP8,6165
10
10
  agent/callbacks/logging.py,sha256=OOxU97EzrxlnUAtiEnvy9FB7SwCUK90-rdpDFA2Ae4E,10921
11
11
  agent/callbacks/pii_anonymization.py,sha256=UKAqNacHG3z92_6uocVzOIl8gJoqyofldCoCmB4UVIE,10268
12
- agent/callbacks/telemetry.py,sha256=sYsE_-tnZkt1ydIRbp_GfCETlz7QG9DNbawq6hM4Bqw,7445
12
+ agent/callbacks/telemetry.py,sha256=PU7pkK7W1v1xjDN-9gA30lGvn4-WhqK3BPHGW3HpTOc,7497
13
13
  agent/callbacks/trajectory_saver.py,sha256=POE8aPT-MBzfW873wr6C7iiVUHtp483KwvLPxC1S3EY,11626
14
- agent/cli.py,sha256=WZFyhmTbFnA7QgZmqKO5tGoWsKeO12-GVlBab314o9Q,10002
14
+ agent/cli.py,sha256=odI7cdl1psOGK-mEQzezsPzbRcLFwDbi7A2ukvYq8dk,12130
15
15
  agent/computer_handler.py,sha256=2gfFBeDk9Vd54x9mOqnswMo8BdjUduLo5I0RbBPLovY,3964
16
16
  agent/decorators.py,sha256=bCmcCjP31WEjWg1D91OE2jo7AZTfGa9cNgCnYUvjiyw,2832
17
17
  agent/loops/__init__.py,sha256=_qpP_--3ePdFkTZP8qmUEFlBsy6m4h8fj0gGLDKA7zw,217
18
- agent/loops/anthropic.py,sha256=w5s_zvkXdcHt0DgBMYjDQGDMBXK4bPu-SyeIMhA1Rrs,32243
18
+ agent/loops/anthropic.py,sha256=Za_Qzf4q37CO4QZ0jTnSjHj7RIgaoTLNdrxfPYEysCg,58155
19
19
  agent/loops/omniparser.py,sha256=m3bDNQ0Igc_HHVoAbjVNj599uRoC9Eap3DCALg6RZ54,11422
20
20
  agent/loops/openai.py,sha256=ArTqadeJY8F9N8ZLKfswlzgHV_54HbWJgLd4l6ele9w,3010
21
21
  agent/loops/uitars.py,sha256=L0NYxKoIiMfIHbyomnaiK3ZGLmLv3QMx9nX57GruAk0,26323
@@ -27,7 +27,7 @@ agent/ui/__main__.py,sha256=vudWXYvGM0aNT5aZ94HPtGW8YXOZ4cLXepHyhUM_k1g,73
27
27
  agent/ui/gradio/__init__.py,sha256=yv4Mrfo-Sj2U5sVn_UJHAuwYCezo-5O4ItR2C9jzNko,145
28
28
  agent/ui/gradio/app.py,sha256=X7he4jzyFqWJDP1y_M8yfZvfdy6GHNuclLn4k9iIwAw,8824
29
29
  agent/ui/gradio/ui_components.py,sha256=WxFE-4wvdEgj7FPLNXUrs118sXJ9vN3kLkZxtto-weo,34474
30
- cua_agent-0.4.1.dist-info/METADATA,sha256=Yf2tVl9529nOxprqpjmvqTtqPOnYWmDtDjuo6UuFddg,12060
31
- cua_agent-0.4.1.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
32
- cua_agent-0.4.1.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
33
- cua_agent-0.4.1.dist-info/RECORD,,
30
+ cua_agent-0.4.3.dist-info/METADATA,sha256=x8zulOSGVabWb_SjdI08AphtSUU0XBaOT0B2ULmQtik,12060
31
+ cua_agent-0.4.3.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
32
+ cua_agent-0.4.3.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
33
+ cua_agent-0.4.3.dist-info/RECORD,,