cua-agent: cua_agent-0.4.1-py3-none-any.whl → cua_agent-0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

agent/callbacks/telemetry.py CHANGED
@@ -60,7 +60,7 @@ class TelemetryCallback(AsyncCallbackHandler):
60
60
  """Record agent type/model and session initialization."""
61
61
  agent_info = {
62
62
  "session_id": self.session_id,
63
- "agent_type": self.agent.agent_loop.__name__,
63
+ "agent_type": self.agent.agent_loop.__name__ if hasattr(self.agent, 'agent_loop') else 'unknown',
64
64
  "model": getattr(self.agent, 'model', 'unknown'),
65
65
  **SYSTEM_INFO
66
66
  }
agent/cli.py CHANGED
@@ -92,26 +92,30 @@ def print_welcome(model: str, agent_loop: str, container_name: str):
92
92
  async def ainput(prompt: str = ""):
93
93
  return await asyncio.to_thread(input, prompt)
94
94
 
95
- async def chat_loop(agent, model: str, container_name: str):
95
+ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = ""):
96
96
  """Main chat loop with the agent."""
97
97
  print_welcome(model, agent.agent_loop.__name__, container_name)
98
98
 
99
99
  history = []
100
100
 
101
+ if initial_prompt:
102
+ history.append({"role": "user", "content": initial_prompt})
103
+
101
104
  while True:
102
- # Get user input with prompt
103
- print_colored("> ", end="")
104
- user_input = await ainput()
105
-
106
- if user_input.lower() in ['exit', 'quit', 'q']:
107
- print_colored("\n👋 Goodbye!")
108
- break
105
+ if history[-1].get("role") != "user":
106
+ # Get user input with prompt
107
+ print_colored("> ", end="")
108
+ user_input = await ainput()
109
109
 
110
- if not user_input:
111
- continue
112
-
113
- # Add user message to history
114
- history.append({"role": "user", "content": user_input})
110
+ if user_input.lower() in ['exit', 'quit', 'q']:
111
+ print_colored("\n👋 Goodbye!")
112
+ break
113
+
114
+ if not user_input:
115
+ continue
116
+
117
+ # Add user message to history
118
+ history.append({"role": "user", "content": user_input})
115
119
 
116
120
  # Stream responses from the agent with spinner
117
121
  with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
@@ -204,6 +208,12 @@ Examples:
204
208
  action="store_true",
205
209
  help="Enable verbose logging"
206
210
  )
211
+
212
+ parser.add_argument(
213
+ "-p", "--prompt",
214
+ type=str,
215
+ help="Initial prompt to send to the agent. Leave blank for interactive mode."
216
+ )
207
217
 
208
218
  args = parser.parse_args()
209
219
 
@@ -269,9 +279,11 @@ Examples:
269
279
  agent_kwargs = {
270
280
  "model": args.model,
271
281
  "tools": [computer],
272
- "only_n_most_recent_images": args.images,
273
282
  "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
274
283
  }
284
+
285
+ if args.images > 0:
286
+ agent_kwargs["only_n_most_recent_images"] = args.images
275
287
 
276
288
  if args.trajectory:
277
289
  agent_kwargs["trajectory_dir"] = "trajectories"
@@ -286,7 +298,7 @@ Examples:
286
298
  agent = ComputerAgent(**agent_kwargs)
287
299
 
288
300
  # Start chat loop
289
- await chat_loop(agent, args.model, container_name)
301
+ await chat_loop(agent, args.model, container_name, args.prompt)
290
302
 
291
303
 
292
304
 
agent/loops/anthropic.py CHANGED
@@ -193,17 +193,98 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
193
193
 
194
194
  tool_use_content = []
195
195
 
196
+ # Basic actions (all versions)
196
197
  if action_type == "click":
198
+ # Input:
199
+ # {
200
+ # "type": "computer_call",
201
+ # "call_id": "call_1",
202
+ # "action": {
203
+ # "type": "click",
204
+ # "x": 100,
205
+ # "y": 200
206
+ # }
207
+ # }
208
+
209
+ # Output:
210
+ # {
211
+ # "function": {
212
+ # "name": "computer",
213
+ # "arguments": json.dumps({
214
+ # "action": "click",
215
+ # "coordinate": [100, 200]
216
+ # })
217
+ # },
218
+ # "id": "call_1",
219
+ # "type": "function"
220
+ # }
221
+ button = action.get("button", "left")
222
+ action_name = "right_click" if button == "right" else "middle_click" if button == "wheel" else "left_click"
197
223
  tool_use_content.append({
198
224
  "type": "tool_use",
199
225
  "id": call_id,
200
226
  "name": "computer",
201
227
  "input": {
202
- "action": "click",
228
+ "action": action_name,
229
+ "coordinate": [action.get("x", 0), action.get("y", 0)]
230
+ }
231
+ })
232
+ elif action_type == "double_click":
233
+ # Input:
234
+ # {
235
+ # "type": "computer_call",
236
+ # "call_id": "call_1",
237
+ # "action": {
238
+ # "type": "double_click",
239
+ # "x": 160,
240
+ # "y": 240
241
+ # }
242
+ # }
243
+
244
+ # Output:
245
+ # {
246
+ # "function": {
247
+ # "name": "computer",
248
+ # "arguments": json.dumps({
249
+ # "action": "double_click",
250
+ # "coordinate": [160, 240]
251
+ # })
252
+ # },
253
+ # "id": "call_1",
254
+ # "type": "function"
255
+ # }
256
+ tool_use_content.append({
257
+ "type": "tool_use",
258
+ "id": call_id,
259
+ "name": "computer",
260
+ "input": {
261
+ "action": "double_click",
203
262
  "coordinate": [action.get("x", 0), action.get("y", 0)]
204
263
  }
205
264
  })
206
265
  elif action_type == "type":
266
+ # Input:
267
+ # {
268
+ # "type": "computer_call",
269
+ # "call_id": "call_1",
270
+ # "action": {
271
+ # "type": "type",
272
+ # "text": "Hello World"
273
+ # }
274
+ # }
275
+
276
+ # Output:
277
+ # {
278
+ # "function": {
279
+ # "name": "computer",
280
+ # "arguments": json.dumps({
281
+ # "action": "type",
282
+ # "text": "Hello World"
283
+ # })
284
+ # },
285
+ # "id": "call_1",
286
+ # "type": "function"
287
+ # }
207
288
  tool_use_content.append({
208
289
  "type": "tool_use",
209
290
  "id": call_id,
@@ -213,26 +294,223 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
213
294
  "text": action.get("text", "")
214
295
  }
215
296
  })
216
- elif action_type == "key":
297
+ elif action_type == "keypress":
298
+ # Input:
299
+ # {
300
+ # "type": "computer_call",
301
+ # "call_id": "call_1",
302
+ # "action": {
303
+ # "type": "keypress",
304
+ # "keys": ["ctrl", "c"]
305
+ # }
306
+ # }
307
+
308
+ # Output:
309
+ # {
310
+ # "function": {
311
+ # "name": "computer",
312
+ # "arguments": json.dumps({
313
+ # "action": "key",
314
+ # "text": "ctrl+c"
315
+ # })
316
+ # },
317
+ # "id": "call_1",
318
+ # "type": "function"
319
+ # }
217
320
  tool_use_content.append({
218
321
  "type": "tool_use",
219
322
  "id": call_id,
220
323
  "name": "computer",
221
324
  "input": {
222
325
  "action": "key",
223
- "key": action.get("key", "")
326
+ "text": "+".join(action.get("keys", []))
327
+ }
328
+ })
329
+ elif action_type == "mouse_move":
330
+ # Input:
331
+ # {
332
+ # "type": "computer_call",
333
+ # "call_id": "call_1",
334
+ # "action": {
335
+ # "type": "mouse_move",
336
+ # "x": 150,
337
+ # "y": 250
338
+ # }
339
+ # }
340
+
341
+ # Output:
342
+ # {
343
+ # "function": {
344
+ # "name": "computer",
345
+ # "arguments": json.dumps({
346
+ # "action": "mouse_move",
347
+ # "coordinate": [150, 250]
348
+ # })
349
+ # },
350
+ # "id": "call_1",
351
+ # "type": "function"
352
+ # }
353
+ tool_use_content.append({
354
+ "type": "tool_use",
355
+ "id": call_id,
356
+ "name": "computer",
357
+ "input": {
358
+ "action": "mouse_move",
359
+ "coordinate": [action.get("x", 0), action.get("y", 0)]
360
+ }
361
+ })
362
+ elif action_type == "scroll":
363
+ # Input:
364
+ # {
365
+ # "type": "computer_call",
366
+ # "call_id": "call_1",
367
+ # "action": {
368
+ # "type": "scroll",
369
+ # "x": 300,
370
+ # "y": 400,
371
+ # "scroll_x": 0,
372
+ # "scroll_y": -5
373
+ # }
374
+ # }
375
+
376
+ # Output:
377
+ # {
378
+ # "function": {
379
+ # "name": "computer",
380
+ # "arguments": json.dumps({
381
+ # "action": "scroll",
382
+ # "coordinate": [300, 400],
383
+ # "scroll_direction": "down",
384
+ # "scroll_amount": 5
385
+ # })
386
+ # },
387
+ # "id": "call_1",
388
+ # "type": "function"
389
+ # }
390
+ scroll_x = action.get("scroll_x", 0)
391
+ scroll_y = action.get("scroll_y", 0)
392
+ # Determine direction and amount from scroll values
393
+ if scroll_x > 0:
394
+ direction = "left"
395
+ amount = scroll_x
396
+ elif scroll_x < 0:
397
+ direction = "right"
398
+ amount = -scroll_x
399
+ elif scroll_y > 0:
400
+ direction = "up"
401
+ amount = scroll_y
402
+ elif scroll_y < 0:
403
+ direction = "down"
404
+ amount = -scroll_y
405
+ else:
406
+ direction = "down"
407
+ amount = 3
408
+
409
+ tool_use_content.append({
410
+ "type": "tool_use",
411
+ "id": call_id,
412
+ "name": "computer",
413
+ "input": {
414
+ "action": "scroll",
415
+ "coordinate": [action.get("x", 0), action.get("y", 0)],
416
+ "scroll_direction": direction,
417
+ "scroll_amount": amount
418
+ }
419
+ })
420
+ elif action_type == "drag":
421
+ # Input:
422
+ # {
423
+ # "type": "computer_call",
424
+ # "call_id": "call_1",
425
+ # "action": {
426
+ # "type": "drag",
427
+ # "path": [
428
+ # {"x": 100, "y": 150},
429
+ # {"x": 200, "y": 250}
430
+ # ]
431
+ # }
432
+ # }
433
+
434
+ # Output:
435
+ # {
436
+ # "function": {
437
+ # "name": "computer",
438
+ # "arguments": json.dumps({
439
+ # "action": "left_click_drag",
440
+ # "start_coordinate": [100, 150],
441
+ # "end_coordinate": [200, 250]
442
+ # })
443
+ # },
444
+ # "id": "call_1",
445
+ # "type": "function"
446
+ # }
447
+ path = action.get("path", [])
448
+ start_coord = [0, 0]
449
+ end_coord = [0, 0]
450
+ if isinstance(path, list) and len(path) >= 2:
451
+ start_coord = [path[0].get("x", 0), path[0].get("y", 0)]
452
+ end_coord = [path[-1].get("x", 0), path[-1].get("y", 0)]
453
+
454
+ tool_use_content.append({
455
+ "type": "tool_use",
456
+ "id": call_id,
457
+ "name": "computer",
458
+ "input": {
459
+ "action": "left_click_drag",
460
+ "start_coordinate": start_coord,
461
+ "end_coordinate": end_coord
224
462
  }
225
463
  })
226
464
  elif action_type == "wait":
465
+ # Input:
466
+ # {
467
+ # "type": "computer_call",
468
+ # "call_id": "call_1",
469
+ # "action": {
470
+ # "type": "wait"
471
+ # }
472
+ # }
473
+
474
+ # Output:
475
+ # {
476
+ # "function": {
477
+ # "name": "computer",
478
+ # "arguments": json.dumps({
479
+ # "action": "wait"
480
+ # })
481
+ # },
482
+ # "id": "call_1",
483
+ # "type": "function"
484
+ # }
227
485
  tool_use_content.append({
228
486
  "type": "tool_use",
229
487
  "id": call_id,
230
488
  "name": "computer",
231
489
  "input": {
232
- "action": "screenshot"
490
+ "action": "wait"
233
491
  }
234
492
  })
235
493
  elif action_type == "screenshot":
494
+ # Input:
495
+ # {
496
+ # "type": "computer_call",
497
+ # "call_id": "call_1",
498
+ # "action": {
499
+ # "type": "screenshot"
500
+ # }
501
+ # }
502
+
503
+ # Output:
504
+ # {
505
+ # "function": {
506
+ # "name": "computer",
507
+ # "arguments": json.dumps({
508
+ # "action": "screenshot"
509
+ # })
510
+ # },
511
+ # "id": "call_1",
512
+ # "type": "function"
513
+ # }
236
514
  tool_use_content.append({
237
515
  "type": "tool_use",
238
516
  "id": call_id,
@@ -342,7 +620,7 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
342
620
  ))
343
621
  elif action_type == "key":
344
622
  responses_items.append(make_keypress_item(
345
- key=tool_input.get("key", ""),
623
+ keys=tool_input.get("text", "").replace("+", "-").split("-"),
346
624
  call_id=call_id
347
625
  ))
348
626
  elif action_type == "mouse_move":
@@ -361,21 +639,32 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
361
639
  # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
362
640
  elif action_type == "scroll":
363
641
  coordinate = tool_input.get("coordinate", [0, 0])
642
+ scroll_amount = tool_input.get("scroll_amount", 3)
643
+ scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \
644
+ -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0
645
+ scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \
646
+ -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0
364
647
  responses_items.append(make_scroll_item(
365
648
  x=coordinate[0] if len(coordinate) > 0 else 0,
366
649
  y=coordinate[1] if len(coordinate) > 1 else 0,
367
- direction=tool_input.get("scroll_direction", "down"),
368
- amount=tool_input.get("scroll_amount", 3),
650
+ scroll_x=scroll_x,
651
+ scroll_y=scroll_y,
369
652
  call_id=call_id
370
653
  ))
371
654
  elif action_type == "left_click_drag":
372
655
  start_coord = tool_input.get("start_coordinate", [0, 0])
373
656
  end_coord = tool_input.get("end_coordinate", [0, 0])
374
657
  responses_items.append(make_drag_item(
375
- start_x=start_coord[0] if len(start_coord) > 0 else 0,
376
- start_y=start_coord[1] if len(start_coord) > 1 else 0,
377
- end_x=end_coord[0] if len(end_coord) > 0 else 0,
378
- end_y=end_coord[1] if len(end_coord) > 1 else 0,
658
+ path=[
659
+ {
660
+ "x": start_coord[0] if len(start_coord) > 0 else 0,
661
+ "y": start_coord[1] if len(start_coord) > 1 else 0
662
+ },
663
+ {
664
+ "x": end_coord[0] if len(end_coord) > 0 else 0,
665
+ "y": end_coord[1] if len(end_coord) > 1 else 0
666
+ }
667
+ ],
379
668
  call_id=call_id
380
669
  ))
381
670
  elif action_type == "right_click":
@@ -459,7 +748,6 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
459
748
  # Handle tool calls (alternative format)
460
749
  if hasattr(message, 'tool_calls') and message.tool_calls:
461
750
  for tool_call in message.tool_calls:
462
- print(tool_call)
463
751
  if tool_call.function.name == "computer":
464
752
  try:
465
753
  args = json.loads(tool_call.function.arguments)
@@ -468,10 +756,53 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
468
756
 
469
757
  # Basic actions (all versions)
470
758
  if action_type == "screenshot":
759
+ # Input:
760
+ # {
761
+ # "function": {
762
+ # "name": "computer",
763
+ # "arguments": json.dumps({
764
+ # "action": "screenshot"
765
+ # })
766
+ # },
767
+ # "id": "call_1",
768
+ # "type": "function"
769
+ # }
770
+
771
+ # Output:
772
+ # {
773
+ # "type": "computer_call",
774
+ # "call_id": "call_1",
775
+ # "action": {
776
+ # "type": "screenshot"
777
+ # }
778
+ # }
471
779
  responses_items.append(make_screenshot_item(
472
780
  call_id=call_id
473
781
  ))
474
782
  elif action_type in ["click", "left_click"]:
783
+ # Input:
784
+ # {
785
+ # "function": {
786
+ # "name": "computer",
787
+ # "arguments": json.dumps({
788
+ # "action": "click",
789
+ # "coordinate": [100, 200]
790
+ # })
791
+ # },
792
+ # "id": "call_1",
793
+ # "type": "function"
794
+ # }
795
+
796
+ # Output:
797
+ # {
798
+ # "type": "computer_call",
799
+ # "call_id": "call_1",
800
+ # "action": {
801
+ # "type": "click",
802
+ # "x": 100,
803
+ # "y": 200
804
+ # }
805
+ # }
475
806
  coordinate = args.get("coordinate", [0, 0])
476
807
  responses_items.append(make_click_item(
477
808
  x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -479,16 +810,83 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
479
810
  call_id=call_id
480
811
  ))
481
812
  elif action_type == "type":
813
+ # Input:
814
+ # {
815
+ # "function": {
816
+ # "name": "computer",
817
+ # "arguments": json.dumps({
818
+ # "action": "type",
819
+ # "text": "Hello World"
820
+ # })
821
+ # },
822
+ # "id": "call_1",
823
+ # "type": "function"
824
+ # }
825
+
826
+ # Output:
827
+ # {
828
+ # "type": "computer_call",
829
+ # "call_id": "call_1",
830
+ # "action": {
831
+ # "type": "type",
832
+ # "text": "Hello World"
833
+ # }
834
+ # }
482
835
  responses_items.append(make_type_item(
483
836
  text=args.get("text", ""),
484
837
  call_id=call_id
485
838
  ))
486
839
  elif action_type == "key":
840
+ # Input:
841
+ # {
842
+ # "function": {
843
+ # "name": "computer",
844
+ # "arguments": json.dumps({
845
+ # "action": "key",
846
+ # "text": "ctrl+c"
847
+ # })
848
+ # },
849
+ # "id": "call_1",
850
+ # "type": "function"
851
+ # }
852
+
853
+ # Output:
854
+ # {
855
+ # "type": "computer_call",
856
+ # "call_id": "call_1",
857
+ # "action": {
858
+ # "type": "keypress",
859
+ # "keys": ["ctrl", "c"]
860
+ # }
861
+ # }
487
862
  responses_items.append(make_keypress_item(
488
- key=args.get("key", ""),
863
+ keys=args.get("text", "").replace("+", "-").split("-"),
489
864
  call_id=call_id
490
865
  ))
491
866
  elif action_type == "mouse_move":
867
+ # Input:
868
+ # {
869
+ # "function": {
870
+ # "name": "computer",
871
+ # "arguments": json.dumps({
872
+ # "action": "mouse_move",
873
+ # "coordinate": [150, 250]
874
+ # })
875
+ # },
876
+ # "id": "call_1",
877
+ # "type": "function"
878
+ # }
879
+
880
+ # Output:
881
+ # {
882
+ # "type": "computer_call",
883
+ # "call_id": "call_1",
884
+ # "action": {
885
+ # "type": "mouse_move",
886
+ # "x": 150,
887
+ # "y": 250
888
+ # }
889
+ # }
492
890
  coordinate = args.get("coordinate", [0, 0])
493
891
  responses_items.append(make_move_item(
494
892
  x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -498,6 +896,33 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
498
896
 
499
897
  # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
500
898
  elif action_type == "scroll":
899
+ # Input:
900
+ # {
901
+ # "function": {
902
+ # "name": "computer",
903
+ # "arguments": json.dumps({
904
+ # "action": "scroll",
905
+ # "coordinate": [300, 400],
906
+ # "scroll_direction": "down",
907
+ # "scroll_amount": 5
908
+ # })
909
+ # },
910
+ # "id": "call_1",
911
+ # "type": "function"
912
+ # }
913
+
914
+ # Output:
915
+ # {
916
+ # "type": "computer_call",
917
+ # "call_id": "call_1",
918
+ # "action": {
919
+ # "type": "scroll",
920
+ # "x": 300,
921
+ # "y": 400,
922
+ # "scroll_x": 0,
923
+ # "scroll_y": -5
924
+ # }
925
+ # }
501
926
  coordinate = args.get("coordinate", [0, 0])
502
927
  direction = args.get("scroll_direction", "down")
503
928
  amount = args.get("scroll_amount", 3)
@@ -513,16 +938,72 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
513
938
  call_id=call_id
514
939
  ))
515
940
  elif action_type == "left_click_drag":
941
+ # Input:
942
+ # {
943
+ # "function": {
944
+ # "name": "computer",
945
+ # "arguments": json.dumps({
946
+ # "action": "left_click_drag",
947
+ # "start_coordinate": [100, 150],
948
+ # "end_coordinate": [200, 250]
949
+ # })
950
+ # },
951
+ # "id": "call_1",
952
+ # "type": "function"
953
+ # }
954
+
955
+ # Output:
956
+ # {
957
+ # "type": "computer_call",
958
+ # "call_id": "call_1",
959
+ # "action": {
960
+ # "type": "drag",
961
+ # "path": [
962
+ # {"x": 100, "y": 150},
963
+ # {"x": 200, "y": 250}
964
+ # ]
965
+ # }
966
+ # }
516
967
  start_coord = args.get("start_coordinate", [0, 0])
517
968
  end_coord = args.get("end_coordinate", [0, 0])
518
969
  responses_items.append(make_drag_item(
519
- start_x=start_coord[0] if len(start_coord) > 0 else 0,
520
- start_y=start_coord[1] if len(start_coord) > 1 else 0,
521
- end_x=end_coord[0] if len(end_coord) > 0 else 0,
522
- end_y=end_coord[1] if len(end_coord) > 1 else 0,
970
+ path=[
971
+ {
972
+ "x": start_coord[0] if len(start_coord) > 0 else 0,
973
+ "y": start_coord[1] if len(start_coord) > 1 else 0
974
+ },
975
+ {
976
+ "x": end_coord[0] if len(end_coord) > 0 else 0,
977
+ "y": end_coord[1] if len(end_coord) > 1 else 0
978
+ }
979
+ ],
523
980
  call_id=call_id
524
981
  ))
525
982
  elif action_type == "right_click":
983
+ # Input:
984
+ # {
985
+ # "function": {
986
+ # "name": "computer",
987
+ # "arguments": json.dumps({
988
+ # "action": "right_click",
989
+ # "coordinate": [120, 180]
990
+ # })
991
+ # },
992
+ # "id": "call_1",
993
+ # "type": "function"
994
+ # }
995
+
996
+ # Output:
997
+ # {
998
+ # "type": "computer_call",
999
+ # "call_id": "call_1",
1000
+ # "action": {
1001
+ # "type": "click",
1002
+ # "x": 120,
1003
+ # "y": 180,
1004
+ # "button": "right"
1005
+ # }
1006
+ # }
526
1007
  coordinate = args.get("coordinate", [0, 0])
527
1008
  responses_items.append(make_click_item(
528
1009
  x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -531,14 +1012,61 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
531
1012
  call_id=call_id
532
1013
  ))
533
1014
  elif action_type == "middle_click":
1015
+ # Input:
1016
+ # {
1017
+ # "function": {
1018
+ # "name": "computer",
1019
+ # "arguments": json.dumps({
1020
+ # "action": "middle_click",
1021
+ # "coordinate": [140, 220]
1022
+ # })
1023
+ # },
1024
+ # "id": "call_1",
1025
+ # "type": "function"
1026
+ # }
1027
+
1028
+ # Output:
1029
+ # {
1030
+ # "type": "computer_call",
1031
+ # "call_id": "call_1",
1032
+ # "action": {
1033
+ # "type": "click",
1034
+ # "x": 140,
1035
+ # "y": 220,
1036
+ # "button": "wheel"
1037
+ # }
1038
+ # }
534
1039
  coordinate = args.get("coordinate", [0, 0])
535
1040
  responses_items.append(make_click_item(
536
1041
  x=coordinate[0] if len(coordinate) > 0 else 0,
537
1042
  y=coordinate[1] if len(coordinate) > 1 else 0,
538
- button="scroll",
1043
+ button="wheel",
539
1044
  call_id=call_id
540
1045
  ))
541
1046
  elif action_type == "double_click":
1047
+ # Input:
1048
+ # {
1049
+ # "function": {
1050
+ # "name": "computer",
1051
+ # "arguments": json.dumps({
1052
+ # "action": "double_click",
1053
+ # "coordinate": [160, 240]
1054
+ # })
1055
+ # },
1056
+ # "id": "call_1",
1057
+ # "type": "function"
1058
+ # }
1059
+
1060
+ # Output:
1061
+ # {
1062
+ # "type": "computer_call",
1063
+ # "call_id": "call_1",
1064
+ # "action": {
1065
+ # "type": "double_click",
1066
+ # "x": 160,
1067
+ # "y": 240
1068
+ # }
1069
+ # }
542
1070
  coordinate = args.get("coordinate", [0, 0])
543
1071
  responses_items.append(make_double_click_item(
544
1072
  x=coordinate[0] if len(coordinate) > 0 else 0,
@@ -546,14 +1074,127 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
546
1074
  call_id=call_id
547
1075
  ))
548
1076
  elif action_type == "triple_click":
1077
+ # Input:
1078
+ # {
1079
+ # "function": {
1080
+ # "name": "computer",
1081
+ # "arguments": json.dumps({
1082
+ # "action": "triple_click",
1083
+ # "coordinate": [180, 260]
1084
+ # })
1085
+ # },
1086
+ # "id": "call_1",
1087
+ # "type": "function"
1088
+ # }
1089
+
1090
+ # Output:
1091
+ # {
1092
+ # "type": "computer_call",
1093
+ # "call_id": "call_1",
1094
+ # "action": {
1095
+ # "type": "triple_click",
1096
+ # "x": 180,
1097
+ # "y": 260
1098
+ # }
1099
+ # }
549
1100
  raise NotImplementedError("triple_click")
550
1101
  elif action_type == "left_mouse_down":
1102
+ # Input:
1103
+ # {
1104
+ # "function": {
1105
+ # "name": "computer",
1106
+ # "arguments": json.dumps({
1107
+ # "action": "left_mouse_down",
1108
+ # "coordinate": [200, 280]
1109
+ # })
1110
+ # },
1111
+ # "id": "call_1",
1112
+ # "type": "function"
1113
+ # }
1114
+
1115
+ # Output:
1116
+ # {
1117
+ # "type": "computer_call",
1118
+ # "call_id": "call_1",
1119
+ # "action": {
1120
+ # "type": "mouse_down",
1121
+ # "button": "left",
1122
+ # "x": 200,
1123
+ # "y": 280
1124
+ # }
1125
+ # }
551
1126
  raise NotImplementedError("left_mouse_down")
552
1127
  elif action_type == "left_mouse_up":
1128
+ # Input:
1129
+ # {
1130
+ # "function": {
1131
+ # "name": "computer",
1132
+ # "arguments": json.dumps({
1133
+ # "action": "left_mouse_up",
1134
+ # "coordinate": [220, 300]
1135
+ # })
1136
+ # },
1137
+ # "id": "call_1",
1138
+ # "type": "function"
1139
+ # }
1140
+
1141
+ # Output:
1142
+ # {
1143
+ # "type": "computer_call",
1144
+ # "call_id": "call_1",
1145
+ # "action": {
1146
+ # "type": "mouse_up",
1147
+ # "button": "left",
1148
+ # "x": 220,
1149
+ # "y": 300
1150
+ # }
1151
+ # }
553
1152
  raise NotImplementedError("left_mouse_up")
554
1153
  elif action_type == "hold_key":
1154
+ # Input:
1155
+ # {
1156
+ # "function": {
1157
+ # "name": "computer",
1158
+ # "arguments": json.dumps({
1159
+ # "action": "hold_key",
1160
+ # "key": "shift"
1161
+ # })
1162
+ # },
1163
+ # "id": "call_1",
1164
+ # "type": "function"
1165
+ # }
1166
+
1167
+ # Output:
1168
+ # {
1169
+ # "type": "computer_call",
1170
+ # "call_id": "call_1",
1171
+ # "action": {
1172
+ # "type": "key_hold",
1173
+ # "key": "shift"
1174
+ # }
1175
+ # }
555
1176
  raise NotImplementedError("hold_key")
556
1177
  elif action_type == "wait":
1178
+ # Input:
1179
+ # {
1180
+ # "function": {
1181
+ # "name": "computer",
1182
+ # "arguments": json.dumps({
1183
+ # "action": "wait"
1184
+ # })
1185
+ # },
1186
+ # "id": "call_1",
1187
+ # "type": "function"
1188
+ # }
1189
+
1190
+ # Output:
1191
+ # {
1192
+ # "type": "computer_call",
1193
+ # "call_id": "call_1",
1194
+ # "action": {
1195
+ # "type": "wait"
1196
+ # }
1197
+ # }
557
1198
  responses_items.append(make_wait_item(
558
1199
  call_id=call_id
559
1200
  ))
cua_agent-0.4.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cua-agent
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: CUA (Computer Use) Agent for AI-driven computer interaction
5
5
  Author-Email: TryCua <gh@trycua.com>
6
6
  Requires-Python: >=3.11
@@ -13,7 +13,7 @@ Requires-Dist: pydantic>=2.6.4
13
13
  Requires-Dist: rich>=13.7.1
14
14
  Requires-Dist: python-dotenv>=1.0.1
15
15
  Requires-Dist: cua-computer<0.5.0,>=0.3.0
16
- Requires-Dist: cua-core<0.2.0,>=0.1.0
16
+ Requires-Dist: cua-core<0.2.0,>=0.1.8
17
17
  Requires-Dist: certifi>=2024.2.2
18
18
  Requires-Dist: litellm>=1.74.8
19
19
  Provides-Extra: openai
cua_agent-0.4.2.dist-info/RECORD CHANGED
@@ -9,13 +9,13 @@ agent/callbacks/budget_manager.py,sha256=RyKM-7iXQcDotYvrw3eURzeEHEXvQjID-NobtvQ
9
9
  agent/callbacks/image_retention.py,sha256=tiuRT5ke9xXTb2eP8Gz-2ITyAMY29LURUH6AbjX3RP8,6165
10
10
  agent/callbacks/logging.py,sha256=OOxU97EzrxlnUAtiEnvy9FB7SwCUK90-rdpDFA2Ae4E,10921
11
11
  agent/callbacks/pii_anonymization.py,sha256=UKAqNacHG3z92_6uocVzOIl8gJoqyofldCoCmB4UVIE,10268
12
- agent/callbacks/telemetry.py,sha256=sYsE_-tnZkt1ydIRbp_GfCETlz7QG9DNbawq6hM4Bqw,7445
12
+ agent/callbacks/telemetry.py,sha256=PU7pkK7W1v1xjDN-9gA30lGvn4-WhqK3BPHGW3HpTOc,7497
13
13
  agent/callbacks/trajectory_saver.py,sha256=POE8aPT-MBzfW873wr6C7iiVUHtp483KwvLPxC1S3EY,11626
14
- agent/cli.py,sha256=WZFyhmTbFnA7QgZmqKO5tGoWsKeO12-GVlBab314o9Q,10002
14
+ agent/cli.py,sha256=cTH2RT8tLtISseqrtTSau7g3-ILkFnHkR2BDLaNQVVE,10440
15
15
  agent/computer_handler.py,sha256=2gfFBeDk9Vd54x9mOqnswMo8BdjUduLo5I0RbBPLovY,3964
16
16
  agent/decorators.py,sha256=bCmcCjP31WEjWg1D91OE2jo7AZTfGa9cNgCnYUvjiyw,2832
17
17
  agent/loops/__init__.py,sha256=_qpP_--3ePdFkTZP8qmUEFlBsy6m4h8fj0gGLDKA7zw,217
18
- agent/loops/anthropic.py,sha256=w5s_zvkXdcHt0DgBMYjDQGDMBXK4bPu-SyeIMhA1Rrs,32243
18
+ agent/loops/anthropic.py,sha256=Za_Qzf4q37CO4QZ0jTnSjHj7RIgaoTLNdrxfPYEysCg,58155
19
19
  agent/loops/omniparser.py,sha256=m3bDNQ0Igc_HHVoAbjVNj599uRoC9Eap3DCALg6RZ54,11422
20
20
  agent/loops/openai.py,sha256=ArTqadeJY8F9N8ZLKfswlzgHV_54HbWJgLd4l6ele9w,3010
21
21
  agent/loops/uitars.py,sha256=L0NYxKoIiMfIHbyomnaiK3ZGLmLv3QMx9nX57GruAk0,26323
@@ -27,7 +27,7 @@ agent/ui/__main__.py,sha256=vudWXYvGM0aNT5aZ94HPtGW8YXOZ4cLXepHyhUM_k1g,73
27
27
  agent/ui/gradio/__init__.py,sha256=yv4Mrfo-Sj2U5sVn_UJHAuwYCezo-5O4ItR2C9jzNko,145
28
28
  agent/ui/gradio/app.py,sha256=X7he4jzyFqWJDP1y_M8yfZvfdy6GHNuclLn4k9iIwAw,8824
29
29
  agent/ui/gradio/ui_components.py,sha256=WxFE-4wvdEgj7FPLNXUrs118sXJ9vN3kLkZxtto-weo,34474
30
- cua_agent-0.4.1.dist-info/METADATA,sha256=Yf2tVl9529nOxprqpjmvqTtqPOnYWmDtDjuo6UuFddg,12060
31
- cua_agent-0.4.1.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
32
- cua_agent-0.4.1.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
33
- cua_agent-0.4.1.dist-info/RECORD,,
30
+ cua_agent-0.4.2.dist-info/METADATA,sha256=mWUkghYqwfpH9ElvfMlzG30mNuDdlnKPr8VmNnZalAg,12060
31
+ cua_agent-0.4.2.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
32
+ cua_agent-0.4.2.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
33
+ cua_agent-0.4.2.dist-info/RECORD,,