cua-agent 0.4.10__py3-none-any.whl → 0.4.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

agent/loops/anthropic.py CHANGED
@@ -23,7 +23,10 @@ from ..responses import (
23
23
  make_type_item,
24
24
  make_wait_item,
25
25
  make_input_image_item,
26
- make_screenshot_item
26
+ make_screenshot_item,
27
+ make_failed_tool_call_items,
28
+ make_left_mouse_down_item,
29
+ make_left_mouse_up_item
27
30
  )
28
31
 
29
32
  # Model version mapping to tool version and beta flag
@@ -115,7 +118,8 @@ async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model
115
118
  def _convert_responses_items_to_completion_messages(messages: Messages) -> List[Dict[str, Any]]:
116
119
  """Convert responses_items message format to liteLLM completion format."""
117
120
  completion_messages = []
118
-
121
+ call_id_to_fn_name = {}
122
+
119
123
  for message in messages:
120
124
  msg_type = message.get("type")
121
125
  role = message.get("role")
@@ -193,6 +197,43 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
193
197
  "content": reasoning_text
194
198
  })
195
199
 
200
+ elif msg_type == "function_call":
201
+ fn_name = message.get("name")
202
+ fn_args = message.get("arguments", "{}")
203
+ call_id = message.get("call_id", "call_1")
204
+ call_id_to_fn_name[call_id] = fn_name
205
+ openai_tool_calls = [{
206
+ "id": call_id,
207
+ "type": "function",
208
+ "function": {
209
+ "name": fn_name,
210
+ "arguments": fn_args
211
+ }
212
+ }] # If the last completion message is an assistant message, extend the tool_calls
213
+ if completion_messages and completion_messages[-1].get("role") == "assistant":
214
+ if "tool_calls" not in completion_messages[-1]:
215
+ completion_messages[-1]["tool_calls"] = []
216
+ completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
217
+ else:
218
+ # Create new assistant message with tool calls
219
+ completion_messages.append({
220
+ "role": "assistant",
221
+ "content": None,
222
+ "tool_calls": openai_tool_calls
223
+ })
224
+
225
+ elif msg_type == "function_call_output":
226
+ call_id = message.get("call_id", "call_1")
227
+ fn_output = message.get("output", "")
228
+ fn_name = call_id_to_fn_name.get(call_id, "computer")
229
+
230
+ completion_messages.append({
231
+ "role": "function",
232
+ "name": fn_name,
233
+ "tool_call_id": call_id,
234
+ "content": str(fn_output)
235
+ })
236
+
196
237
  elif msg_type == "computer_call":
197
238
  # Computer call becomes tool use in assistant message
198
239
  action = message.get("action", {})
@@ -527,6 +568,26 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
527
568
  "action": "screenshot"
528
569
  }
529
570
  })
571
+ elif action_type == "left_mouse_down":
572
+ tool_use_content.append({
573
+ "type": "tool_use",
574
+ "id": call_id,
575
+ "name": "computer",
576
+ "input": {
577
+ "action": "left_mouse_down",
578
+ "coordinate": [action.get("x", None), action.get("y", None)]
579
+ }
580
+ })
581
+ elif action_type == "left_mouse_up":
582
+ tool_use_content.append({
583
+ "type": "tool_use",
584
+ "id": call_id,
585
+ "name": "computer",
586
+ "input": {
587
+ "action": "left_mouse_up",
588
+ "coordinate": [action.get("x", None), action.get("y", None)]
589
+ }
590
+ })
530
591
 
531
592
  # Convert tool_use_content to OpenAI tool_calls format
532
593
  openai_tool_calls = []
@@ -611,45 +672,350 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
611
672
  # Action reference:
612
673
  # https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#available-actions
613
674
 
675
+ try:
676
+ # Basic actions (all versions)
677
+ if action_type == "screenshot":
678
+ responses_items.append(make_screenshot_item(call_id=call_id))
679
+ elif action_type in ["click", "left_click"]:
680
+ coordinate = tool_input.get("coordinate", [0, 0])
681
+ responses_items.append(make_click_item(
682
+ x=coordinate[0] if len(coordinate) > 0 else 0,
683
+ y=coordinate[1] if len(coordinate) > 1 else 0,
684
+ call_id=call_id
685
+ ))
686
+ elif action_type in ["type", "type_text"]:
687
+ responses_items.append(make_type_item(
688
+ text=tool_input.get("text", ""),
689
+ call_id=call_id
690
+ ))
691
+ elif action_type in ["key", "keypress", "hotkey"]:
692
+ responses_items.append(make_keypress_item(
693
+ keys=tool_input.get("text", "").replace("+", "-").split("-"),
694
+ call_id=call_id
695
+ ))
696
+ elif action_type in ["mouse_move", "move_cursor", "move"]:
697
+ # Mouse move - create a custom action item
698
+ coordinate = tool_input.get("coordinate", [0, 0])
699
+ responses_items.append(
700
+ make_move_item(
701
+ x=coordinate[0] if len(coordinate) > 0 else 0,
702
+ y=coordinate[1] if len(coordinate) > 1 else 0,
703
+ call_id=call_id
704
+ )
705
+ )
706
+
707
+ # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
708
+ elif action_type == "scroll":
709
+ coordinate = tool_input.get("coordinate", [0, 0])
710
+ scroll_amount = tool_input.get("scroll_amount", 3)
711
+ scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \
712
+ -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0
713
+ scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \
714
+ -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0
715
+ responses_items.append(make_scroll_item(
716
+ x=coordinate[0] if len(coordinate) > 0 else 0,
717
+ y=coordinate[1] if len(coordinate) > 1 else 0,
718
+ scroll_x=scroll_x,
719
+ scroll_y=scroll_y,
720
+ call_id=call_id
721
+ ))
722
+ elif action_type in ["left_click_drag", "drag"]:
723
+ start_coord = tool_input.get("start_coordinate", [0, 0])
724
+ end_coord = tool_input.get("end_coordinate", [0, 0])
725
+ responses_items.append(make_drag_item(
726
+ path=[
727
+ {
728
+ "x": start_coord[0] if len(start_coord) > 0 else 0,
729
+ "y": start_coord[1] if len(start_coord) > 1 else 0
730
+ },
731
+ {
732
+ "x": end_coord[0] if len(end_coord) > 0 else 0,
733
+ "y": end_coord[1] if len(end_coord) > 1 else 0
734
+ }
735
+ ],
736
+ call_id=call_id
737
+ ))
738
+ elif action_type == "right_click":
739
+ coordinate = tool_input.get("coordinate", [0, 0])
740
+ responses_items.append(make_click_item(
741
+ x=coordinate[0] if len(coordinate) > 0 else 0,
742
+ y=coordinate[1] if len(coordinate) > 1 else 0,
743
+ button="right",
744
+ call_id=call_id
745
+ ))
746
+ elif action_type == "middle_click":
747
+ coordinate = tool_input.get("coordinate", [0, 0])
748
+ responses_items.append(make_click_item(
749
+ x=coordinate[0] if len(coordinate) > 0 else 0,
750
+ y=coordinate[1] if len(coordinate) > 1 else 0,
751
+ button="wheel",
752
+ call_id=call_id
753
+ ))
754
+ elif action_type == "double_click":
755
+ coordinate = tool_input.get("coordinate", [0, 0])
756
+ responses_items.append(make_double_click_item(
757
+ x=coordinate[0] if len(coordinate) > 0 else 0,
758
+ y=coordinate[1] if len(coordinate) > 1 else 0,
759
+ call_id=call_id
760
+ ))
761
+ elif action_type == "triple_click":
762
+ # coordinate = tool_input.get("coordinate", [0, 0])
763
+ # responses_items.append({
764
+ # "type": "computer_call",
765
+ # "call_id": call_id,
766
+ # "action": {
767
+ # "type": "triple_click",
768
+ # "x": coordinate[0] if len(coordinate) > 0 else 0,
769
+ # "y": coordinate[1] if len(coordinate) > 1 else 0
770
+ # }
771
+ # })
772
+ raise NotImplementedError("triple_click")
773
+ elif action_type == "left_mouse_down":
774
+ # coordinate = tool_input.get("coordinate", [0, 0])
775
+ # responses_items.append({
776
+ # "type": "computer_call",
777
+ # "call_id": call_id,
778
+ # "action": {
779
+ # "type": "mouse_down",
780
+ # "button": "left",
781
+ # "x": coordinate[0] if len(coordinate) > 0 else 0,
782
+ # "y": coordinate[1] if len(coordinate) > 1 else 0
783
+ # }
784
+ # })
785
+ coordinate = tool_input.get("coordinate", [None, None])
786
+ responses_items.append(make_left_mouse_down_item(
787
+ x=coordinate[0] if len(coordinate) > 0 else None,
788
+ y=coordinate[1] if len(coordinate) > 1 else None,
789
+ call_id=call_id
790
+ ))
791
+ elif action_type == "left_mouse_up":
792
+ # coordinate = tool_input.get("coordinate", [0, 0])
793
+ # responses_items.append({
794
+ # "type": "computer_call",
795
+ # "call_id": call_id,
796
+ # "action": {
797
+ # "type": "mouse_up",
798
+ # "button": "left",
799
+ # "x": coordinate[0] if len(coordinate) > 0 else 0,
800
+ # "y": coordinate[1] if len(coordinate) > 1 else 0
801
+ # }
802
+ # })
803
+ coordinate = tool_input.get("coordinate", [None, None])
804
+ responses_items.append(make_left_mouse_up_item(
805
+ x=coordinate[0] if len(coordinate) > 0 else None,
806
+ y=coordinate[1] if len(coordinate) > 1 else None,
807
+ call_id=call_id
808
+ ))
809
+ elif action_type == "hold_key":
810
+ # responses_items.append({
811
+ # "type": "computer_call",
812
+ # "call_id": call_id,
813
+ # "action": {
814
+ # "type": "key_hold",
815
+ # "key": tool_input.get("key", "")
816
+ # }
817
+ # })
818
+ raise NotImplementedError("hold_key")
819
+ elif action_type == "wait":
820
+ responses_items.append(make_wait_item(
821
+ call_id=call_id
822
+ ))
823
+ else:
824
+ raise ValueError(f"Unknown action type: {action_type}")
825
+ except Exception as e:
826
+ responses_items.extend(make_failed_tool_call_items(
827
+ tool_name="computer",
828
+ tool_kwargs=tool_input,
829
+ error_message=repr(e),
830
+ call_id=call_id
831
+ ))
832
+
833
+ # Handle tool calls (alternative format)
834
+ if hasattr(message, 'tool_calls') and message.tool_calls:
835
+ for tool_call in message.tool_calls:
836
+ if tool_call.function.name == "computer":
837
+ try:
838
+ try:
839
+ args = json.loads(tool_call.function.arguments)
840
+ action_type = args.get("action")
841
+ call_id = tool_call.id
842
+
614
843
  # Basic actions (all versions)
615
844
  if action_type == "screenshot":
616
- responses_items.append(make_screenshot_item(call_id=call_id))
845
+ # Input:
846
+ # {
847
+ # "function": {
848
+ # "name": "computer",
849
+ # "arguments": json.dumps({
850
+ # "action": "screenshot"
851
+ # })
852
+ # },
853
+ # "id": "call_1",
854
+ # "type": "function"
855
+ # }
856
+
857
+ # Output:
858
+ # {
859
+ # "type": "computer_call",
860
+ # "call_id": "call_1",
861
+ # "action": {
862
+ # "type": "screenshot"
863
+ # }
864
+ # }
865
+ responses_items.append(make_screenshot_item(
866
+ call_id=call_id
867
+ ))
617
868
  elif action_type in ["click", "left_click"]:
618
- coordinate = tool_input.get("coordinate", [0, 0])
869
+ # Input:
870
+ # {
871
+ # "function": {
872
+ # "name": "computer",
873
+ # "arguments": json.dumps({
874
+ # "action": "click",
875
+ # "coordinate": [100, 200]
876
+ # })
877
+ # },
878
+ # "id": "call_1",
879
+ # "type": "function"
880
+ # }
881
+
882
+ # Output:
883
+ # {
884
+ # "type": "computer_call",
885
+ # "call_id": "call_1",
886
+ # "action": {
887
+ # "type": "click",
888
+ # "x": 100,
889
+ # "y": 200
890
+ # }
891
+ # }
892
+ coordinate = args.get("coordinate", [0, 0])
619
893
  responses_items.append(make_click_item(
620
894
  x=coordinate[0] if len(coordinate) > 0 else 0,
621
895
  y=coordinate[1] if len(coordinate) > 1 else 0,
622
896
  call_id=call_id
623
897
  ))
624
898
  elif action_type in ["type", "type_text"]:
899
+ # Input:
900
+ # {
901
+ # "function": {
902
+ # "name": "computer",
903
+ # "arguments": json.dumps({
904
+ # "action": "type",
905
+ # "text": "Hello World"
906
+ # })
907
+ # },
908
+ # "id": "call_1",
909
+ # "type": "function"
910
+ # }
911
+
912
+ # Output:
913
+ # {
914
+ # "type": "computer_call",
915
+ # "call_id": "call_1",
916
+ # "action": {
917
+ # "type": "type",
918
+ # "text": "Hello World"
919
+ # }
920
+ # }
625
921
  responses_items.append(make_type_item(
626
- text=tool_input.get("text", ""),
922
+ text=args.get("text", ""),
627
923
  call_id=call_id
628
924
  ))
629
925
  elif action_type in ["key", "keypress", "hotkey"]:
926
+ # Input:
927
+ # {
928
+ # "function": {
929
+ # "name": "computer",
930
+ # "arguments": json.dumps({
931
+ # "action": "key",
932
+ # "text": "ctrl+c"
933
+ # })
934
+ # },
935
+ # "id": "call_1",
936
+ # "type": "function"
937
+ # }
938
+
939
+ # Output:
940
+ # {
941
+ # "type": "computer_call",
942
+ # "call_id": "call_1",
943
+ # "action": {
944
+ # "type": "keypress",
945
+ # "keys": ["ctrl", "c"]
946
+ # }
947
+ # }
630
948
  responses_items.append(make_keypress_item(
631
- keys=tool_input.get("text", "").replace("+", "-").split("-"),
949
+ keys=args.get("text", "").replace("+", "-").split("-"),
632
950
  call_id=call_id
633
951
  ))
634
952
  elif action_type in ["mouse_move", "move_cursor", "move"]:
635
- # Mouse move - create a custom action item
636
- coordinate = tool_input.get("coordinate", [0, 0])
637
- responses_items.append(
638
- make_move_item(
639
- x=coordinate[0] if len(coordinate) > 0 else 0,
640
- y=coordinate[1] if len(coordinate) > 1 else 0,
641
- call_id=call_id
642
- )
643
- )
953
+ # Input:
954
+ # {
955
+ # "function": {
956
+ # "name": "computer",
957
+ # "arguments": json.dumps({
958
+ # "action": "mouse_move",
959
+ # "coordinate": [150, 250]
960
+ # })
961
+ # },
962
+ # "id": "call_1",
963
+ # "type": "function"
964
+ # }
965
+
966
+ # Output:
967
+ # {
968
+ # "type": "computer_call",
969
+ # "call_id": "call_1",
970
+ # "action": {
971
+ # "type": "mouse_move",
972
+ # "x": 150,
973
+ # "y": 250
974
+ # }
975
+ # }
976
+ coordinate = args.get("coordinate", [0, 0])
977
+ responses_items.append(make_move_item(
978
+ x=coordinate[0] if len(coordinate) > 0 else 0,
979
+ y=coordinate[1] if len(coordinate) > 1 else 0,
980
+ call_id=call_id
981
+ ))
644
982
 
645
983
  # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
646
984
  elif action_type == "scroll":
647
- coordinate = tool_input.get("coordinate", [0, 0])
648
- scroll_amount = tool_input.get("scroll_amount", 3)
649
- scroll_x = scroll_amount if tool_input.get("scroll_direction", "down") == "right" else \
650
- -scroll_amount if tool_input.get("scroll_direction", "down") == "left" else 0
651
- scroll_y = scroll_amount if tool_input.get("scroll_direction", "down") == "down" else \
652
- -scroll_amount if tool_input.get("scroll_direction", "down") == "up" else 0
985
+ # Input:
986
+ # {
987
+ # "function": {
988
+ # "name": "computer",
989
+ # "arguments": json.dumps({
990
+ # "action": "scroll",
991
+ # "coordinate": [300, 400],
992
+ # "scroll_direction": "down",
993
+ # "scroll_amount": 5
994
+ # })
995
+ # },
996
+ # "id": "call_1",
997
+ # "type": "function"
998
+ # }
999
+
1000
+ # Output:
1001
+ # {
1002
+ # "type": "computer_call",
1003
+ # "call_id": "call_1",
1004
+ # "action": {
1005
+ # "type": "scroll",
1006
+ # "x": 300,
1007
+ # "y": 400,
1008
+ # "scroll_x": 0,
1009
+ # "scroll_y": -5
1010
+ # }
1011
+ # }
1012
+ coordinate = args.get("coordinate", [0, 0])
1013
+ direction = args.get("scroll_direction", "down")
1014
+ amount = args.get("scroll_amount", 3)
1015
+ scroll_x = amount if direction == "left" else \
1016
+ -amount if direction == "right" else 0
1017
+ scroll_y = amount if direction == "up" else \
1018
+ -amount if direction == "down" else 0
653
1019
  responses_items.append(make_scroll_item(
654
1020
  x=coordinate[0] if len(coordinate) > 0 else 0,
655
1021
  y=coordinate[1] if len(coordinate) > 1 else 0,
@@ -658,8 +1024,34 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
658
1024
  call_id=call_id
659
1025
  ))
660
1026
  elif action_type in ["left_click_drag", "drag"]:
661
- start_coord = tool_input.get("start_coordinate", [0, 0])
662
- end_coord = tool_input.get("end_coordinate", [0, 0])
1027
+ # Input:
1028
+ # {
1029
+ # "function": {
1030
+ # "name": "computer",
1031
+ # "arguments": json.dumps({
1032
+ # "action": "left_click_drag",
1033
+ # "start_coordinate": [100, 150],
1034
+ # "end_coordinate": [200, 250]
1035
+ # })
1036
+ # },
1037
+ # "id": "call_1",
1038
+ # "type": "function"
1039
+ # }
1040
+
1041
+ # Output:
1042
+ # {
1043
+ # "type": "computer_call",
1044
+ # "call_id": "call_1",
1045
+ # "action": {
1046
+ # "type": "drag",
1047
+ # "path": [
1048
+ # {"x": 100, "y": 150},
1049
+ # {"x": 200, "y": 250}
1050
+ # ]
1051
+ # }
1052
+ # }
1053
+ start_coord = args.get("start_coordinate", [0, 0])
1054
+ end_coord = args.get("end_coordinate", [0, 0])
663
1055
  responses_items.append(make_drag_item(
664
1056
  path=[
665
1057
  {
@@ -674,7 +1066,31 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
674
1066
  call_id=call_id
675
1067
  ))
676
1068
  elif action_type == "right_click":
677
- coordinate = tool_input.get("coordinate", [0, 0])
1069
+ # Input:
1070
+ # {
1071
+ # "function": {
1072
+ # "name": "computer",
1073
+ # "arguments": json.dumps({
1074
+ # "action": "right_click",
1075
+ # "coordinate": [120, 180]
1076
+ # })
1077
+ # },
1078
+ # "id": "call_1",
1079
+ # "type": "function"
1080
+ # }
1081
+
1082
+ # Output:
1083
+ # {
1084
+ # "type": "computer_call",
1085
+ # "call_id": "call_1",
1086
+ # "action": {
1087
+ # "type": "click",
1088
+ # "x": 120,
1089
+ # "y": 180,
1090
+ # "button": "right"
1091
+ # }
1092
+ # }
1093
+ coordinate = args.get("coordinate", [0, 0])
678
1094
  responses_items.append(make_click_item(
679
1095
  x=coordinate[0] if len(coordinate) > 0 else 0,
680
1096
  y=coordinate[1] if len(coordinate) > 1 else 0,
@@ -682,7 +1098,31 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
682
1098
  call_id=call_id
683
1099
  ))
684
1100
  elif action_type == "middle_click":
685
- coordinate = tool_input.get("coordinate", [0, 0])
1101
+ # Input:
1102
+ # {
1103
+ # "function": {
1104
+ # "name": "computer",
1105
+ # "arguments": json.dumps({
1106
+ # "action": "middle_click",
1107
+ # "coordinate": [140, 220]
1108
+ # })
1109
+ # },
1110
+ # "id": "call_1",
1111
+ # "type": "function"
1112
+ # }
1113
+
1114
+ # Output:
1115
+ # {
1116
+ # "type": "computer_call",
1117
+ # "call_id": "call_1",
1118
+ # "action": {
1119
+ # "type": "click",
1120
+ # "x": 140,
1121
+ # "y": 220,
1122
+ # "button": "wheel"
1123
+ # }
1124
+ # }
1125
+ coordinate = args.get("coordinate", [0, 0])
686
1126
  responses_items.append(make_click_item(
687
1127
  x=coordinate[0] if len(coordinate) > 0 else 0,
688
1128
  y=coordinate[1] if len(coordinate) > 1 else 0,
@@ -690,518 +1130,175 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
690
1130
  call_id=call_id
691
1131
  ))
692
1132
  elif action_type == "double_click":
693
- coordinate = tool_input.get("coordinate", [0, 0])
1133
+ # Input:
1134
+ # {
1135
+ # "function": {
1136
+ # "name": "computer",
1137
+ # "arguments": json.dumps({
1138
+ # "action": "double_click",
1139
+ # "coordinate": [160, 240]
1140
+ # })
1141
+ # },
1142
+ # "id": "call_1",
1143
+ # "type": "function"
1144
+ # }
1145
+
1146
+ # Output:
1147
+ # {
1148
+ # "type": "computer_call",
1149
+ # "call_id": "call_1",
1150
+ # "action": {
1151
+ # "type": "double_click",
1152
+ # "x": 160,
1153
+ # "y": 240
1154
+ # }
1155
+ # }
1156
+ coordinate = args.get("coordinate", [0, 0])
694
1157
  responses_items.append(make_double_click_item(
695
1158
  x=coordinate[0] if len(coordinate) > 0 else 0,
696
1159
  y=coordinate[1] if len(coordinate) > 1 else 0,
697
1160
  call_id=call_id
698
1161
  ))
699
1162
  elif action_type == "triple_click":
700
- # coordinate = tool_input.get("coordinate", [0, 0])
701
- # responses_items.append({
1163
+ # Input:
1164
+ # {
1165
+ # "function": {
1166
+ # "name": "computer",
1167
+ # "arguments": json.dumps({
1168
+ # "action": "triple_click",
1169
+ # "coordinate": [180, 260]
1170
+ # })
1171
+ # },
1172
+ # "id": "call_1",
1173
+ # "type": "function"
1174
+ # }
1175
+
1176
+ # Output:
1177
+ # {
702
1178
  # "type": "computer_call",
703
- # "call_id": call_id,
1179
+ # "call_id": "call_1",
704
1180
  # "action": {
705
1181
  # "type": "triple_click",
706
- # "x": coordinate[0] if len(coordinate) > 0 else 0,
707
- # "y": coordinate[1] if len(coordinate) > 1 else 0
1182
+ # "x": 180,
1183
+ # "y": 260
708
1184
  # }
709
- # })
1185
+ # }
710
1186
  raise NotImplementedError("triple_click")
711
1187
  elif action_type == "left_mouse_down":
712
- # coordinate = tool_input.get("coordinate", [0, 0])
713
- # responses_items.append({
1188
+ # Input:
1189
+ # {
1190
+ # "function": {
1191
+ # "name": "computer",
1192
+ # "arguments": json.dumps({
1193
+ # "action": "left_mouse_down",
1194
+ # "coordinate": [200, 280]
1195
+ # })
1196
+ # },
1197
+ # "id": "call_1",
1198
+ # "type": "function"
1199
+ # }
1200
+
1201
+ # Output:
1202
+ # {
714
1203
  # "type": "computer_call",
715
- # "call_id": call_id,
1204
+ # "call_id": "call_1",
716
1205
  # "action": {
717
1206
  # "type": "mouse_down",
718
1207
  # "button": "left",
719
- # "x": coordinate[0] if len(coordinate) > 0 else 0,
720
- # "y": coordinate[1] if len(coordinate) > 1 else 0
1208
+ # "x": 200,
1209
+ # "y": 280
721
1210
  # }
722
- # })
723
- raise NotImplementedError("left_mouse_down")
1211
+ # }
1212
+ coordinate = args.get("coordinate", [None, None])
1213
+ responses_items.append(make_left_mouse_down_item(
1214
+ x=coordinate[0] if len(coordinate) > 0 else None,
1215
+ y=coordinate[1] if len(coordinate) > 1 else None,
1216
+ call_id=call_id
1217
+ ))
724
1218
  elif action_type == "left_mouse_up":
725
- # coordinate = tool_input.get("coordinate", [0, 0])
726
- # responses_items.append({
1219
+ # Input:
1220
+ # {
1221
+ # "function": {
1222
+ # "name": "computer",
1223
+ # "arguments": json.dumps({
1224
+ # "action": "left_mouse_up",
1225
+ # "coordinate": [220, 300]
1226
+ # })
1227
+ # },
1228
+ # "id": "call_1",
1229
+ # "type": "function"
1230
+ # }
1231
+
1232
+ # Output:
1233
+ # {
727
1234
  # "type": "computer_call",
728
- # "call_id": call_id,
1235
+ # "call_id": "call_1",
729
1236
  # "action": {
730
1237
  # "type": "mouse_up",
731
1238
  # "button": "left",
732
- # "x": coordinate[0] if len(coordinate) > 0 else 0,
733
- # "y": coordinate[1] if len(coordinate) > 1 else 0
1239
+ # "x": 220,
1240
+ # "y": 300
734
1241
  # }
735
- # })
736
- raise NotImplementedError("left_mouse_up")
1242
+ # }
1243
+ coordinate = args.get("coordinate", [None, None])
1244
+ responses_items.append(make_left_mouse_up_item(
1245
+ x=coordinate[0] if len(coordinate) > 0 else None,
1246
+ y=coordinate[1] if len(coordinate) > 1 else None,
1247
+ call_id=call_id
1248
+ ))
737
1249
  elif action_type == "hold_key":
738
- # responses_items.append({
1250
+ # Input:
1251
+ # {
1252
+ # "function": {
1253
+ # "name": "computer",
1254
+ # "arguments": json.dumps({
1255
+ # "action": "hold_key",
1256
+ # "key": "shift"
1257
+ # })
1258
+ # },
1259
+ # "id": "call_1",
1260
+ # "type": "function"
1261
+ # }
1262
+
1263
+ # Output:
1264
+ # {
739
1265
  # "type": "computer_call",
740
- # "call_id": call_id,
1266
+ # "call_id": "call_1",
741
1267
  # "action": {
742
1268
  # "type": "key_hold",
743
- # "key": tool_input.get("key", "")
1269
+ # "key": "shift"
744
1270
  # }
745
- # })
1271
+ # }
746
1272
  raise NotImplementedError("hold_key")
747
1273
  elif action_type == "wait":
1274
+ # Input:
1275
+ # {
1276
+ # "function": {
1277
+ # "name": "computer",
1278
+ # "arguments": json.dumps({
1279
+ # "action": "wait"
1280
+ # })
1281
+ # },
1282
+ # "id": "call_1",
1283
+ # "type": "function"
1284
+ # }
1285
+
1286
+ # Output:
1287
+ # {
1288
+ # "type": "computer_call",
1289
+ # "call_id": "call_1",
1290
+ # "action": {
1291
+ # "type": "wait"
1292
+ # }
1293
+ # }
748
1294
  responses_items.append(make_wait_item(
749
1295
  call_id=call_id
750
1296
  ))
751
- else:
752
- raise ValueError(f"Unknown action type: {action_type}")
753
-
754
- # Handle tool calls (alternative format)
755
- if hasattr(message, 'tool_calls') and message.tool_calls:
756
- for tool_call in message.tool_calls:
757
- if tool_call.function.name == "computer":
758
- try:
759
- args = json.loads(tool_call.function.arguments)
760
- action_type = args.get("action")
761
- call_id = tool_call.id
762
-
763
- # Basic actions (all versions)
764
- if action_type == "screenshot":
765
- # Input:
766
- # {
767
- # "function": {
768
- # "name": "computer",
769
- # "arguments": json.dumps({
770
- # "action": "screenshot"
771
- # })
772
- # },
773
- # "id": "call_1",
774
- # "type": "function"
775
- # }
776
-
777
- # Output:
778
- # {
779
- # "type": "computer_call",
780
- # "call_id": "call_1",
781
- # "action": {
782
- # "type": "screenshot"
783
- # }
784
- # }
785
- responses_items.append(make_screenshot_item(
786
- call_id=call_id
787
- ))
788
- elif action_type in ["click", "left_click"]:
789
- # Input:
790
- # {
791
- # "function": {
792
- # "name": "computer",
793
- # "arguments": json.dumps({
794
- # "action": "click",
795
- # "coordinate": [100, 200]
796
- # })
797
- # },
798
- # "id": "call_1",
799
- # "type": "function"
800
- # }
801
-
802
- # Output:
803
- # {
804
- # "type": "computer_call",
805
- # "call_id": "call_1",
806
- # "action": {
807
- # "type": "click",
808
- # "x": 100,
809
- # "y": 200
810
- # }
811
- # }
812
- coordinate = args.get("coordinate", [0, 0])
813
- responses_items.append(make_click_item(
814
- x=coordinate[0] if len(coordinate) > 0 else 0,
815
- y=coordinate[1] if len(coordinate) > 1 else 0,
816
- call_id=call_id
817
- ))
818
- elif action_type in ["type", "type_text"]:
819
- # Input:
820
- # {
821
- # "function": {
822
- # "name": "computer",
823
- # "arguments": json.dumps({
824
- # "action": "type",
825
- # "text": "Hello World"
826
- # })
827
- # },
828
- # "id": "call_1",
829
- # "type": "function"
830
- # }
831
-
832
- # Output:
833
- # {
834
- # "type": "computer_call",
835
- # "call_id": "call_1",
836
- # "action": {
837
- # "type": "type",
838
- # "text": "Hello World"
839
- # }
840
- # }
841
- responses_items.append(make_type_item(
842
- text=args.get("text", ""),
843
- call_id=call_id
844
- ))
845
- elif action_type in ["key", "keypress", "hotkey"]:
846
- # Input:
847
- # {
848
- # "function": {
849
- # "name": "computer",
850
- # "arguments": json.dumps({
851
- # "action": "key",
852
- # "text": "ctrl+c"
853
- # })
854
- # },
855
- # "id": "call_1",
856
- # "type": "function"
857
- # }
858
-
859
- # Output:
860
- # {
861
- # "type": "computer_call",
862
- # "call_id": "call_1",
863
- # "action": {
864
- # "type": "keypress",
865
- # "keys": ["ctrl", "c"]
866
- # }
867
- # }
868
- responses_items.append(make_keypress_item(
869
- keys=args.get("text", "").replace("+", "-").split("-"),
870
- call_id=call_id
871
- ))
872
- elif action_type in ["mouse_move", "move_cursor", "move"]:
873
- # Input:
874
- # {
875
- # "function": {
876
- # "name": "computer",
877
- # "arguments": json.dumps({
878
- # "action": "mouse_move",
879
- # "coordinate": [150, 250]
880
- # })
881
- # },
882
- # "id": "call_1",
883
- # "type": "function"
884
- # }
885
-
886
- # Output:
887
- # {
888
- # "type": "computer_call",
889
- # "call_id": "call_1",
890
- # "action": {
891
- # "type": "mouse_move",
892
- # "x": 150,
893
- # "y": 250
894
- # }
895
- # }
896
- coordinate = args.get("coordinate", [0, 0])
897
- responses_items.append(make_move_item(
898
- x=coordinate[0] if len(coordinate) > 0 else 0,
899
- y=coordinate[1] if len(coordinate) > 1 else 0,
900
- call_id=call_id
901
- ))
902
-
903
- # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
904
- elif action_type == "scroll":
905
- # Input:
906
- # {
907
- # "function": {
908
- # "name": "computer",
909
- # "arguments": json.dumps({
910
- # "action": "scroll",
911
- # "coordinate": [300, 400],
912
- # "scroll_direction": "down",
913
- # "scroll_amount": 5
914
- # })
915
- # },
916
- # "id": "call_1",
917
- # "type": "function"
918
- # }
919
-
920
- # Output:
921
- # {
922
- # "type": "computer_call",
923
- # "call_id": "call_1",
924
- # "action": {
925
- # "type": "scroll",
926
- # "x": 300,
927
- # "y": 400,
928
- # "scroll_x": 0,
929
- # "scroll_y": -5
930
- # }
931
- # }
932
- coordinate = args.get("coordinate", [0, 0])
933
- direction = args.get("scroll_direction", "down")
934
- amount = args.get("scroll_amount", 3)
935
- scroll_x = amount if direction == "left" else \
936
- -amount if direction == "right" else 0
937
- scroll_y = amount if direction == "up" else \
938
- -amount if direction == "down" else 0
939
- responses_items.append(make_scroll_item(
940
- x=coordinate[0] if len(coordinate) > 0 else 0,
941
- y=coordinate[1] if len(coordinate) > 1 else 0,
942
- scroll_x=scroll_x,
943
- scroll_y=scroll_y,
944
- call_id=call_id
945
- ))
946
- elif action_type in ["left_click_drag", "drag"]:
947
- # Input:
948
- # {
949
- # "function": {
950
- # "name": "computer",
951
- # "arguments": json.dumps({
952
- # "action": "left_click_drag",
953
- # "start_coordinate": [100, 150],
954
- # "end_coordinate": [200, 250]
955
- # })
956
- # },
957
- # "id": "call_1",
958
- # "type": "function"
959
- # }
960
-
961
- # Output:
962
- # {
963
- # "type": "computer_call",
964
- # "call_id": "call_1",
965
- # "action": {
966
- # "type": "drag",
967
- # "path": [
968
- # {"x": 100, "y": 150},
969
- # {"x": 200, "y": 250}
970
- # ]
971
- # }
972
- # }
973
- start_coord = args.get("start_coordinate", [0, 0])
974
- end_coord = args.get("end_coordinate", [0, 0])
975
- responses_items.append(make_drag_item(
976
- path=[
977
- {
978
- "x": start_coord[0] if len(start_coord) > 0 else 0,
979
- "y": start_coord[1] if len(start_coord) > 1 else 0
980
- },
981
- {
982
- "x": end_coord[0] if len(end_coord) > 0 else 0,
983
- "y": end_coord[1] if len(end_coord) > 1 else 0
984
- }
985
- ],
986
- call_id=call_id
987
- ))
988
- elif action_type == "right_click":
989
- # Input:
990
- # {
991
- # "function": {
992
- # "name": "computer",
993
- # "arguments": json.dumps({
994
- # "action": "right_click",
995
- # "coordinate": [120, 180]
996
- # })
997
- # },
998
- # "id": "call_1",
999
- # "type": "function"
1000
- # }
1001
-
1002
- # Output:
1003
- # {
1004
- # "type": "computer_call",
1005
- # "call_id": "call_1",
1006
- # "action": {
1007
- # "type": "click",
1008
- # "x": 120,
1009
- # "y": 180,
1010
- # "button": "right"
1011
- # }
1012
- # }
1013
- coordinate = args.get("coordinate", [0, 0])
1014
- responses_items.append(make_click_item(
1015
- x=coordinate[0] if len(coordinate) > 0 else 0,
1016
- y=coordinate[1] if len(coordinate) > 1 else 0,
1017
- button="right",
1018
- call_id=call_id
1019
- ))
1020
- elif action_type == "middle_click":
1021
- # Input:
1022
- # {
1023
- # "function": {
1024
- # "name": "computer",
1025
- # "arguments": json.dumps({
1026
- # "action": "middle_click",
1027
- # "coordinate": [140, 220]
1028
- # })
1029
- # },
1030
- # "id": "call_1",
1031
- # "type": "function"
1032
- # }
1033
-
1034
- # Output:
1035
- # {
1036
- # "type": "computer_call",
1037
- # "call_id": "call_1",
1038
- # "action": {
1039
- # "type": "click",
1040
- # "x": 140,
1041
- # "y": 220,
1042
- # "button": "wheel"
1043
- # }
1044
- # }
1045
- coordinate = args.get("coordinate", [0, 0])
1046
- responses_items.append(make_click_item(
1047
- x=coordinate[0] if len(coordinate) > 0 else 0,
1048
- y=coordinate[1] if len(coordinate) > 1 else 0,
1049
- button="wheel",
1050
- call_id=call_id
1051
- ))
1052
- elif action_type == "double_click":
1053
- # Input:
1054
- # {
1055
- # "function": {
1056
- # "name": "computer",
1057
- # "arguments": json.dumps({
1058
- # "action": "double_click",
1059
- # "coordinate": [160, 240]
1060
- # })
1061
- # },
1062
- # "id": "call_1",
1063
- # "type": "function"
1064
- # }
1065
-
1066
- # Output:
1067
- # {
1068
- # "type": "computer_call",
1069
- # "call_id": "call_1",
1070
- # "action": {
1071
- # "type": "double_click",
1072
- # "x": 160,
1073
- # "y": 240
1074
- # }
1075
- # }
1076
- coordinate = args.get("coordinate", [0, 0])
1077
- responses_items.append(make_double_click_item(
1078
- x=coordinate[0] if len(coordinate) > 0 else 0,
1079
- y=coordinate[1] if len(coordinate) > 1 else 0,
1080
- call_id=call_id
1081
- ))
1082
- elif action_type == "triple_click":
1083
- # Input:
1084
- # {
1085
- # "function": {
1086
- # "name": "computer",
1087
- # "arguments": json.dumps({
1088
- # "action": "triple_click",
1089
- # "coordinate": [180, 260]
1090
- # })
1091
- # },
1092
- # "id": "call_1",
1093
- # "type": "function"
1094
- # }
1095
-
1096
- # Output:
1097
- # {
1098
- # "type": "computer_call",
1099
- # "call_id": "call_1",
1100
- # "action": {
1101
- # "type": "triple_click",
1102
- # "x": 180,
1103
- # "y": 260
1104
- # }
1105
- # }
1106
- raise NotImplementedError("triple_click")
1107
- elif action_type == "left_mouse_down":
1108
- # Input:
1109
- # {
1110
- # "function": {
1111
- # "name": "computer",
1112
- # "arguments": json.dumps({
1113
- # "action": "left_mouse_down",
1114
- # "coordinate": [200, 280]
1115
- # })
1116
- # },
1117
- # "id": "call_1",
1118
- # "type": "function"
1119
- # }
1120
-
1121
- # Output:
1122
- # {
1123
- # "type": "computer_call",
1124
- # "call_id": "call_1",
1125
- # "action": {
1126
- # "type": "mouse_down",
1127
- # "button": "left",
1128
- # "x": 200,
1129
- # "y": 280
1130
- # }
1131
- # }
1132
- raise NotImplementedError("left_mouse_down")
1133
- elif action_type == "left_mouse_up":
1134
- # Input:
1135
- # {
1136
- # "function": {
1137
- # "name": "computer",
1138
- # "arguments": json.dumps({
1139
- # "action": "left_mouse_up",
1140
- # "coordinate": [220, 300]
1141
- # })
1142
- # },
1143
- # "id": "call_1",
1144
- # "type": "function"
1145
- # }
1146
-
1147
- # Output:
1148
- # {
1149
- # "type": "computer_call",
1150
- # "call_id": "call_1",
1151
- # "action": {
1152
- # "type": "mouse_up",
1153
- # "button": "left",
1154
- # "x": 220,
1155
- # "y": 300
1156
- # }
1157
- # }
1158
- raise NotImplementedError("left_mouse_up")
1159
- elif action_type == "hold_key":
1160
- # Input:
1161
- # {
1162
- # "function": {
1163
- # "name": "computer",
1164
- # "arguments": json.dumps({
1165
- # "action": "hold_key",
1166
- # "key": "shift"
1167
- # })
1168
- # },
1169
- # "id": "call_1",
1170
- # "type": "function"
1171
- # }
1172
-
1173
- # Output:
1174
- # {
1175
- # "type": "computer_call",
1176
- # "call_id": "call_1",
1177
- # "action": {
1178
- # "type": "key_hold",
1179
- # "key": "shift"
1180
- # }
1181
- # }
1182
- raise NotImplementedError("hold_key")
1183
- elif action_type == "wait":
1184
- # Input:
1185
- # {
1186
- # "function": {
1187
- # "name": "computer",
1188
- # "arguments": json.dumps({
1189
- # "action": "wait"
1190
- # })
1191
- # },
1192
- # "id": "call_1",
1193
- # "type": "function"
1194
- # }
1195
-
1196
- # Output:
1197
- # {
1198
- # "type": "computer_call",
1199
- # "call_id": "call_1",
1200
- # "action": {
1201
- # "type": "wait"
1202
- # }
1203
- # }
1204
- responses_items.append(make_wait_item(
1297
+ except Exception as e:
1298
+ responses_items.extend(make_failed_tool_call_items(
1299
+ tool_name="computer",
1300
+ tool_kwargs=args,
1301
+ error_message=repr(e),
1205
1302
  call_id=call_id
1206
1303
  ))
1207
1304
  except json.JSONDecodeError: