wcgw 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wcgw might be problematic. Click here for more details.
- wcgw/__init__.py +1 -0
- wcgw/client/__main__.py +2 -2
- wcgw/client/anthropic_client.py +83 -38
- wcgw/client/computer_use.py +416 -0
- wcgw/client/mcp_server/Readme.md +73 -0
- wcgw/client/mcp_server/__init__.py +11 -0
- wcgw/client/mcp_server/server.py +283 -0
- wcgw/client/openai_client.py +3 -3
- wcgw/client/sys_utils.py +40 -0
- wcgw/client/tools.py +184 -88
- wcgw/relay/serve.py +5 -12
- wcgw/types_.py +42 -6
- {wcgw-1.3.0.dist-info → wcgw-1.5.0.dist-info}/METADATA +75 -24
- wcgw-1.5.0.dist-info/RECORD +22 -0
- {wcgw-1.3.0.dist-info → wcgw-1.5.0.dist-info}/entry_points.txt +1 -0
- wcgw-1.3.0.dist-info/RECORD +0 -17
- {wcgw-1.3.0.dist-info → wcgw-1.5.0.dist-info}/WHEEL +0 -0
wcgw/__init__.py
CHANGED
wcgw/client/__main__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
from .
|
|
1
|
+
from .cli import app
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
app()
|
wcgw/client/anthropic_client.py
CHANGED
|
@@ -27,15 +27,19 @@ from ..types_ import (
|
|
|
27
27
|
CreateFileNew,
|
|
28
28
|
FileEditFindReplace,
|
|
29
29
|
FileEdit,
|
|
30
|
+
Keyboard,
|
|
31
|
+
Mouse,
|
|
30
32
|
ReadFile,
|
|
31
33
|
ReadImage,
|
|
32
|
-
Writefile,
|
|
33
34
|
ResetShell,
|
|
35
|
+
ScreenShot,
|
|
36
|
+
GetScreenInfo,
|
|
34
37
|
)
|
|
35
38
|
|
|
36
39
|
from .common import Models, discard_input
|
|
37
40
|
from .common import CostData
|
|
38
41
|
from .tools import ImageData
|
|
42
|
+
from .computer_use import Computer
|
|
39
43
|
|
|
40
44
|
from .tools import (
|
|
41
45
|
DoneFlag,
|
|
@@ -166,6 +170,7 @@ def loop(
|
|
|
166
170
|
- The first line might be `(...truncated)` if the output is too long.
|
|
167
171
|
- Always run `pwd` if you get any file or directory not found error to make sure you're not lost.
|
|
168
172
|
- The control will return to you in 5 seconds regardless of the status. For heavy commands, keep checking status using BashInteraction till they are finished.
|
|
173
|
+
- Run long running commands in background using screen instead of "&".
|
|
169
174
|
""",
|
|
170
175
|
),
|
|
171
176
|
ToolParam(
|
|
@@ -192,7 +197,6 @@ def loop(
|
|
|
192
197
|
name="CreateFileNew",
|
|
193
198
|
description="""
|
|
194
199
|
- Write content to a new file. Provide file path and content. Use this instead of BashCommand for writing new files.
|
|
195
|
-
- This doesn't create any directories, please create directories using `mkdir -p` BashCommand.
|
|
196
200
|
- Provide absolute file path only.
|
|
197
201
|
- For editing existing files, use FileEdit instead of this tool.
|
|
198
202
|
""",
|
|
@@ -205,7 +209,7 @@ def loop(
|
|
|
205
209
|
ToolParam(
|
|
206
210
|
input_schema=ResetShell.model_json_schema(),
|
|
207
211
|
name="ResetShell",
|
|
208
|
-
description="Resets the shell. Use only if all interrupts and prompt reset attempts have failed repeatedly
|
|
212
|
+
description="Resets the shell. Use only if all interrupts and prompt reset attempts have failed repeatedly.\nAlso exits the docker environment.\nYou need to call GetScreenInfo again",
|
|
209
213
|
),
|
|
210
214
|
ToolParam(
|
|
211
215
|
input_schema=FileEdit.model_json_schema(),
|
|
@@ -213,6 +217,46 @@ def loop(
|
|
|
213
217
|
description="""
|
|
214
218
|
- Use absolute file path only.
|
|
215
219
|
- Use SEARCH/REPLACE blocks to edit the file.
|
|
220
|
+
""",
|
|
221
|
+
),
|
|
222
|
+
ToolParam(
|
|
223
|
+
input_schema=GetScreenInfo.model_json_schema(),
|
|
224
|
+
name="GetScreenInfo",
|
|
225
|
+
description="""
|
|
226
|
+
- Get display information of an OS running on docker using image "ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest"
|
|
227
|
+
- If user hasn't provided docker image id, check using `docker ps` and provide the id.
|
|
228
|
+
- Important: call this first in the conversation before ScreenShot, Mouse, and Keyboard tools.
|
|
229
|
+
- Connects shell to the docker environment.
|
|
230
|
+
- Note: once this is called, the shell enters the docker environment. All bash commands will run over there.
|
|
231
|
+
""",
|
|
232
|
+
),
|
|
233
|
+
ToolParam(
|
|
234
|
+
input_schema=ScreenShot.model_json_schema(),
|
|
235
|
+
name="ScreenShot",
|
|
236
|
+
description="""
|
|
237
|
+
- Capture screenshot of an OS running on docker using image "ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest"
|
|
238
|
+
- If user hasn't provided docker image id, check using `docker ps` and provide the id.
|
|
239
|
+
- Capture ScreenShot of the current screen for automation.
|
|
240
|
+
""",
|
|
241
|
+
),
|
|
242
|
+
ToolParam(
|
|
243
|
+
input_schema=Mouse.model_json_schema(),
|
|
244
|
+
name="Mouse",
|
|
245
|
+
description="""
|
|
246
|
+
- Interact with docker container running image "ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest"
|
|
247
|
+
- If user hasn't provided docker image id, check using `docker ps` and provide the id.
|
|
248
|
+
- Interact with the screen using mouse
|
|
249
|
+
""",
|
|
250
|
+
),
|
|
251
|
+
ToolParam(
|
|
252
|
+
input_schema=Keyboard.model_json_schema(),
|
|
253
|
+
name="Keyboard",
|
|
254
|
+
description="""
|
|
255
|
+
- Interact with docker container running image "ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest"
|
|
256
|
+
- If user hasn't provided docker image id, check using `docker ps` and provide the id.
|
|
257
|
+
- Emulate keyboard input to the screen
|
|
258
|
+
- Uses xdootool to send keyboard input, keys like Return, BackSpace, Escape, Page_Up, etc. can be used.
|
|
259
|
+
- Do not use it to interact with Bash tool.
|
|
216
260
|
""",
|
|
217
261
|
),
|
|
218
262
|
]
|
|
@@ -358,7 +402,7 @@ System information:
|
|
|
358
402
|
}
|
|
359
403
|
)
|
|
360
404
|
try:
|
|
361
|
-
|
|
405
|
+
output_or_dones, _ = get_tool_output(
|
|
362
406
|
tool_parsed,
|
|
363
407
|
enc,
|
|
364
408
|
limit - cost,
|
|
@@ -366,45 +410,46 @@ System information:
|
|
|
366
410
|
max_tokens=8000,
|
|
367
411
|
)
|
|
368
412
|
except Exception as e:
|
|
369
|
-
|
|
370
|
-
f"GOT EXCEPTION while calling tool. Error: {e}"
|
|
371
|
-
|
|
413
|
+
output_or_dones = [
|
|
414
|
+
(f"GOT EXCEPTION while calling tool. Error: {e}")
|
|
415
|
+
]
|
|
372
416
|
tb = traceback.format_exc()
|
|
373
|
-
error_console.print(
|
|
374
|
-
|
|
375
|
-
if isinstance(
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
output
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
"
|
|
390
|
-
"
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
"data": output.data,
|
|
394
|
-
},
|
|
395
|
-
}
|
|
396
|
-
],
|
|
417
|
+
error_console.print(str(output_or_dones) + "\n" + tb)
|
|
418
|
+
|
|
419
|
+
if any(isinstance(x, DoneFlag) for x in output_or_dones):
|
|
420
|
+
return "", cost
|
|
421
|
+
|
|
422
|
+
tool_results_content: list[
|
|
423
|
+
TextBlockParam | ImageBlockParam
|
|
424
|
+
] = []
|
|
425
|
+
for output in output_or_dones:
|
|
426
|
+
assert not isinstance(output, DoneFlag)
|
|
427
|
+
if isinstance(output, ImageData):
|
|
428
|
+
tool_results_content.append(
|
|
429
|
+
{
|
|
430
|
+
"type": "image",
|
|
431
|
+
"source": {
|
|
432
|
+
"type": "base64",
|
|
433
|
+
"media_type": output.media_type,
|
|
434
|
+
"data": output.data,
|
|
435
|
+
},
|
|
436
|
+
}
|
|
397
437
|
)
|
|
398
|
-
)
|
|
399
438
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
439
|
+
else:
|
|
440
|
+
tool_results_content.append(
|
|
441
|
+
{
|
|
442
|
+
"type": "text",
|
|
443
|
+
"text": output,
|
|
444
|
+
},
|
|
406
445
|
)
|
|
446
|
+
tool_results.append(
|
|
447
|
+
ToolResultBlockParam(
|
|
448
|
+
type="tool_result",
|
|
449
|
+
tool_use_id=tc["id"],
|
|
450
|
+
content=tool_results_content,
|
|
407
451
|
)
|
|
452
|
+
)
|
|
408
453
|
else:
|
|
409
454
|
_histories.append(
|
|
410
455
|
{"role": "assistant", "content": full_response}
|
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
"""Computer Use Tool for Anthropic API"""
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import time
|
|
5
|
+
import shlex
|
|
6
|
+
import os
|
|
7
|
+
from abc import ABCMeta, abstractmethod
|
|
8
|
+
from dataclasses import dataclass, fields, replace
|
|
9
|
+
from enum import StrEnum
|
|
10
|
+
from typing import Any, Literal, TypedDict, Union, Optional
|
|
11
|
+
from uuid import uuid4
|
|
12
|
+
|
|
13
|
+
from anthropic.types.beta import BetaToolComputerUse20241022Param, BetaToolUnionParam
|
|
14
|
+
from .sys_utils import command_run
|
|
15
|
+
from ..types_ import (
|
|
16
|
+
Keyboard,
|
|
17
|
+
LeftClickDrag,
|
|
18
|
+
Mouse,
|
|
19
|
+
MouseMove,
|
|
20
|
+
ScreenShot,
|
|
21
|
+
GetScreenInfo,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Constants
# Directory that holds screenshot files (used both inside the container and
# as the host-side copy target).
OUTPUT_DIR = "/tmp/outputs"
# Per-keystroke delay handed to `xdotool type --delay`.
TYPING_DELAY_MS = 12
# Maximum number of characters sent per `xdotool type` invocation.
TYPING_GROUP_SIZE = 50
TRUNCATED_MESSAGE: str = (
    "<response clipped><NOTE>To save on context only part of this file "
    "has been shown to you.</NOTE>"
)

# Every action name understood by ComputerTool.__call__.
Action = Literal[
    # keyboard
    "key", "type",
    # pointer movement
    "mouse_move", "left_click", "left_click_drag",
    "right_click", "middle_click", "double_click",
    # inspection
    "screenshot", "cursor_position",
    # scrolling
    "scroll_up", "scroll_down",
    # session setup
    "get_screen_info",
]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Resolution(TypedDict):
    """Pixel dimensions of a display or of a scaling target."""

    width: int  # horizontal size in pixels
    height: int  # vertical size in pixels
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Candidate resolutions that screenshots/coordinates are scaled down to.
# Sizes above XGA/WXGA are not recommended.
MAX_SCALING_TARGETS: dict[str, Resolution] = {
    "XGA": {"width": 1024, "height": 768},  # 4:3
    "WXGA": {"width": 1280, "height": 800},  # 16:10
    "FWXGA": {"width": 1366, "height": 768},  # ~16:9
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class ScalingSource(StrEnum):
|
|
62
|
+
COMPUTER = "computer"
|
|
63
|
+
API = "api"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ComputerToolOptions(TypedDict):
|
|
67
|
+
display_height_px: int
|
|
68
|
+
display_width_px: int
|
|
69
|
+
display_number: int | None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass(kw_only=True, frozen=True)
|
|
73
|
+
class ToolResult:
|
|
74
|
+
"""Represents the result of a tool execution."""
|
|
75
|
+
|
|
76
|
+
output: str | None = None
|
|
77
|
+
error: str | None = None
|
|
78
|
+
base64_image: str | None = None
|
|
79
|
+
system: str | None = None
|
|
80
|
+
|
|
81
|
+
def __bool__(self) -> bool:
|
|
82
|
+
return any(getattr(self, field.name) for field in fields(self))
|
|
83
|
+
|
|
84
|
+
def __add__(self, other: "ToolResult") -> "ToolResult":
|
|
85
|
+
def combine_fields(
|
|
86
|
+
field: str | None, other_field: str | None, concatenate: bool = True
|
|
87
|
+
) -> str | None:
|
|
88
|
+
if field and other_field:
|
|
89
|
+
if concatenate:
|
|
90
|
+
return field + other_field
|
|
91
|
+
raise ValueError("Cannot combine tool results")
|
|
92
|
+
return field or other_field
|
|
93
|
+
|
|
94
|
+
return ToolResult(
|
|
95
|
+
output=combine_fields(self.output, other.output),
|
|
96
|
+
error=combine_fields(self.error, other.error),
|
|
97
|
+
base64_image=combine_fields(self.base64_image, other.base64_image, False),
|
|
98
|
+
system=combine_fields(self.system, other.system),
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
def replace(self, **kwargs: Any) -> "ToolResult":
|
|
102
|
+
"""Returns a new ToolResult with the given fields replaced."""
|
|
103
|
+
return replace(self, **kwargs)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class CLIResult(ToolResult):
    """Marker subclass: a ToolResult meant to be rendered as CLI output."""
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class ToolFailure(ToolResult):
    """Marker subclass: a ToolResult that stands for a failed tool call."""
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class ToolError(Exception):
    """Raised when a tool encounters an error.

    Attributes:
        message: Human-readable description of what went wrong.
    """

    def __init__(self, message: str) -> None:
        # Forward the message to Exception so that `args` and `str(exc)` are
        # populated even when the caller passes `message=` as a keyword.
        super().__init__(message)
        self.message = message
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def chunks(s: str, chunk_size: int) -> list[str]:
    """Split *s* into consecutive pieces of at most *chunk_size* characters."""
    pieces: list[str] = []
    for begin in range(0, len(s), chunk_size):
        pieces.append(s[begin : begin + chunk_size])
    return pieces
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class ComputerTool:
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse
    of a desktop running inside a docker container.

    Every action is executed through ``docker exec`` against ``docker_image_id``.
    ``get_screen_info`` must run first: it records the container id and the
    screen geometry that all other actions depend on.
    The tool parameters are defined by Anthropic and are not editable.
    """

    name: Literal["computer"] = "computer"
    api_type: Literal["computer_20241022"] = "computer_20241022"
    width: Optional[int]
    height: Optional[int]
    display_num: Optional[int]
    xdotool: Optional[str]
    docker_image_id: Optional[str]

    # Seconds to wait after an action before capturing the follow-up screenshot.
    _screenshot_delay = 0.5
    # When True, screenshots and coordinates are mapped to a MAX_SCALING_TARGETS size.
    _scaling_enabled = True

    def __init__(self) -> None:
        super().__init__()

        self.xdotool = None
        self.width = None
        self.height = None
        self.display_num = None
        self._display_prefix = ""
        self.docker_image_id = None

    def get_screen_info(self) -> tuple[int, int, Optional[int]]:
        """Read WIDTH, HEIGHT and DISPLAY_NUM from the container environment.

        Stores the values on the instance and derives the ``xdotool`` command
        prefix from the display number.

        Returns:
            ``(width, height, display_num)``; ``display_num`` is None when the
            container does not define DISPLAY_NUM.

        Raises:
            ToolError: if the command fails or its output cannot be parsed.
        """
        result = self.shell(
            "echo $WIDTH,$HEIGHT,$DISPLAY_NUM",
            take_screenshot=False,
        )
        if result.error:
            raise ToolError(result.error)
        if not result.output:
            raise ToolError("Could not get screen info")

        # Strip each piece: the echo output ends with a newline, which would
        # otherwise make an unset DISPLAY_NUM look like the non-empty value "\n".
        pieces = [piece.strip() for piece in result.output.strip().split(",")]
        if len(pieces) != 3:
            raise ToolError(f"Unexpected screen info: {result.output!r}")
        try:
            width, height, display_num = (
                int(piece) if piece else None for piece in pieces
            )
        except ValueError as exc:
            raise ToolError(f"Unexpected screen info: {result.output!r}") from exc

        # NOTE(review): these fallbacks describe a portrait 1080x1920 screen;
        # possibly width/height were meant the other way round - confirm.
        if width is None:
            width = 1080
        if height is None:
            height = 1920

        self.width = width
        self.height = height
        if display_num is not None:
            self.display_num = display_num
            self._display_prefix = f"DISPLAY=:{self.display_num} "
        else:
            self.display_num = None
            self._display_prefix = ""
        self.xdotool = f"{self._display_prefix}xdotool"
        return width, height, display_num

    def __call__(
        self,
        *,
        action: Action,
        docker_image_id: Optional[str] = None,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        **kwargs: Any,
    ) -> ToolResult:
        """Execute one computer-use action inside the container.

        Args:
            action: The action to run (see ``Action``).
            docker_image_id: Container id; required for ``get_screen_info``.
            text: Key names for ``key`` or literal text for ``type``.
            coordinate: Target ``(x, y)`` for ``mouse_move``/``left_click_drag``.
            **kwargs: Accepted and ignored.

        Returns:
            A ToolResult with the command output and, for most actions, a
            screenshot taken after the action.

        Raises:
            ToolError: on invalid arguments, or when screen info has not been
                fetched yet.
        """
        if action == "get_screen_info":
            if docker_image_id is None:
                raise ToolError("docker_image_id is required for get_screen_info")
            self.docker_image_id = docker_image_id
            self.get_screen_info()
            screenshot_res = self.screenshot()
            return ToolResult(
                output=f"width: {self.width}, height: {self.height}, display_num: {self.display_num}",
                error=screenshot_res.error,
                base64_image=screenshot_res.base64_image,
            )

        if self.width is None or self.height is None or self.docker_image_id is None:
            raise ToolError("Please first get screen info using get_screen_info tool")

        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
                raise ToolError(f"{coordinate} must be a tuple of length 2")
            if not all(isinstance(i, int) and i >= 0 for i in coordinate):
                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

            x, y = self.scale_coordinates(
                ScalingSource.API, coordinate[0], coordinate[1]
            )

            if action == "mouse_move":
                return self.shell(f"{self.xdotool} mousemove {x} {y}")
            elif action == "left_click_drag":
                return self.shell(
                    f"{self.xdotool} mousedown 1 mousemove {x} {y} mouseup 1",
                )

        if action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                raise ToolError(f"{text} must be a string")

            if action == "key":
                # Quote each key name on its own: space-separated key
                # sequences keep working, shell metacharacters are neutralised.
                keys = " ".join(shlex.quote(key) for key in text.split())
                return self.shell(f"{self.xdotool} key -- {keys}")
            elif action == "type":
                results: list[ToolResult] = []
                for chunk in chunks(text, TYPING_GROUP_SIZE):
                    cmd = f"{self.xdotool} type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}"
                    results.append(self.shell(cmd, take_screenshot=False))
                screenshot_base64 = self.screenshot().base64_image
                return ToolResult(
                    output="".join(result.output or "" for result in results),
                    error="".join(result.error or "" for result in results),
                    base64_image=screenshot_base64,
                )

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "middle_click",
            "screenshot",
            "cursor_position",
            "scroll_up",
            "scroll_down",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")

            if action == "screenshot":
                return self.screenshot()
            elif action == "cursor_position":
                result = self.shell(
                    f"{self.xdotool} getmouselocation --shell",
                    take_screenshot=False,
                )
                output = result.output or ""
                try:
                    raw_x = int(output.split("X=")[1].split("\n")[0])
                    raw_y = int(output.split("Y=")[1].split("\n")[0])
                except (IndexError, ValueError) as exc:
                    raise ToolError(
                        f"Could not parse cursor position from: {output!r}"
                    ) from exc
                x, y = self.scale_coordinates(ScalingSource.COMPUTER, raw_x, raw_y)
                return result.replace(output=f"X={x},Y={y}")
            else:
                if action in ("scroll_up", "scroll_down"):
                    # X11 maps the scroll wheel to mouse buttons 4 (up) and 5 (down).
                    button = "4" if action == "scroll_up" else "5"
                    return self.shell(
                        f"{self.xdotool} click --repeat 1 {button}",
                    )
                else:
                    click_arg = {
                        "left_click": "1",
                        "right_click": "3",
                        "middle_click": "2",
                        "double_click": "--repeat 2 --delay 500 1",
                    }[action]
                    return self.shell(f"{self.xdotool} click {click_arg}")

        raise ToolError(f"Invalid action: {action}")

    def screenshot(self) -> ToolResult:
        """Take a screenshot of the current screen and return the base64 encoded image.

        The image is captured with ``scrot`` inside the container, optionally
        resized with ``convert``, copied to the host with ``docker cp``, read,
        and then the host copy is removed.

        Raises:
            ToolError: if the screen size is unknown or the image could not be
                copied to the host.
        """
        if self.width is None or self.height is None or self.docker_image_id is None:
            self.get_screen_info()
        if not (self.width and self.height):
            raise ToolError("Could not determine screen size for screenshot")

        self.shell(
            command=f"mkdir -p {OUTPUT_DIR}",
            take_screenshot=False,
        )
        path = f"{OUTPUT_DIR}/screenshot_{uuid4().hex}.png"

        screenshot_cmd = f"{self._display_prefix}scrot -f {path} -p"

        self.shell(screenshot_cmd, take_screenshot=False)

        if self._scaling_enabled:
            x, y = self.scale_coordinates(
                ScalingSource.COMPUTER, self.width, self.height
            )
            self.shell(
                f"convert {path} -resize {x}x{y}! {path}",
                take_screenshot=False,
            )

        # The host-side target directory must exist before `docker cp` can write into it.
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        # Copy file from docker to tmp
        _, _, stderr = command_run(
            f"docker cp {shlex.quote(f'{self.docker_image_id}:{path}')} {shlex.quote(path)}",
            truncate_after=None,
        )

        if not os.path.exists(path):
            raise ToolError(f"Failed to take screenshot: {stderr}")

        try:
            with open(path, "rb") as f:
                base64_image = base64.b64encode(f.read()).decode("utf-8")
        finally:
            # The image travels back as base64; drop the host copy so
            # screenshots do not pile up in OUTPUT_DIR.
            try:
                os.remove(path)
            except OSError:
                pass

        return ToolResult(output="", error=stderr, base64_image=base64_image)

    def shell(self, command: str, take_screenshot: bool = True) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot.

        The command runs as ``sh -c <command>`` inside the container via
        ``docker exec``.

        Raises:
            ToolError: if no container id has been set yet.
        """
        if self.docker_image_id is None:
            raise ToolError("Please first get screen info using get_screen_info tool")

        # Quote the whole command as a single argument so that quotes inside it
        # (e.g. the shlex-quoted text of the `type` action) cannot terminate
        # the `sh -c` argument early. Assumes command_run parses its argument
        # with POSIX shell rules, as the original single-quoted form did.
        _, stdout, stderr = command_run(
            f"docker exec {shlex.quote(self.docker_image_id)} sh -c {shlex.quote(command)}"
        )
        base64_image = None

        if take_screenshot:
            # delay to let things settle before taking a screenshot
            time.sleep(self._screenshot_delay)
            base64_image = self.screenshot().base64_image

        return ToolResult(output=stdout, error=stderr, base64_image=base64_image)

    def scale_coordinates(
        self, source: ScalingSource, x: int, y: int
    ) -> tuple[int, int]:
        """Scale coordinates to a target maximum resolution.

        Args:
            source: ``ScalingSource.API`` to map scaled-down coordinates up to
                the real screen; ``ScalingSource.COMPUTER`` to map real-screen
                coordinates down.
            x: Horizontal coordinate.
            y: Vertical coordinate.

        Returns:
            The mapped ``(x, y)``; unchanged when scaling is disabled or no
            target with a matching aspect ratio is smaller than the screen.

        Raises:
            ToolError: if screen info is missing or API coordinates are out of bounds.
        """

        if self.width is None or self.height is None:
            raise ToolError("Please first get screen info using get_screen_info tool")

        if not self._scaling_enabled:
            return x, y
        ratio = self.width / self.height
        target_dimension = None
        for dimension in MAX_SCALING_TARGETS.values():
            # allow some error in the aspect ratio - not all ratios are exactly 16:9
            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
                if dimension["width"] < self.width:
                    target_dimension = dimension
                # Only the first target with a matching aspect ratio is considered.
                break
        if target_dimension is None:
            return x, y
        # should be less than 1
        x_scaling_factor = target_dimension["width"] / self.width
        y_scaling_factor = target_dimension["height"] / self.height
        if source == ScalingSource.API:
            if x > self.width or y > self.height:
                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
            # scale up
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down
        return round(x * x_scaling_factor), round(y * y_scaling_factor)
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
# Shared module-level tool instance; it remembers the container id and screen
# geometry between calls.
Computer = ComputerTool()
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def run_computer_tool(
    action: Union[Keyboard, Mouse, ScreenShot, GetScreenInfo],
) -> tuple[str, str]:
    """Dispatch a parsed tool request to the shared ``Computer`` instance.

    Args:
        action: The request model; its type selects the computer action.

    Returns:
        ``(output, image)`` where ``output`` summarises stdout/stderr and
        ``image`` is the base64 screenshot, or "" when there is none.

    Raises:
        ToolError: if ``action`` is not one of the supported request types
            (previously this surfaced as an UnboundLocalError).
    """
    if isinstance(action, GetScreenInfo):
        result = Computer(
            action="get_screen_info", docker_image_id=action.docker_image_id
        )
    elif isinstance(action, ScreenShot):
        result = Computer(
            action="screenshot",
        )
    elif isinstance(action, Keyboard):
        result = Computer(
            action=action.action,
            text=action.text,
        )
    elif isinstance(action, Mouse):
        if isinstance(action.action, MouseMove):
            result = Computer(
                action="mouse_move",
                coordinate=(action.action.x, action.action.y),
            )
        elif isinstance(action.action, LeftClickDrag):
            result = Computer(
                action="left_click_drag",
                coordinate=(action.action.x, action.action.y),
            )
        else:
            # Remaining mouse actions are plain clicks identified by button type.
            result = Computer(action=action.action.button_type)
    else:
        raise ToolError(f"Unsupported computer action: {type(action).__name__}")

    output = f"stdout: {result.output or ''}, stderr: {result.error or ''}"
    image = result.base64_image or ""
    return output, image
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Claude desktop support
|
|
2
|
+
|
|
3
|
+
## Setup
|
|
4
|
+
|
|
5
|
+
Install xdotool
|
|
6
|
+
|
|
7
|
+
```sh
|
|
8
|
+
brew install xdotool
|
|
9
|
+
|
|
10
|
+
# On macos:
|
|
11
|
+
defaults write org.x.X11 enable_test_extensions -boolean true
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Update `claude_desktop_config.json` (~/Library/Application Support/Claude/claude_desktop_config.json)
|
|
15
|
+
|
|
16
|
+
```json
|
|
17
|
+
{
|
|
18
|
+
"mcpServers": {
|
|
19
|
+
"wcgw": {
|
|
20
|
+
"command": "uvx",
|
|
21
|
+
"args": ["--from", "wcgw@latest", "wcgw_mcp"]
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Then restart the Claude app.
|
|
28
|
+
|
|
29
|
+
### Computer use support using desktop on docker
|
|
30
|
+
|
|
31
|
+
Controlling the host system isn't possible yet, but you can connect to a Docker container that runs a Linux OS with a desktop.
|
|
32
|
+
|
|
33
|
+
First run a sample docker image with desktop and optionally VNC connection:
|
|
34
|
+
|
|
35
|
+
```sh
|
|
36
|
+
docker run \
|
|
37
|
+
--entrypoint "" \
|
|
38
|
+
-p 6080:6080 \
|
|
39
|
+
-e WIDTH=1024 \
|
|
40
|
+
-e HEIGHT=768 \
|
|
41
|
+
-d \
|
|
42
|
+
ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest \
|
|
43
|
+
bash -c "\
|
|
44
|
+
./start_all.sh && \
|
|
45
|
+
./novnc_startup.sh && \
|
|
46
|
+
python http_server.py > /tmp/server_logs.txt 2>&1 & \
|
|
47
|
+
tail -f /dev/null"
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Connect to `http://localhost:6080/vnc.html` for a desktop view (VNC) of the system running in the Docker container.
|
|
51
|
+
|
|
52
|
+
## Usage
|
|
53
|
+
|
|
54
|
+
Wait for a few seconds. You should be able to see this icon if everything goes right.
|
|
55
|
+
|
|
56
|
+

|
|
57
|
+
over here
|
|
58
|
+
|
|
59
|
+

|
|
60
|
+
|
|
61
|
+
Then ask claude to execute shell commands, read files, edit files, run your code, etc.
|
|
62
|
+
|
|
63
|
+
If you've started the Docker container for the LLM to access, you can ask it to control the "docker os". If you don't provide the Docker container id, it will try to find a running container using the `docker ps` command.
|
|
64
|
+
|
|
65
|
+
## Example
|
|
66
|
+
|
|
67
|
+
### Computer use example
|
|
68
|
+
|
|
69
|
+

|
|
70
|
+
|
|
71
|
+
### Shell example
|
|
72
|
+
|
|
73
|
+

|