wcgw 2.8.10__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wcgw might be problematic. Click here for more details.

@@ -1,435 +0,0 @@
1
- """Computer Use Tool for Anthropic API"""
2
-
3
- import base64
4
- import time
5
- import shlex
6
- import os
7
- from abc import ABCMeta, abstractmethod
8
- from dataclasses import dataclass, fields, replace
9
- from enum import StrEnum
10
- from typing import Any, Literal, TypedDict, Union, Optional
11
- from uuid import uuid4
12
-
13
- from anthropic.types.beta import BetaToolComputerUse20241022Param, BetaToolUnionParam
14
- from .sys_utils import command_run
15
- from ..types_ import (
16
- Keyboard,
17
- LeftClickDrag,
18
- Mouse,
19
- MouseMove,
20
- ScreenShot,
21
- GetScreenInfo,
22
- )
23
-
24
-
25
# Constants
# Directory in which ComputerTool.screenshot creates screenshot files.
OUTPUT_DIR = "/tmp/outputs"
# Value passed to `xdotool type --delay` (milliseconds between keystrokes).
TYPING_DELAY_MS = 12
# Maximum number of characters sent in a single `xdotool type` invocation.
TYPING_GROUP_SIZE = 50
# Upper bound (seconds) applied to a caller-requested pre-screenshot delay.
SLEEP_TIME_MAX_S = 3

# Names of the actions accepted by ComputerTool.__call__.
Action = Literal[
    "key",
    "type",
    "mouse_move",
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "double_click",
    "screenshot",
    "cursor_position",
    "scroll_up",
    "scroll_down",
    "get_screen_info",
]
46
-
47
-
48
class Resolution(TypedDict):
    """A screen size expressed as a width/height pair."""

    width: int
    height: int
51
-
52
-
53
# Sizes above XGA/WXGA are not recommended
# Candidate resolutions that ComputerTool.scale_coordinates may scale down to;
# a target is chosen by matching its aspect ratio against the real screen.
MAX_SCALING_TARGETS: dict[str, Resolution] = {
    "XGA": Resolution(width=1024, height=768),  # 4:3
    "WXGA": Resolution(width=1280, height=800),  # 16:10
    "FWXGA": Resolution(width=1366, height=768),  # ~16:9
}
59
-
60
-
61
class ScalingSource(StrEnum):
    """Origin of a coordinate pair handed to ComputerTool.scale_coordinates.

    COMPUTER coordinates are multiplied by the scaling factor (scaled down);
    API coordinates are bounds-checked and divided by it (scaled up).
    """

    COMPUTER = "computer"
    API = "api"
64
-
65
-
66
class ComputerToolOptions(TypedDict):
    """Display description: pixel dimensions plus an optional display number.

    NOTE(review): not referenced by any other code visible in this module.
    """

    display_height_px: int
    display_width_px: int
    display_number: int | None
70
-
71
-
72
- @dataclass(kw_only=True, frozen=True)
73
- class ToolResult:
74
- """Represents the result of a tool execution."""
75
-
76
- output: str | None = None
77
- error: str | None = None
78
- base64_image: str | None = None
79
- system: str | None = None
80
-
81
- def __bool__(self) -> bool:
82
- return any(getattr(self, field.name) for field in fields(self))
83
-
84
- def __add__(self, other: "ToolResult") -> "ToolResult":
85
- def combine_fields(
86
- field: str | None, other_field: str | None, concatenate: bool = True
87
- ) -> str | None:
88
- if field and other_field:
89
- if concatenate:
90
- return field + other_field
91
- raise ValueError("Cannot combine tool results")
92
- return field or other_field
93
-
94
- return ToolResult(
95
- output=combine_fields(self.output, other.output),
96
- error=combine_fields(self.error, other.error),
97
- base64_image=combine_fields(self.base64_image, other.base64_image, False),
98
- system=combine_fields(self.system, other.system),
99
- )
100
-
101
- def replace(self, **kwargs: Any) -> "ToolResult":
102
- """Returns a new ToolResult with the given fields replaced."""
103
- return replace(self, **kwargs)
104
-
105
-
106
class CLIResult(ToolResult):
    """A ToolResult that can be rendered as a CLI output.

    Adds no fields or behaviour; it only provides a distinct type.
    """

    pass
110
-
111
-
112
class ToolFailure(ToolResult):
    """A ToolResult that represents a failure.

    Adds no fields or behaviour; it only provides a distinct type.
    """

    pass
116
-
117
-
118
class ToolError(Exception):
    """Raised when a tool encounters an error.

    Attributes:
        message: Human-readable description of the failure.
    """

    def __init__(self, message: str) -> None:
        # Forward the message to Exception so that str(exc), exc.args and
        # tracebacks show it; without this call they are empty.
        super().__init__(message)
        self.message = message
123
-
124
-
125
def chunks(s: str, chunk_size: int) -> list[str]:
    """Split ``s`` into consecutive pieces of at most ``chunk_size`` characters.

    The final piece may be shorter; an empty string yields an empty list.
    """
    pieces: list[str] = []
    for start in range(0, len(s), chunk_size):
        stop = start + chunk_size
        pieces.append(s[start:stop])
    return pieces
127
-
128
-
129
class ComputerTool:
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
    The tool parameters are defined by Anthropic and are not editable.

    Every command is executed through ``docker exec`` against the container
    recorded in ``docker_image_id``, and screenshots are copied back with
    ``docker cp``. The ``get_screen_info`` action must be issued first: it
    stores the container id and the screen geometry that all other actions
    require.
    """

    name: Literal["computer"] = "computer"
    width: Optional[int]
    height: Optional[int]
    display_num: Optional[int]
    xdotool: Optional[str]
    docker_image_id: Optional[str]

    # Seconds to wait after a command before the follow-up screenshot.
    _screenshot_delay = 0.5
    # When True, coordinates/screenshots are mapped to a MAX_SCALING_TARGETS size.
    _scaling_enabled = True

    def __init__(self) -> None:
        super().__init__()

        self.xdotool = None
        self.width = None
        self.height = None
        self.display_num = None
        self._display_prefix = ""
        self.docker_image_id = None

    @staticmethod
    def _wait_before_capture(delay_seconds: float | None) -> None:
        """Sleep for the requested delay, clamped to [0, SLEEP_TIME_MAX_S]."""
        if delay_seconds is None:
            return
        # Clamp the lower bound too: time.sleep rejects negative values.
        time.sleep(max(0, min(delay_seconds, SLEEP_TIME_MAX_S)))

    def _container_ref(self) -> str:
        """Return the container id quoted for safe use in a shell command."""
        return shlex.quote(str(self.docker_image_id))

    def get_screen_info(self) -> tuple[int, int, Optional[int]]:
        """Read WIDTH, HEIGHT and DISPLAY_NUM from the container environment.

        Stores the values on the instance and derives the ``DISPLAY=:<n>``
        prefix and the ``xdotool`` command string from them.

        Returns:
            ``(width, height, display_num)``; ``display_num`` is None when
            the variable is unset or empty.

        Raises:
            AssertionError: if the command reports an error or prints nothing.
        """
        result = self.shell(
            "echo $WIDTH,$HEIGHT,$DISPLAY_NUM",
            take_screenshot=False,
        )
        assert not result.error, result.error
        assert result.output, "Could not get screen info"
        width, height, display_num = map(
            lambda x: None if not x else int(x), result.output.strip().split(",")
        )
        # NOTE(review): the fallbacks describe a 1080x1920 (portrait) screen;
        # confirm that width/height are not swapped.
        if width is None:
            width = 1080
        if height is None:
            height = 1920

        self.width = width
        self.height = height
        self.display_num = display_num
        if display_num is not None:
            self._display_prefix = f"DISPLAY=:{self.display_num} "
        else:
            self._display_prefix = ""
        self.xdotool = f"{self._display_prefix}xdotool"
        return width, height, display_num

    def __call__(
        self,
        *,
        action: Action,
        docker_image_id: Optional[str] = None,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        do_left_click_on_move: bool | None = None,
        take_after_delay_seconds: int | None = None,
        **kwargs: Any,
    ) -> ToolResult:
        """Execute one action and return its result.

        Args:
            action: Which action to perform (see ``Action``).
            docker_image_id: Container to operate on; required for
                ``get_screen_info`` and remembered for later calls.
            text: Key names for ``key`` or literal text for ``type``.
            coordinate: ``(x, y)`` target for ``mouse_move`` and
                ``left_click_drag``.
            do_left_click_on_move: When truthy, ``mouse_move`` also clicks
                button 1 at the destination.
            take_after_delay_seconds: Delay before the screenshot taken by
                ``get_screen_info`` or ``screenshot``; capped at
                ``SLEEP_TIME_MAX_S``.
            **kwargs: Ignored, except ``screenshot_delay`` which is honoured
                as a legacy alias of ``take_after_delay_seconds`` for the
                ``screenshot`` action.

        Raises:
            ToolError: on invalid arguments, an unknown action, or when
                ``get_screen_info`` has not been called yet.
        """
        if action == "get_screen_info":
            assert docker_image_id is not None
            self.docker_image_id = docker_image_id
            self.get_screen_info()
            self._wait_before_capture(take_after_delay_seconds)
            screenshot_res = self.screenshot()
            return ToolResult(
                output=f"width: {self.width}, height: {self.height}, display_num: {self.display_num}",
                error=screenshot_res.error,
                base64_image=screenshot_res.base64_image,
            )

        if self.width is None or self.height is None or self.docker_image_id is None:
            raise ToolError("Please first get screen info using get_screen_info tool")

        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
                raise ToolError(f"{coordinate} must be a tuple of length 2")
            if not all(isinstance(i, int) and i >= 0 for i in coordinate):
                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

            x, y = self.scale_coordinates(
                ScalingSource.API, coordinate[0], coordinate[1]
            )

            if action == "mouse_move":
                if not do_left_click_on_move:
                    return self.shell(f"{self.xdotool} mousemove {x} {y}")
                return self.shell(f"{self.xdotool} mousemove {x} {y} click 1")
            return self.shell(
                f"{self.xdotool} mousedown 1 mousemove {x} {y} mouseup 1",
            )

        if action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                # ToolError takes the message positionally; passing it as
                # `output=` raised TypeError instead of the intended error.
                raise ToolError(f"{text} must be a string")

            if action == "key":
                # Quote each key name separately: multi-key sequences still
                # reach xdotool as separate arguments, but shell
                # metacharacters in the text are no longer interpreted.
                keys = " ".join(shlex.quote(part) for part in text.split())
                return self.shell(f"{self.xdotool} key -- {keys}")

            results: list[ToolResult] = []
            all_lines = text.splitlines()
            for i, line in enumerate(all_lines):
                for chunk in chunks(line, TYPING_GROUP_SIZE):
                    cmd = f"{self.xdotool} type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}"
                    results.append(self.shell(cmd, take_screenshot=False))
                # Press Return between lines, but not after the last one.
                if i < len(all_lines) - 1:
                    results.append(
                        self.shell(f"{self.xdotool} key Return", take_screenshot=False)
                    )
            screenshot_base64 = self.screenshot().base64_image
            return ToolResult(
                output="".join(result.output or "" for result in results),
                error="".join(result.error or "" for result in results),
                base64_image=screenshot_base64,
            )

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "middle_click",
            "screenshot",
            "cursor_position",
            "scroll_up",
            "scroll_down",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")

            if action == "screenshot":
                # The requested delay used to be ignored for this action;
                # `screenshot_delay` is the keyword run_computer_tool passed.
                delay = take_after_delay_seconds
                if delay is None:
                    delay = kwargs.get("screenshot_delay")
                self._wait_before_capture(delay)
                return self.screenshot()

            if action == "cursor_position":
                result = self.shell(
                    f"{self.xdotool} getmouselocation --shell",
                    take_screenshot=False,
                )
                output = result.output or ""
                if "X=" not in output or "Y=" not in output:
                    raise ToolError(
                        f"Could not read cursor position: {result.error or output}"
                    )
                x, y = self.scale_coordinates(
                    ScalingSource.COMPUTER,
                    int(output.split("X=")[1].split("\n")[0]),
                    int(output.split("Y=")[1].split("\n")[0]),
                )
                return result.replace(output=f"X={x},Y={y}")

            if action in ("scroll_up", "scroll_down"):
                # Buttons 4 and 5 are the wheel-up / wheel-down buttons.
                button = "4" if action == "scroll_up" else "5"
                return self.shell(f"{self.xdotool} click --repeat 1 {button}")

            click_arg = {
                "left_click": "1",
                "right_click": "3",
                "middle_click": "2",
                "double_click": "--repeat 2 --delay 500 1",
            }[action]
            return self.shell(f"{self.xdotool} click {click_arg}")

        raise ToolError(f"Invalid action: {action}")

    def screenshot(self) -> ToolResult:
        """Take a screenshot of the current screen and return the base64 encoded image.

        The image is captured with ``scrot`` inside the container, optionally
        resized with ``convert``, copied to the same path locally with
        ``docker cp`` and read back.

        Raises:
            ToolError: if the copied file does not exist locally afterwards.
        """
        if self.width is None or self.height is None or self.docker_image_id is None:
            self.get_screen_info()
        assert self.width and self.height

        self.shell(command=f"mkdir -p {OUTPUT_DIR}", take_screenshot=False)
        path = f"{OUTPUT_DIR}/screenshot_{uuid4().hex}.png"

        self.shell(f"{self._display_prefix}scrot -f {path} -p", take_screenshot=False)

        if self._scaling_enabled:
            x, y = self.scale_coordinates(
                ScalingSource.COMPUTER, self.width, self.height
            )
            self.shell(
                f"convert {path} -resize {x}x{y}! {path}",
                take_screenshot=False,
            )

        # The `mkdir -p` above ran inside the container only; the local
        # destination directory must exist as well for `docker cp` to succeed.
        try:
            os.makedirs(OUTPUT_DIR, exist_ok=True)
        except OSError as exc:
            raise ToolError(f"Failed to take screenshot: {exc}") from exc

        # Copy file from docker to tmp
        _, _, stderr = command_run(
            f"docker cp {self._container_ref()}:{path} {path}",
            truncate_after=None,
        )

        if not os.path.exists(path):
            raise ToolError(f"Failed to take screenshot: {stderr}")

        with open(path, "rb") as f:
            base64_image = base64.b64encode(f.read()).decode("utf-8")
        return ToolResult(output="", error=stderr, base64_image=base64_image)

    def shell(self, command: str, take_screenshot: bool = True) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot.

        The command runs as ``bash -c <command>`` via ``docker exec`` in the
        recorded container.
        """
        escaped_command = shlex.quote(command)
        _, stdout, stderr = command_run(
            f"docker exec {self._container_ref()} bash -c {escaped_command}",
        )
        base64_image = None

        if take_screenshot:
            # delay to let things settle before taking a screenshot
            time.sleep(self._screenshot_delay)
            base64_image = self.screenshot().base64_image

        return ToolResult(output=stdout, error=stderr, base64_image=base64_image)

    def scale_coordinates(
        self, source: ScalingSource, x: int, y: int
    ) -> tuple[int, int]:
        """Scale coordinates to a target maximum resolution.

        Returns ``(x, y)`` unchanged when scaling is disabled or when no
        entry of ``MAX_SCALING_TARGETS`` both matches the screen's aspect
        ratio and is narrower than the screen.

        Raises:
            ToolError: if the screen size is unknown, or an API coordinate
                exceeds the screen size.
        """
        if self.width is None or self.height is None:
            raise ToolError("Please first get screen info using get_screen_info tool")

        if not self._scaling_enabled:
            return x, y
        ratio = self.width / self.height
        target_dimension = None
        for dimension in MAX_SCALING_TARGETS.values():
            # allow some error in the aspect ratio - not all ratios are exactly 16:9
            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
                if dimension["width"] < self.width:
                    target_dimension = dimension
                # NOTE(review): indentation was lost in the source; the loop
                # is assumed to stop at the first aspect-ratio match.
                break
        if target_dimension is None:
            return x, y
        # should be less than 1
        x_scaling_factor = target_dimension["width"] / self.width
        y_scaling_factor = target_dimension["height"] / self.height
        if source == ScalingSource.API:
            if x > self.width or y > self.height:
                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
            # scale up
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down
        return round(x * x_scaling_factor), round(y * y_scaling_factor)
396
-
397
-
398
# Module-level ComputerTool instance used by run_computer_tool; it keeps the
# container id and screen geometry between calls.
Computer = ComputerTool()
399
-
400
-
401
def run_computer_tool(
    action: Union[Keyboard, Mouse, ScreenShot, GetScreenInfo],
) -> tuple[str, str]:
    """Dispatch a typed action object to the shared ``Computer`` tool.

    Args:
        action: The request to execute. ``Mouse`` wraps either a
            ``MouseMove``, a ``LeftClickDrag`` or an object exposing
            ``button_type``.

    Returns:
        ``(output, image)``: ``output`` combines the result's stdout and
        stderr text, and ``image`` is the base64 screenshot or ``""`` when
        there is none.

    Raises:
        ToolError: if ``action`` is not one of the supported types.
    """
    if isinstance(action, GetScreenInfo):
        result = Computer(
            action="get_screen_info", docker_image_id=action.docker_image_id
        )
    elif isinstance(action, ScreenShot):
        # Use the keyword ComputerTool.__call__ actually declares; the former
        # `screenshot_delay` matched no parameter and fell into **kwargs.
        result = Computer(
            action="screenshot",
            take_after_delay_seconds=action.take_after_delay_seconds,
        )
    elif isinstance(action, Keyboard):
        result = Computer(
            action=action.action,
            text=action.text,
        )
    elif isinstance(action, Mouse):
        mouse_action = action.action
        if isinstance(mouse_action, MouseMove):
            result = Computer(
                action="mouse_move",
                coordinate=(mouse_action.x, mouse_action.y),
                do_left_click_on_move=mouse_action.do_left_click_on_move,
            )
        elif isinstance(mouse_action, LeftClickDrag):
            result = Computer(
                action="left_click_drag",
                coordinate=(mouse_action.x, mouse_action.y),
            )
        else:
            result = Computer(action=mouse_action.button_type)
    else:
        # Previously this fell through and failed on the unbound `result`.
        raise ToolError(f"Unsupported action type: {type(action).__name__}")

    output = f"stdout: {result.output or ''}, stderr: {result.error or ''}"
    image = result.base64_image or ""
    return output, image
wcgw/client/sys_utils.py DELETED
@@ -1,41 +0,0 @@
1
- import subprocess
2
-
3
# Default number of characters kept by maybe_truncate / command_run.
MAX_RESPONSE_LEN: int = 16000
# Notice appended to content that has been cut off.
TRUNCATED_MESSAGE: str = "<response clipped><NOTE>To save on context only part of this file has been shown to you.</NOTE>"
5
-
6
-
7
def maybe_truncate(content: str, truncate_after: int | None = MAX_RESPONSE_LEN) -> str:
    """Clip ``content`` to ``truncate_after`` characters, appending a notice.

    Content is returned untouched when ``truncate_after`` is falsy (``None``
    or ``0``) or when it already fits within the limit.
    """
    if truncate_after and len(content) > truncate_after:
        return content[:truncate_after] + TRUNCATED_MESSAGE
    return content
14
-
15
-
16
def command_run(
    cmd: str,
    timeout: float | None = 3.0,  # seconds
    truncate_after: int | None = MAX_RESPONSE_LEN,
    text: bool = True,
) -> tuple[int, str, str]:
    """Run a shell command synchronously with a timeout.

    Args:
        cmd: Command line, executed through the shell (``shell=True``).
        timeout: Seconds to wait for completion; ``None`` waits indefinitely.
        truncate_after: Maximum length of each returned stream, see
            ``maybe_truncate``.
        text: Passed to ``subprocess.Popen``; the declared return type
            assumes the default ``True`` (string output).

    Returns:
        ``(returncode, stdout, stderr)`` with both streams truncated.

    Raises:
        TimeoutError: if the command does not finish within ``timeout``.
    """
    # The context manager closes the pipes and waits for the child on exit,
    # so a timed-out (killed) process is reaped instead of left as a zombie
    # with open file descriptors.
    with subprocess.Popen(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=text,
    ) as process:
        try:
            stdout, stderr = process.communicate(timeout=timeout)
        except subprocess.TimeoutExpired as exc:
            process.kill()
            raise TimeoutError(
                f"Command '{cmd}' timed out after {timeout} seconds"
            ) from exc
    return (
        process.returncode or 0,
        maybe_truncate(stdout, truncate_after=truncate_after),
        maybe_truncate(stderr, truncate_after=truncate_after),
    )
File without changes