llms-py 3.0.13__py3-none-any.whl → 3.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,523 @@
1
+ import asyncio
2
+ import base64
3
+ import os
4
+ import shlex
5
+ import shutil
6
+ from enum import StrEnum
7
+ from pathlib import Path
8
+ from typing import Annotated, Any, Literal, TypedDict, get_args
9
+ from uuid import uuid4
10
+
11
+ from .base import BaseTool, ToolError, ToolResult
12
+ from .run import run
13
+
14
# Directory that receives screenshot and zoom-crop image files.
OUTPUT_DIR = "/tmp/outputs"

# Parameters for the "type" action: the per-keystroke delay handed to
# ``xdotool type --delay`` and the number of characters sent per invocation.
TYPING_DELAY_MS, TYPING_GROUP_SIZE = 12, 50
18
+
19
+ Action_20241022 = Literal[
20
+ "key",
21
+ "type",
22
+ "mouse_move",
23
+ "left_click",
24
+ "left_click_drag",
25
+ "right_click",
26
+ "middle_click",
27
+ "double_click",
28
+ "screenshot",
29
+ "cursor_position",
30
+ ]
31
+
32
+ Action_20250124 = (
33
+ Action_20241022
34
+ | Literal[
35
+ "left_mouse_down",
36
+ "left_mouse_up",
37
+ "scroll",
38
+ "hold_key",
39
+ "wait",
40
+ "triple_click",
41
+ ]
42
+ )
43
+
44
+ Action_20251124 = Action_20250124 | Literal["zoom"]
45
+
46
+ ScrollDirection = Literal["up", "down", "left", "right"]
47
+
48
+
49
class Resolution(TypedDict):
    """Width and height of a screen resolution, both as ints."""

    width: int
    height: int
52
+
53
+
54
# Sizes above XGA/WXGA are not recommended (see README.md).
# When ComputerTool._scaling_enabled is set, the screen is scaled down to
# whichever of these targets matches its aspect ratio.
MAX_SCALING_TARGETS: dict[str, Resolution] = {
    label: Resolution(width=target_width, height=target_height)
    for label, (target_width, target_height) in (
        ("XGA", (1024, 768)),  # 4:3
        ("WXGA", (1280, 800)),  # 16:10
        ("FWXGA", (1366, 768)),  # ~16:9
    )
}
61
+
62
# Argument text appended to ``xdotool click`` for each click action: either a
# bare button number or repeat/delay options followed by button 1.
CLICK_BUTTONS = dict(
    left_click=1,
    right_click=3,
    middle_click=2,
    double_click="--repeat 2 --delay 10 1",
    triple_click="--repeat 3 --delay 10 1",
)
69
+
70
+
71
+ class ScalingSource(StrEnum):
72
+ COMPUTER = "computer"
73
+ API = "api"
74
+
75
+
76
+ class ComputerToolOptions(TypedDict):
77
+ display_height_px: int
78
+ display_width_px: int
79
+ display_number: int | None
80
+
81
+
82
def chunks(s: str, chunk_size: int) -> list[str]:
    """Split ``s`` into consecutive pieces of at most ``chunk_size`` characters."""
    pieces: list[str] = []
    for start in range(0, len(s), chunk_size):
        pieces.append(s[start : start + chunk_size])
    return pieces
84
+
85
+
86
class BaseComputerTool:
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
    The tool parameters are defined by Anthropic and are not editable.
    """

    name: Literal["computer"] = "computer"
    # Real screen size in pixels, read from the WIDTH / HEIGHT environment variables.
    width: int
    height: int
    # X display number from DISPLAY_NUM, or None when that variable is unset.
    display_num: int | None

    # Seconds to wait after a shell command before taking the follow-up screenshot.
    _screenshot_delay = 2.0
    # When true, coordinates and screenshots are scaled to a MAX_SCALING_TARGETS entry.
    _scaling_enabled = True

    @property
    def options(self) -> ComputerToolOptions:
        """Return the display size (after scaling) and display number for this tool."""
        width, height = self.scale_coordinates(ScalingSource.COMPUTER, self.width, self.height)
        return {
            "display_width_px": width,
            "display_height_px": height,
            "display_number": self.display_num,
        }

    def __init__(self):
        """Read WIDTH, HEIGHT and DISPLAY_NUM from the environment and build the xdotool prefix.

        Raises:
            AssertionError: If WIDTH or HEIGHT is unset or zero.
        """
        super().__init__()

        self.width = int(os.getenv("WIDTH") or 0)
        self.height = int(os.getenv("HEIGHT") or 0)
        assert self.width and self.height, "WIDTH, HEIGHT must be set"
        if (display_num := os.getenv("DISPLAY_NUM")) is not None:
            self.display_num = int(display_num)
            self._display_prefix = f"DISPLAY=:{self.display_num} "
        else:
            self.display_num = None
            self._display_prefix = ""

        self.xdotool = f"{self._display_prefix}xdotool"

    async def __call__(
        self,
        *,
        action: Action_20241022,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        start_coordinate: tuple[int, int] | None = None,
        **kwargs,
    ):
        """Execute one action and return its result.

        Args:
            action: The action to perform.
            text: Key sequence for "key" or literal text for "type".
            coordinate: Target (x, y) in API coordinates for "mouse_move" and
                the end point of "left_click_drag".
            start_coordinate: Start (x, y) in API coordinates for "left_click_drag".
            **kwargs: Ignored.

        Returns:
            A ToolResult with command output and, for most actions, a screenshot.

        Raises:
            ToolError: If arguments are missing, not accepted for the action,
                malformed, or the action is unknown.
        """
        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")

            if action == "left_click_drag":
                if start_coordinate is None:
                    raise ToolError(f"start_coordinate is required for {action}")
                start_x, start_y = self.validate_and_get_coordinates(start_coordinate)
                end_x, end_y = self.validate_and_get_coordinates(coordinate)
                command_parts = [
                    self.xdotool,
                    f"mousemove --sync {start_x} {start_y} mousedown 1 mousemove --sync {end_x} {end_y} mouseup 1",
                ]
                return await self.shell(" ".join(command_parts))
            elif action == "mouse_move":
                x, y = self.validate_and_get_coordinates(coordinate)
                command_parts = [self.xdotool, f"mousemove --sync {x} {y}"]
                return await self.shell(" ".join(command_parts))

        if action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                # Positional message, matching every other ToolError raised here.
                raise ToolError(f"{text} must be a string")

            if action == "key":
                # The text goes into a shell command line, so quote each
                # whitespace-separated key to keep shell metacharacters inert
                # while still allowing several keys in one call.
                quoted_keys = " ".join(shlex.quote(part) for part in text.split())
                command_parts = [self.xdotool, f"key -- {quoted_keys}"]
                return await self.shell(" ".join(command_parts))
            elif action == "type":
                # Type in groups so each xdotool invocation stays short; only
                # one screenshot is taken, after the last group.
                results: list[ToolResult] = []
                for chunk in chunks(text, TYPING_GROUP_SIZE):
                    command_parts = [
                        self.xdotool,
                        f"type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}",
                    ]
                    results.append(await self.shell(" ".join(command_parts), take_screenshot=False))
                screenshot_base64 = (await self.screenshot()).base64_image
                return ToolResult(
                    output="".join(result.output or "" for result in results),
                    error="".join(result.error or "" for result in results),
                    base64_image=screenshot_base64,
                )

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "middle_click",
            "screenshot",
            "cursor_position",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")

            if action == "screenshot":
                return await self.screenshot()
            elif action == "cursor_position":
                command_parts = [self.xdotool, "getmouselocation --shell"]
                result = await self.shell(
                    " ".join(command_parts),
                    take_screenshot=False,
                )
                output = result.output or ""
                try:
                    raw_x = int(output.split("X=")[1].split("\n")[0])
                    raw_y = int(output.split("Y=")[1].split("\n")[0])
                except (IndexError, ValueError) as err:
                    # The command produced no parsable X=/Y= lines.
                    raise ToolError(f"Failed to read cursor position: {result.error or output}") from err
                x, y = self.scale_coordinates(ScalingSource.COMPUTER, raw_x, raw_y)
                return result.replace(output=f"X={x},Y={y}")
            else:
                command_parts = [self.xdotool, f"click {CLICK_BUTTONS[action]}"]
                return await self.shell(" ".join(command_parts))

        raise ToolError(f"Invalid action: {action}")

    def validate_and_get_coordinates(self, coordinate: tuple[int, int] | None = None):
        """Validate an (x, y) pair in API coordinates and return it in screen coordinates.

        Both lists and tuples are accepted: callers in this module build tuples,
        while decoded JSON input arrives as lists.

        Raises:
            ToolError: If the value is not a pair of non-negative ints, or if
                ``scale_coordinates`` rejects it as out of bounds.
        """
        if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
            raise ToolError(f"{coordinate} must be a tuple of length 2")
        if not all(isinstance(i, int) and i >= 0 for i in coordinate):
            raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

        return self.scale_coordinates(ScalingSource.API, coordinate[0], coordinate[1])

    async def screenshot(self):
        """Take a screenshot of the current screen and return the base64 encoded image."""
        output_dir = Path(OUTPUT_DIR)
        output_dir.mkdir(parents=True, exist_ok=True)
        path = output_dir / f"screenshot_{uuid4().hex}.png"

        # Prefer gnome-screenshot, then grim, and fall back to scrot.
        if shutil.which("gnome-screenshot"):
            screenshot_cmd = f"{self._display_prefix}gnome-screenshot -f {path} -p"
        elif shutil.which("grim"):
            screenshot_cmd = f"{self._display_prefix}grim {path}"
        else:
            screenshot_cmd = f"{self._display_prefix}scrot -p {path}"

        result = await self.shell(screenshot_cmd, take_screenshot=False)
        if self._scaling_enabled:
            # Resize in place to the scaled size so the image matches API coordinates.
            x, y = self.scale_coordinates(ScalingSource.COMPUTER, self.width, self.height)
            await self.shell(f"convert {path} -resize {x}x{y}! {path}", take_screenshot=False)

        if path.exists():
            return result.replace(base64_image=base64.b64encode(path.read_bytes()).decode())
        raise ToolError(f"Failed to take screenshot: {result.error}")

    async def shell(self, command: str, take_screenshot=True) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot."""
        _, stdout, stderr = await run(command)
        base64_image = None

        if take_screenshot:
            # delay to let things settle before taking a screenshot
            await asyncio.sleep(self._screenshot_delay)
            base64_image = (await self.screenshot()).base64_image

        return ToolResult(output=stdout, error=stderr, base64_image=base64_image)

    def scale_coordinates(self, source: ScalingSource, x: int, y: int):
        """Scale coordinates to a target maximum resolution.

        Values are returned unchanged when scaling is disabled or when no
        MAX_SCALING_TARGETS entry both matches the screen's aspect ratio and
        is narrower than the screen. Otherwise API-sourced values are scaled
        up to screen space and computer-sourced values are scaled down.

        Raises:
            ToolError: If an API-sourced coordinate exceeds the screen size.
        """
        if not self._scaling_enabled:
            return x, y
        ratio = self.width / self.height
        target_dimension = None
        for dimension in MAX_SCALING_TARGETS.values():
            # allow some error in the aspect ratio - not all ratios are exactly 16:9
            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
                if dimension["width"] < self.width:
                    target_dimension = dimension
                # Stop at the first aspect-ratio match even if it is not smaller.
                break
        if target_dimension is None:
            return x, y
        # should be less than 1
        x_scaling_factor = target_dimension["width"] / self.width
        y_scaling_factor = target_dimension["height"] / self.height
        if source == ScalingSource.API:
            if x > self.width or y > self.height:
                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
            # scale up
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down
        return round(x * x_scaling_factor), round(y * y_scaling_factor)
282
+
283
+
284
class ComputerTool20241022(BaseComputerTool, BaseTool):
    """Computer tool registered under the ``computer_20241022`` API type."""

    api_type: Literal["computer_20241022"] = "computer_20241022"

    def to_params(self) -> Any:
        """Return the tool definition: name, API type, then the display options."""
        params = {"name": self.name, "type": self.api_type}
        params.update(self.options)
        return params
289
+
290
+
291
class ComputerTool20250124(BaseComputerTool, BaseTool):
    """Computer tool registered under the ``computer_20250124`` API type.

    Adds mouse down/up, scroll, hold_key, wait and triple_click, and lets click
    actions take an optional coordinate and modifier key. Any other action is
    delegated to ``BaseComputerTool.__call__``.
    """

    api_type: Literal["computer_20250124"] = "computer_20250124"

    def to_params(self):
        """Return the tool definition: name, API type, then the display options."""
        return {"name": self.name, "type": self.api_type, **self.options}

    @staticmethod
    def _quote_keys(keys: str) -> str:
        """Shell-quote each whitespace-separated key so model-supplied text stays inert."""
        return " ".join(shlex.quote(part) for part in keys.split())

    async def __call__(
        self,
        *,
        action: Action_20250124,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        start_coordinate: tuple[int, int] | None = None,
        scroll_direction: ScrollDirection | None = None,
        scroll_amount: int | None = None,
        duration: int | float | None = None,
        key: str | None = None,
        **kwargs,
    ):
        """Execute one action and return its result.

        Args:
            action: The action to perform.
            text: Key held during "scroll", key for "hold_key", or text for
                actions handled by the base class.
            coordinate: Optional (x, y) in API coordinates to move to before a
                click or scroll.
            start_coordinate: Forwarded to the base class for "left_click_drag".
            scroll_direction: Direction for "scroll".
            scroll_amount: Number of scroll clicks for "scroll".
            duration: Seconds for "hold_key" and "wait" (0 to 100).
            key: Key held down while a click action is performed.
            **kwargs: Forwarded to the base class.

        Raises:
            ToolError: If arguments are missing, not accepted, or out of range.
        """
        if action in ("left_mouse_down", "left_mouse_up"):
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action=}.")
            command_parts = [
                self.xdotool,
                f"{'mousedown' if action == 'left_mouse_down' else 'mouseup'} 1",
            ]
            return await self.shell(" ".join(command_parts))
        if action == "scroll":
            if scroll_direction is None or scroll_direction not in get_args(ScrollDirection):
                raise ToolError(f"{scroll_direction=} must be 'up', 'down', 'left', or 'right'")
            if not isinstance(scroll_amount, int) or scroll_amount < 0:
                raise ToolError(f"{scroll_amount=} must be a non-negative int")
            mouse_move_part = ""
            if coordinate is not None:
                x, y = self.validate_and_get_coordinates(coordinate)
                mouse_move_part = f"mousemove --sync {x} {y}"
            # Mouse buttons 4-7 are the vertical and horizontal scroll buttons.
            scroll_button = {
                "up": 4,
                "down": 5,
                "left": 6,
                "right": 7,
            }[scroll_direction]

            command_parts = [self.xdotool, mouse_move_part]
            # Quoted because the key text ends up on a shell command line.
            held_keys = self._quote_keys(text) if text else ""
            if held_keys:
                command_parts.append(f"keydown {held_keys}")
            command_parts.append(f"click --repeat {scroll_amount} {scroll_button}")
            if held_keys:
                command_parts.append(f"keyup {held_keys}")

            return await self.shell(" ".join(command_parts))

        if action in ("hold_key", "wait"):
            if duration is None or not isinstance(duration, (int, float)):
                raise ToolError(f"{duration=} must be a number")
            if duration < 0:
                raise ToolError(f"{duration=} must be non-negative")
            if duration > 100:
                raise ToolError(f"{duration=} is too long.")

            if action == "hold_key":
                if text is None:
                    raise ToolError(f"text is required for {action}")
                escaped_keys = shlex.quote(text)
                command_parts = [
                    self.xdotool,
                    f"keydown {escaped_keys}",
                    f"sleep {duration}",
                    f"keyup {escaped_keys}",
                ]
                return await self.shell(" ".join(command_parts))

            if action == "wait":
                await asyncio.sleep(duration)
                return await self.screenshot()

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "triple_click",
            "middle_click",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            mouse_move_part = ""
            if coordinate is not None:
                x, y = self.validate_and_get_coordinates(coordinate)
                mouse_move_part = f"mousemove --sync {x} {y}"

            command_parts = [self.xdotool, mouse_move_part]
            # Quoted because the key text ends up on a shell command line.
            held_keys = self._quote_keys(key) if key else ""
            if held_keys:
                command_parts.append(f"keydown {held_keys}")
            command_parts.append(f"click {CLICK_BUTTONS[action]}")
            if held_keys:
                command_parts.append(f"keyup {held_keys}")

            return await self.shell(" ".join(command_parts))

        # Everything else (key, type, mouse_move, drag, screenshot, ...) is
        # handled by the base implementation.
        return await super().__call__(
            action=action,
            text=text,
            coordinate=coordinate,
            start_coordinate=start_coordinate,
            key=key,
            **kwargs,
        )
398
+
399
+
400
class ComputerTool20251124(ComputerTool20250124):
    """Computer tool registered under the ``computer_20251124`` API type; adds the "zoom" action."""

    api_type: Literal["computer_20251124"] = "computer_20251124"  # pyright: ignore[reportIncompatibleVariableOverride]

    @property
    def options(self) -> ComputerToolOptions:  # pyright: ignore[reportIncompatibleMethodOverride]
        """Return the parent's display options plus ``enable_zoom``."""
        return {**super().options, "enable_zoom": True}  # pyright: ignore[reportReturnType]

    async def __call__(
        self,
        *,
        action: Action_20251124,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        scroll_direction: ScrollDirection | None = None,
        scroll_amount: int | None = None,
        duration: int | float | None = None,
        key: str | None = None,
        region: tuple[int, int, int, int] | None = None,
        **kwargs,
    ):
        """Execute one action; "zoom" is handled here, everything else by the parent.

        Args:
            action: The action to perform.
            region: For "zoom", (x0, y0, x1, y1) in API coordinates with
                x0 < x1 and y0 < y1.
            text, coordinate, scroll_direction, scroll_amount, duration, key,
            **kwargs: Forwarded to the parent for non-zoom actions.

        Returns:
            For "zoom", a ToolResult whose image is the cropped region.

        Raises:
            ToolError: If the region is malformed or empty, or if the
                screenshot or crop fails.
        """
        if action == "zoom":
            if region is None or not isinstance(region, (list, tuple)) or len(region) != 4:
                raise ToolError(f"{region=} must be a tuple of 4 coordinates (x0, y0, x1, y1)")
            if not all(isinstance(c, int) and c >= 0 for c in region):
                raise ToolError(f"{region=} must contain non-negative integers")

            x0, y0, x1, y1 = region
            if x1 <= x0 or y1 <= y0:
                # A zero or negative extent cannot be cropped.
                raise ToolError(f"{region=} must satisfy x0 < x1 and y0 < y1")

            # Run the shared conversion only for its out-of-bounds check. The
            # converted values are discarded: screenshot() already resizes the
            # image to API coordinates, so the crop must use the region as given.
            self.scale_coordinates(ScalingSource.API, x0, y0)
            self.scale_coordinates(ScalingSource.API, x1, y1)

            # Take a screenshot and crop to the specified region
            screenshot_result = await self.screenshot()
            if not screenshot_result.base64_image:
                raise ToolError("Failed to take screenshot for zoom")

            output_dir = Path(OUTPUT_DIR)
            temp_path = output_dir / f"screenshot_{uuid4().hex}.png"
            cropped_path = output_dir / f"zoomed_{uuid4().hex}.png"

            try:
                # Write the screenshot to a temp file
                temp_path.write_bytes(base64.b64decode(screenshot_result.base64_image))

                # Crop using ImageMagick: convert input -crop WxH+X+Y output
                width = x1 - x0
                height = y1 - y0
                crop_cmd = f"convert {temp_path} -crop {width}x{height}+{x0}+{y0} +repage {cropped_path}"
                await run(crop_cmd)

                if not cropped_path.exists():
                    raise ToolError("Failed to crop screenshot for zoom")
                cropped_base64 = base64.b64encode(cropped_path.read_bytes()).decode()
            finally:
                # Remove both working files whether or not the crop succeeded.
                temp_path.unlink(missing_ok=True)
                cropped_path.unlink(missing_ok=True)

            return ToolResult(base64_image=cropped_base64)

        return await super().__call__(
            action=action,
            text=text,
            coordinate=coordinate,
            scroll_direction=scroll_direction,
            scroll_amount=scroll_amount,
            duration=duration,
            key=key,
            **kwargs,
        )
468
+
469
+
470
# Shared tool instance; stays None until the first call to computer() creates it.
g_tool: "ComputerTool20251124 | None" = None
471
+
472
+
473
def strip_list_brackets(s: str) -> str:
    """Trim whitespace and remove one enclosing ``[...]`` or ``(...)`` pair, if present.

    A falsy ``s`` is returned as-is.
    """
    if not s:
        return s
    trimmed = s.strip()
    for opener, closer in (("[", "]"), ("(", ")")):
        if trimmed.startswith(opener) and trimmed.endswith(closer):
            return trimmed[1:-1]
    return trimmed
482
+
483
+
484
def str_to_list(s: str) -> list[int]:
    """Parse a comma-separated string of ints, optionally wrapped in brackets or parentheses."""
    fields = strip_list_brackets(s).split(",")
    return list(map(int, fields))
486
+
487
+
488
def _parse_int_values(value) -> list[int] | None:
    """Turn a "(x, y)"-style string or an int sequence into a list of ints; falsy input gives None."""
    if not value:
        return None
    if isinstance(value, str):
        return str_to_list(value)
    return [int(item) for item in value]


async def computer(
    action: Action_20251124,
    text: Annotated[str | None, "The text to type or the key to press"] = None,
    coordinate: Annotated[tuple[int, int] | None, "(x, y): The x and y coordinates to move the mouse to"] = None,
    scroll_direction: ScrollDirection | None = None,
    scroll_amount: Annotated[int | None, "The number of lines to scroll"] = None,
    duration: Annotated[float | None, "Duration in seconds"] = None,
    key: Annotated[str | None, "The key sequence to press"] = None,
    region: Annotated[str | None, "(x0, y0, x1, y1): The region to zoom into"] = None,
    start_coordinate: Annotated[
        tuple[int, int] | None, "(x, y): The x and y coordinates where a left_click_drag starts"
    ] = None,
) -> list[dict[str, Any]]:
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
    """
    # Create the shared tool instance on first use.
    global g_tool
    if g_tool is None:
        g_tool = ComputerTool20251124()

    # Coordinates may arrive as text such as "(10, 20)" or as real sequences.
    # They are passed on as lists, the sequence type the tool's coordinate
    # validation accepts.
    coordinate_values = _parse_int_values(coordinate)
    start_coordinate_values = _parse_int_values(start_coordinate)
    parsed_region = _parse_int_values(region)
    region_values = tuple(parsed_region) if parsed_region is not None else None

    # Falsy arguments (empty strings, 0) are forwarded as None.
    result = await g_tool(
        action=action,
        text=text if text else None,
        coordinate=coordinate_values,
        # Without this, "left_click_drag" could never be given its start point.
        start_coordinate=start_coordinate_values,
        scroll_direction=scroll_direction if scroll_direction else None,
        scroll_amount=scroll_amount if scroll_amount else None,
        duration=float(duration) if duration else None,
        key=key if key else None,
        region=region_values,
    )
    if isinstance(result, Exception):
        raise result
    else:
        return result.to_tool_results()