inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. inspect_ai/_cli/eval.py +27 -0
  2. inspect_ai/_display/textual/widgets/samples.py +3 -3
  3. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  4. inspect_ai/_eval/eval.py +19 -2
  5. inspect_ai/_eval/evalset.py +4 -1
  6. inspect_ai/_eval/run.py +41 -0
  7. inspect_ai/_eval/task/generate.py +38 -44
  8. inspect_ai/_eval/task/log.py +26 -28
  9. inspect_ai/_eval/task/run.py +23 -27
  10. inspect_ai/_util/answer.py +26 -0
  11. inspect_ai/_util/constants.py +0 -1
  12. inspect_ai/_util/local_server.py +398 -0
  13. inspect_ai/_util/working.py +10 -4
  14. inspect_ai/_view/www/dist/assets/index.css +173 -159
  15. inspect_ai/_view/www/dist/assets/index.js +1417 -1142
  16. inspect_ai/_view/www/log-schema.json +379 -3
  17. inspect_ai/_view/www/package.json +1 -1
  18. inspect_ai/_view/www/src/@types/log.d.ts +93 -14
  19. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  20. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  21. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  22. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  23. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  24. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  25. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  26. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  27. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  28. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  29. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  30. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  31. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  32. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  33. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  34. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  35. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  36. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  37. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  38. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  39. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  40. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  41. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  42. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  43. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  44. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  45. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  46. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  47. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  48. inspect_ai/_view/www/src/components/Card.css +0 -1
  49. inspect_ai/_view/www/src/constants.ts +2 -0
  50. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  51. inspect_ai/agent/_agent.py +3 -3
  52. inspect_ai/agent/_as_solver.py +22 -12
  53. inspect_ai/agent/_as_tool.py +20 -6
  54. inspect_ai/agent/_handoff.py +12 -1
  55. inspect_ai/agent/_react.py +4 -3
  56. inspect_ai/agent/_run.py +16 -3
  57. inspect_ai/agent/_types.py +9 -0
  58. inspect_ai/dataset/_dataset.py +6 -3
  59. inspect_ai/log/__init__.py +14 -0
  60. inspect_ai/log/_convert.py +4 -9
  61. inspect_ai/log/_file.py +56 -0
  62. inspect_ai/log/_log.py +99 -0
  63. inspect_ai/log/_recorders/__init__.py +2 -0
  64. inspect_ai/log/_recorders/buffer/database.py +12 -11
  65. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  66. inspect_ai/log/_recorders/buffer/types.py +2 -2
  67. inspect_ai/log/_recorders/eval.py +20 -65
  68. inspect_ai/log/_recorders/file.py +28 -6
  69. inspect_ai/log/_recorders/recorder.py +7 -0
  70. inspect_ai/log/_recorders/types.py +1 -23
  71. inspect_ai/log/_samples.py +14 -25
  72. inspect_ai/log/_transcript.py +84 -36
  73. inspect_ai/log/_tree.py +118 -0
  74. inspect_ai/log/_util.py +52 -0
  75. inspect_ai/model/__init__.py +5 -1
  76. inspect_ai/model/_call_tools.py +72 -44
  77. inspect_ai/model/_generate_config.py +14 -8
  78. inspect_ai/model/_model.py +66 -88
  79. inspect_ai/model/_model_output.py +25 -0
  80. inspect_ai/model/_openai.py +2 -0
  81. inspect_ai/model/_providers/anthropic.py +13 -23
  82. inspect_ai/model/_providers/hf.py +27 -1
  83. inspect_ai/model/_providers/openai_o1.py +8 -2
  84. inspect_ai/model/_providers/providers.py +18 -4
  85. inspect_ai/model/_providers/sglang.py +247 -0
  86. inspect_ai/model/_providers/vllm.py +211 -400
  87. inspect_ai/scorer/_choice.py +1 -2
  88. inspect_ai/solver/__init__.py +7 -2
  89. inspect_ai/solver/_basic_agent.py +3 -10
  90. inspect_ai/solver/_chain.py +1 -1
  91. inspect_ai/solver/_fork.py +1 -1
  92. inspect_ai/solver/_multiple_choice.py +5 -22
  93. inspect_ai/solver/_plan.py +2 -2
  94. inspect_ai/solver/_task_state.py +26 -88
  95. inspect_ai/solver/_transcript.py +6 -7
  96. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  97. inspect_ai/tool/_mcp/_mcp.py +8 -5
  98. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  99. inspect_ai/tool/_mcp/server.py +3 -1
  100. inspect_ai/tool/_tool_call.py +4 -1
  101. inspect_ai/tool/_tool_support_helpers.py +51 -12
  102. inspect_ai/tool/_tools/_bash_session.py +190 -68
  103. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  104. inspect_ai/tool/_tools/_execute.py +4 -1
  105. inspect_ai/tool/_tools/_text_editor.py +4 -3
  106. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  107. inspect_ai/util/__init__.py +16 -0
  108. inspect_ai/util/_anyio.py +11 -0
  109. inspect_ai/util/_collect.py +50 -0
  110. inspect_ai/util/_limit.py +393 -0
  111. inspect_ai/util/_limited_conversation.py +57 -0
  112. inspect_ai/util/_span.py +58 -0
  113. inspect_ai/util/_subtask.py +27 -42
  114. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  115. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
  116. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  117. inspect_ai/_display/core/group.py +0 -79
  118. inspect_ai/solver/_limit.py +0 -39
  119. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  120. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  121. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  122. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  123. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  124. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  125. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  126. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  127. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  128. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  129. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  130. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  131. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  132. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  133. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  134. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  135. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  136. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  137. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  138. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  139. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  140. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  141. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  142. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  143. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  144. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  145. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  146. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  147. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  148. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  149. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
@@ -1,78 +0,0 @@
1
- from argparse import Action, ArgumentParser, Namespace
2
- from typing import Sequence
3
-
4
-
5
- def parse_arguments(args: Sequence[str] | None = None) -> Namespace:
6
- return _create_parser().parse_args(args)
7
-
8
-
9
- def _create_parser() -> ArgumentParser:
10
- parser = ArgumentParser(prog="computer_tool")
11
- subparsers = parser.add_subparsers(dest="action", required=True)
12
-
13
- # these take no additional arguments
14
- subparsers.add_parser(
15
- "screenshot",
16
- aliases=["cursor_position", "left_mouse_down", "left_mouse_up"],
17
- )
18
-
19
- key_and_type = subparsers.add_parser("type", aliases=["key"])
20
- _add_text(key_and_type)
21
-
22
- hold_key = subparsers.add_parser("hold_key")
23
- _add_text(hold_key)
24
- _add_duration(hold_key)
25
-
26
- mouse_move = subparsers.add_parser("mouse_move")
27
- _add_coordinate(mouse_move)
28
-
29
- click = subparsers.add_parser(
30
- "left_click",
31
- aliases=["right_click", "middle_click", "double_click", "triple_click"],
32
- )
33
- _add_coordinate(click, False)
34
- _add_text(click, False)
35
-
36
- left_click_drag = subparsers.add_parser("left_click_drag")
37
- _add_start_coordinate(left_click_drag)
38
- _add_coordinate(left_click_drag)
39
- _add_text(left_click_drag, False)
40
-
41
- scroll = subparsers.add_parser("scroll")
42
- _add_scroll_direction(scroll)
43
- _add_scroll_amount(scroll)
44
- # despite what the doc says, the model doesn't always provide a coordinate
45
- _add_coordinate(scroll, False)
46
-
47
- wait = subparsers.add_parser("wait")
48
- _add_duration(wait)
49
-
50
- return parser
51
-
52
-
53
- def _add_scroll_direction(subparser: ArgumentParser) -> Action:
54
- return subparser.add_argument(
55
- "--scroll_direction", choices=["up", "down", "left", "right"], required=True
56
- )
57
-
58
-
59
- def _add_scroll_amount(subparser: ArgumentParser) -> Action:
60
- return subparser.add_argument("--scroll_amount", type=int, required=True)
61
-
62
-
63
- def _add_coordinate(subparser: ArgumentParser, required: bool = True) -> Action:
64
- return subparser.add_argument("--coordinate", type=int, nargs=2, required=required)
65
-
66
-
67
- def _add_start_coordinate(subparser: ArgumentParser) -> Action:
68
- return subparser.add_argument(
69
- "--start_coordinate", type=int, nargs=2, required=True
70
- )
71
-
72
-
73
- def _add_duration(subparser: ArgumentParser) -> Action:
74
- return subparser.add_argument("--duration", type=int, required=True)
75
-
76
-
77
- def _add_text(subparser: ArgumentParser, required: bool = True) -> Action:
78
- return subparser.add_argument("--text", type=str, required=required)
@@ -1,22 +0,0 @@
1
- from typing import Literal
2
-
3
# Closed set of action names understood by the computer tool.
Action = Literal[
    # keyboard
    "key",
    "hold_key",
    "type",
    # pointer position and raw button state
    "cursor_position",
    "mouse_move",
    "left_mouse_down",
    "left_mouse_up",
    # clicks and drags
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "back_click",
    "forward_click",
    "double_click",
    "triple_click",
    # everything else
    "scroll",
    "wait",
    "screenshot",
]
@@ -1,22 +0,0 @@
1
- import logging
2
-
3
-
4
def setup_logger(level=logging.INFO):
    """
    Return the "computer_tool" logger, wired to PID 1's stdout.

    This makes it so that logging from invocations of the computer_tool cli show up in `docker logs` output.

    Args:
        level: Level applied to the logger on every call, and to the handler
            when one is created.

    Returns:
        The shared ``logging.getLogger("computer_tool")`` instance.
    """
    new_logger = logging.getLogger("computer_tool")
    new_logger.setLevel(level)

    # Open /proc/1/fd/1 only when a handler is actually going to be attached.
    # Creating the FileHandler unconditionally opened the file on every call
    # and leaked the handle whenever the logger was already configured.
    if not new_logger.handlers:
        stdout_handler = logging.FileHandler("/proc/1/fd/1", mode="w")
        stdout_handler.setLevel(level)
        stdout_handler.setFormatter(
            logging.Formatter("%(name)s(pid=%(process)d) - %(levelname)s - %(message)s")
        )
        new_logger.addHandler(stdout_handler)

    return new_logger
@@ -1,42 +0,0 @@
1
- """Utility to run shell commands asynchronously with a timeout."""
2
-
3
- import asyncio
4
-
5
- TRUNCATED_MESSAGE: str = "<response clipped><NOTE>To save on context only part of this file has been shown to you. You should retry this tool after you have searched inside the file with `grep -n` in order to find the line numbers of what you are looking for.</NOTE>"
6
- MAX_RESPONSE_LEN: int = 16000
7
-
8
-
9
- def maybe_truncate(content: str, truncate_after: int | None = MAX_RESPONSE_LEN):
10
- """Truncate content and append a notice if content exceeds the specified length."""
11
- return (
12
- content
13
- if not truncate_after or len(content) <= truncate_after
14
- else content[:truncate_after] + TRUNCATED_MESSAGE
15
- )
16
-
17
-
18
- async def run(
19
- cmd: str,
20
- timeout: float | None = 120.0, # seconds
21
- truncate_after: int | None = MAX_RESPONSE_LEN,
22
- ):
23
- """Run a shell command asynchronously with a timeout."""
24
- process = await asyncio.create_subprocess_shell(
25
- cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
26
- )
27
-
28
- try:
29
- stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
30
- return (
31
- process.returncode or 0,
32
- maybe_truncate(stdout.decode(), truncate_after=truncate_after),
33
- maybe_truncate(stderr.decode(), truncate_after=truncate_after),
34
- )
35
- except (TimeoutError, asyncio.TimeoutError) as exc:
36
- try:
37
- process.kill()
38
- except ProcessLookupError:
39
- pass
40
- raise TimeoutError(
41
- f"Command '{cmd}' timed out after {timeout} seconds"
42
- ) from exc
@@ -1,33 +0,0 @@
1
- from dataclasses import dataclass, fields, replace
2
-
3
-
4
- @dataclass(kw_only=True, frozen=True)
5
- class ToolResult:
6
- """Represents the result of a tool execution."""
7
-
8
- output: str | None = None
9
- error: str | None = None
10
- base64_image: str | None = None
11
-
12
- def __bool__(self):
13
- return any(getattr(self, field.name) for field in fields(self))
14
-
15
- def __add__(self, other: "ToolResult"):
16
- def combine_fields(
17
- field: str | None, other_field: str | None, concatenate: bool = True
18
- ):
19
- if field and other_field:
20
- if concatenate:
21
- return field + other_field
22
- raise ValueError("Cannot combine tool results")
23
- return field or other_field
24
-
25
- return ToolResult(
26
- output=combine_fields(self.output, other.output),
27
- error=combine_fields(self.error, other.error),
28
- base64_image=combine_fields(self.base64_image, other.base64_image, False),
29
- )
30
-
31
- def replace(self, **kwargs):
32
- """Returns a new ToolResult with the given fields replaced."""
33
- return replace(self, **kwargs)
@@ -1,341 +0,0 @@
1
- """Inspired by https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/computer_use_demo/tools/computer.py"""
2
-
3
- import asyncio
4
- import base64
5
- import logging
6
- import os
7
- import shlex
8
- from pathlib import Path
9
- from typing import Literal, TypedDict
10
- from uuid import uuid4
11
-
12
- from _run import run
13
- from _tool_result import ToolResult
14
-
15
# Directory that receives screenshot PNG files.
OUTPUT_DIR = "/tmp/outputs"

# Passed to `xdotool type --delay`, and the number of characters typed per
# xdotool invocation.
TYPING_DELAY_MS = 12
TYPING_GROUP_SIZE = 50

# Palette sizes accepted for the screenshot `-colors` reduction.
ColorCount = Literal[4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4]
21
-
22
-
23
class X11ClientError(Exception):
    """Failure raised by the X11 client; the text is also kept on ``message``."""

    def __init__(self, message):
        # expose the text as an attribute in addition to the exception args
        self.message = message
26
-
27
-
28
class Resolution(TypedDict):
    """Pixel dimensions of a display mode."""

    width: int
    height: int


# sizes above XGA/WXGA are not recommended (see README.md)
# scale down to one of these targets if X11Client._scaling_enabled is set
MAX_SCALING_TARGETS: dict[str, Resolution] = {
    "XGA": {"width": 1024, "height": 768},  # 4:3
    "WXGA": {"width": 1280, "height": 800},  # 16:10
    "FWXGA": {"width": 1366, "height": 768},  # ~16:9
}
40
-
41
-
42
- ScalingSource = Literal["computer", "api"]
43
-
44
-
45
- class ComputerToolOptions(TypedDict):
46
- display_height_px: int
47
- display_width_px: int
48
- display_number: int | None
49
-
50
-
51
def chunks(s: str, chunk_size: int) -> list[str]:
    """Split *s* into consecutive pieces of at most *chunk_size* characters."""
    pieces: list[str] = []
    for start in range(0, len(s), chunk_size):
        pieces.append(s[start : start + chunk_size])
    return pieces
53
-
54
-
55
class X11Client:
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.

    The tool parameters are defined by Anthropic and are not editable.

    Configuration is read from the environment in ``__init__``: ``WIDTH`` and
    ``HEIGHT`` (required) and ``DISPLAY_NUM`` (optional). Every action shells
    out to ``xdotool``; unless stated otherwise the returned ``ToolResult``
    carries a screenshot taken after the action.
    """

    width: int
    height: int
    display_num: int | None
    # TODO: Complete plumbing this or remove it
    color_count: ColorCount | None = 256

    # seconds to wait after an action before capturing the screen
    _screenshot_delay = 2.0
    _scaling_enabled = True

    @property
    def options(self) -> ComputerToolOptions:
        """Display size (after scaling) and display number."""
        width, height = self._scale_coordinates("computer", self.width, self.height)
        return {
            "display_width_px": width,
            "display_height_px": height,
            "display_number": self.display_num,
        }

    def __init__(self):
        """Read WIDTH, HEIGHT and DISPLAY_NUM from the environment.

        Raises:
            X11ClientError: WIDTH or HEIGHT is unset, empty, or zero.
        """
        super().__init__()

        self.width = int(os.getenv("WIDTH") or 0)
        self.height = int(os.getenv("HEIGHT") or 0)
        # explicit raise rather than assert: asserts are stripped under -O,
        # which would defer the failure to a division by zero when scaling
        if not (self.width and self.height):
            raise X11ClientError("WIDTH, HEIGHT must be set")
        if (display_num := os.getenv("DISPLAY_NUM")) is not None:
            self.display_num = int(display_num)
            self._display_prefix = f"DISPLAY=:{self.display_num} "
        else:
            self.display_num = None
            self._display_prefix = ""

        self.xdotool = f"{self._display_prefix}xdotool"

    async def key(self, text: str) -> ToolResult:
        """Press the key(s) named by *text* (whitespace-separated xdotool keys)."""
        return await self._shell(f"{self.xdotool} key -- {_key_arg_for_text(text)}")

    async def hold_key(self, text: str, duration: int) -> ToolResult:
        """Hold the key(s) named by *text* down for *duration* seconds."""
        return await self._while_keys_held(text, lambda: asyncio.sleep(duration))

    async def type(self, text: str) -> ToolResult:
        """Type *text* in groups of TYPING_GROUP_SIZE characters, then take one screenshot."""
        results: list[ToolResult] = []
        for chunk in chunks(text, TYPING_GROUP_SIZE):
            cmd = (
                f"{self.xdotool} type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}"
            )
            results.append(await self._shell(cmd, take_screenshot=False))

        screenshot_base64 = await self._take_screenshot_after_delay()
        return ToolResult(
            output="".join(result.output or "" for result in results),
            error="".join(result.error or "" for result in results),
            base64_image=screenshot_base64,
        )

    async def cursor_position(self) -> ToolResult:
        """Report the pointer location as ``X=<x>,Y=<y>`` in scaled coordinates (no screenshot)."""
        result = await self._shell(
            f"{self.xdotool} getmouselocation --shell",
            take_screenshot=False,
        )
        output = result.output or ""
        x, y = self._scale_coordinates(
            "computer",
            int(output.split("X=")[1].split("\n")[0]),
            int(output.split("Y=")[1].split("\n")[0]),
        )
        return result.replace(output=f"X={x},Y={y}")

    async def left_mouse_down(self) -> ToolResult:
        return await self._shell(f"{self.xdotool} mousedown 1")

    async def left_mouse_up(self) -> ToolResult:
        return await self._shell(f"{self.xdotool} mouseup 1")

    async def mouse_move(self, coordinate: tuple[int, int]) -> ToolResult:
        return await self._mouse_move_and("mouse_move", coordinate, None)

    async def left_click(
        self, coordinate: tuple[int, int] | None, text: str | None
    ) -> ToolResult:
        return await self._mouse_move_and("left_click", coordinate, text)

    async def right_click(
        self, coordinate: tuple[int, int] | None, text: str | None
    ) -> ToolResult:
        return await self._mouse_move_and("right_click", coordinate, text)

    async def middle_click(
        self, coordinate: tuple[int, int] | None, text: str | None
    ) -> ToolResult:
        return await self._mouse_move_and("middle_click", coordinate, text)

    # https://wiki.archlinux.org/title/Mouse_buttons#Thumb_buttons_-_forward_and_back
    # suggests that, although not in any spec, the de facto standard is 8 for
    # back and 9 for forward.
    async def back_click(
        self, coordinate: tuple[int, int] | None, text: str | None
    ) -> ToolResult:
        return await self._mouse_move_and("back_click", coordinate, text)

    async def forward_click(
        self, coordinate: tuple[int, int] | None, text: str | None
    ) -> ToolResult:
        return await self._mouse_move_and("forward_click", coordinate, text)

    async def double_click(
        self, coordinate: tuple[int, int] | None, text: str | None
    ) -> ToolResult:
        return await self._mouse_move_and("double_click", coordinate, text)

    async def triple_click(
        self, coordinate: tuple[int, int] | None, text: str | None
    ) -> ToolResult:
        return await self._mouse_move_and("triple_click", coordinate, text)

    async def left_click_drag(
        self, start_coordinate: tuple[int, int], coordinate: tuple[int, int]
    ) -> ToolResult:
        """Press button 1 at *start_coordinate*, move to *coordinate*, release."""
        await self._move_mouse_to_coordinate(start_coordinate, False)
        x, y = self._scale_coordinates("api", *coordinate)
        return await self._shell(
            f"{self.xdotool} mousedown 1 mousemove --sync {x} {y} mouseup 1"
        )

    async def scroll(
        self,
        scroll_direction: Literal["up", "down", "left", "right"],
        scroll_amount: int,
        coordinate: tuple[int, int] | None,
        text: str | None,
    ) -> ToolResult:
        """Scroll *scroll_amount* steps, optionally at *coordinate* and with the key(s) in *text* held."""
        if coordinate:
            await self._move_mouse_to_coordinate(coordinate, False)
        # X11 delivers wheel motion as clicks of buttons 4-7
        scroll_button = {
            "up": 4,
            "down": 5,
            "left": 6,
            "right": 7,
        }[scroll_direction]
        scroll_cmd = f"{self.xdotool} click --repeat {scroll_amount} {scroll_button}"

        if text:
            return await self._while_keys_held(
                text, lambda: self._shell(scroll_cmd, False)
            )
        return await self._shell(scroll_cmd)

    async def wait(self, duration: int) -> ToolResult:
        """Sleep for *duration* seconds, then return a screenshot."""
        await asyncio.sleep(duration)
        return await self.screenshot()

    async def screenshot(self) -> ToolResult:
        return await self._screenshot()

    async def _while_keys_held(self, text: str, action) -> ToolResult:
        """Press the key(s) in *text*, await ``action()``, then release them.

        ``action`` is a zero-argument callable returning an awaitable. If it
        raises (for example the shell command times out) or is cancelled, the
        keys are still released, so a failed step cannot leave a key pressed
        for later invocations; the original error then propagates. On success
        the result of the ``keyup`` command (with screenshot) is returned.
        """
        key_arg = _key_arg_for_text(text)
        await self._shell(f"{self.xdotool} keydown -- {key_arg}", False)
        try:
            await action()
        except BaseException:
            # best-effort release; no screenshot since the caller never sees it
            await self._shell(f"{self.xdotool} keyup -- {key_arg}", False)
            raise
        return await self._shell(f"{self.xdotool} keyup -- {key_arg}")

    async def _mouse_move_and(
        self,
        action: Literal[
            "mouse_move",
            "left_click",
            "right_click",
            "middle_click",
            "back_click",
            "forward_click",
            "double_click",
            "triple_click",
        ],
        coordinate: tuple[int, int] | None,
        text: str | None,
    ):
        """Optionally move the pointer, then perform the click named by *action*.

        For ``"mouse_move"`` the move itself is the whole action. For clicks,
        the pointer is moved only when *coordinate* is given, and *text*, when
        given, names key(s) held for the duration of the click.
        """
        should_move = action == "mouse_move" or coordinate
        if should_move:
            assert coordinate  # coding/type safety error
            move_result = await self._move_mouse_to_coordinate(
                coordinate, action == "mouse_move"
            )
            if action == "mouse_move":
                return move_result
        click_arg = {
            "left_click": "1",
            "right_click": "3",
            "middle_click": "2",
            "back_click": "8",
            "forward_click": "9",
            "double_click": "--repeat 2 --delay 300 1",
            "triple_click": "--repeat 3 --delay 300 1",
        }[action]
        click_cmd = f"{self.xdotool} click {click_arg}"

        if text:
            return await self._while_keys_held(
                text, lambda: self._shell(click_cmd, False)
            )
        return await self._shell(click_cmd)

    async def _move_mouse_to_coordinate(
        self, coordinate: tuple[int, int], take_screenshot: bool
    ):
        """Move the pointer to *coordinate*, given in API (scaled) space."""
        x, y = self._scale_coordinates("api", *coordinate)
        return await self._shell(
            f"{self.xdotool} mousemove --sync {x} {y}", take_screenshot=take_screenshot
        )

    async def _screenshot(self):
        """Take a screenshot of the current screen and return the base64 encoded image."""
        output_dir = Path(OUTPUT_DIR)
        output_dir.mkdir(parents=True, exist_ok=True)
        path = output_dir / f"screenshot_{uuid4().hex}.png"

        result = await self._shell(
            f"{self._display_prefix}scrot --silent -p {path}", take_screenshot=False
        )
        if self._scaling_enabled:
            x, y = self._scale_coordinates("computer", self.width, self.height)
            convert_cmd = f"convert {path} -resize {x}x{y}!"
            if self.color_count is not None:
                convert_cmd += f" -colors {self.color_count}"
            convert_cmd += f" {path}"
            await self._shell(convert_cmd, take_screenshot=False)

        # NOTE(review): the PNG stays in OUTPUT_DIR; nothing in this class
        # removes it. Confirm whether that is intentional.
        if path.exists():
            return result.replace(
                base64_image=base64.b64encode(path.read_bytes()).decode()
            )
        raise X11ClientError(f"Failed to take screenshot: {result.error}")

    async def _shell(self, command: str, take_screenshot=True) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot."""
        logging.debug(f"running shell command {command}")
        _, stdout, stderr = await run(command)
        logging.debug(f"shell command returned stdout: {stdout}, stderr: {stderr}")
        return ToolResult(
            output=stdout,
            error=stderr,
            base64_image=(await self._take_screenshot_after_delay())
            if take_screenshot
            else None,
        )

    async def _take_screenshot_after_delay(self) -> str:
        # delay to let things settle before taking a screenshot
        await asyncio.sleep(self._screenshot_delay)
        return (await self._screenshot()).base64_image

    def _scale_coordinates(self, source: ScalingSource, x: int, y: int):
        """Scale coordinates to a target maximum resolution.

        With ``source == "computer"`` the values are scaled down to the target;
        with ``source == "api"`` they are scaled up to the real screen. Values
        are returned unchanged when scaling is disabled or when no entry of
        MAX_SCALING_TARGETS both matches the screen's aspect ratio and is
        narrower than the screen.

        Raises:
            X11ClientError: an "api" coordinate exceeds the screen size.
        """
        if not self._scaling_enabled:
            return x, y
        ratio = self.width / self.height
        target_dimension = None
        for dimension in MAX_SCALING_TARGETS.values():
            # allow some error in the aspect ratio - not all ratios are exactly 16:9
            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
                if dimension["width"] < self.width:
                    target_dimension = dimension
                break
        if target_dimension is None:
            return x, y
        # should be less than 1
        x_scaling_factor = target_dimension["width"] / self.width
        y_scaling_factor = target_dimension["height"] / self.height
        if source == "api":
            # NOTE(review): x and y are in the scaled-down space here but are
            # compared with the full screen size - confirm that is intended.
            if x > self.width or y > self.height:
                raise X11ClientError(f"Coordinates {x}, {y} are out of bounds")
            # scale up
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down
        return round(x * x_scaling_factor), round(y * y_scaling_factor)
338
-
339
-
340
def _key_arg_for_text(text: str) -> str:
    """Shell-quote each whitespace-separated token of *text* and join them with single spaces."""
    quoted_parts = map(shlex.quote, text.split())
    return " ".join(quoted_parts)
@@ -1,141 +0,0 @@
1
- import asyncio
2
- import json
3
- import logging
4
- import os
5
- import sys
6
- import time
7
- from argparse import Namespace
8
- from typing import TypeVar
9
-
10
- from _args import parse_arguments
11
- from _constants import Action
12
- from _logger import setup_logger
13
- from _tool_result import ToolResult
14
- from _x11_client import X11Client
15
-
16
-
17
class ComputerToolError(Exception):
    """Error raised by the computer_tool command line; the text is also kept on ``message``."""

    def __init__(self, message):
        # expose the text as an attribute in addition to the exception args
        self.message = message
20
-
21
-
22
- # This is a bit sketchy. We really want to use relative imports here. Using absolute imports
23
- # works at runtime, but it prevents intellisense from working. However, when this folder is
24
- # copied to the container, by default relative imports won't work if this file is launched
25
- # normally. To overcome this, two things need to happen:
26
- # 1. PYTHONPATH must be set to the parent of the container folder. `PYTHONPATH=/opt`
27
- # 2. The program must be launched with the -m flag. `python3 -m computer_tool.computer_tool`
28
- #
29
- # TODO: There's got to be a cleaner way.
30
-
31
# Module-wide logger, configured once at import time.
my_logger = setup_logger(logging.INFO)


def main():
    """Command-line entry point.

    Parses the arguments, runs the requested action, and prints its result to
    stdout as a JSON object with ``output``, ``error`` and ``base64_image``
    keys. Any exception is logged, reported on stderr, and turned into exit
    status 1.
    """
    try:
        args = parse_arguments()
        my_logger.info(f"({args})")
        result = asyncio.run(execute_action(args))

        payload = {
            "output": result.output,
            "error": result.error,
            "base64_image": result.base64_image,
        }
        print(json.dumps(payload))
        my_logger.debug("SUCCESS")
    except Exception as e:
        failure = f"An error occurred: {e}"
        my_logger.warning(failure)
        print(failure, file=sys.stderr)
        sys.exit(1)
54
-
55
-
56
- async def execute_action(args: Namespace) -> ToolResult:
57
- # we can't do anything until X11 is ready to go.
58
- await wait_for_file("/tmp/xfce_started")
59
-
60
- computer = X11Client()
61
- action: Action = args.action
62
- match action:
63
- case "key":
64
- return await computer.key(not_none(args.text, "text"))
65
- case "hold_key":
66
- return await computer.hold_key(
67
- not_none(args.text, "text"), not_none(args.duration, "duration")
68
- )
69
- case "type":
70
- return await computer.type(not_none(args.text, "text"))
71
- case "cursor_position":
72
- return await computer.cursor_position()
73
- case "left_mouse_down":
74
- return await computer.left_mouse_down()
75
- case "left_mouse_up":
76
- return await computer.left_mouse_up()
77
- case "mouse_move":
78
- return await computer.mouse_move(not_none(args.coordinate, "coordinate"))
79
- case "left_click":
80
- return await computer.left_click(
81
- getattr(args, "coordinate", None), getattr(args, "text", None)
82
- )
83
- case "right_click":
84
- return await computer.right_click(
85
- getattr(args, "coordinate", None), getattr(args, "text", None)
86
- )
87
- case "middle_click":
88
- return await computer.middle_click(
89
- getattr(args, "coordinate", None), getattr(args, "text", None)
90
- )
91
- case "double_click":
92
- return await computer.double_click(
93
- getattr(args, "coordinate", None), getattr(args, "text", None)
94
- )
95
- case "triple_click":
96
- return await computer.triple_click(
97
- getattr(args, "coordinate", None), getattr(args, "text", None)
98
- )
99
- case "left_click_drag":
100
- return await computer.left_click_drag(
101
- not_none(args.start_coordinate, "start_coordinate"),
102
- not_none(args.coordinate, "coordinate"),
103
- )
104
- case "scroll":
105
- return await computer.scroll(
106
- not_none(args.scroll_direction, "scroll_direction"),
107
- not_none(args.scroll_amount, "scroll_amount"),
108
- getattr(args, "coordinate", None),
109
- getattr(args, "text", None),
110
- )
111
- case "wait":
112
- return await computer.wait(not_none(args.duration, "duration"))
113
- case "screenshot":
114
- return await computer.screenshot()
115
-
116
- raise ComputerToolError(f"Invalid action: {action}")
117
-
118
-
119
async def wait_for_file(file_path, check_interval=1):
    """Return once *file_path* exists, polling every *check_interval* seconds.

    Returns immediately, without logging, when the path is already present;
    otherwise logs when the wait starts and how long it took.
    """
    if not os.path.exists(file_path):
        my_logger.info(f"Waiting for {file_path}")
        began = time.time()
        while True:
            if os.path.exists(file_path):
                break
            await asyncio.sleep(check_interval)
        my_logger.info(
            f"Done waiting for {file_path} after {time.time() - began:.1f} seconds"
        )
129
-
130
-
131
- T = TypeVar("T")
132
-
133
-
134
- def not_none(value: T | None, name: str) -> T:
135
- if value is None:
136
- raise ComputerToolError(f"{name} must be provided")
137
- return value
138
-
139
-
140
- if __name__ == "__main__":
141
- main()