lm-deluge 0.0.83__py3-none-any.whl → 0.0.85__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lm_deluge/api_requests/anthropic.py +3 -0
- lm_deluge/api_requests/gemini.py +34 -2
- lm_deluge/api_requests/openai.py +1 -1
- lm_deluge/client.py +86 -0
- lm_deluge/models/google.py +14 -0
- lm_deluge/models/openai.py +28 -0
- lm_deluge/prompt.py +39 -11
- lm_deluge/tool/__init__.py +11 -4
- lm_deluge/tool/builtin/anthropic/__init__.py +300 -0
- lm_deluge/tool/builtin/gemini.py +59 -0
- lm_deluge/tool/builtin/openai.py +74 -0
- lm_deluge/tool/cua/__init__.py +173 -0
- lm_deluge/tool/cua/actions.py +148 -0
- lm_deluge/tool/cua/base.py +27 -0
- lm_deluge/tool/cua/batch.py +215 -0
- lm_deluge/tool/cua/converters.py +466 -0
- lm_deluge/tool/cua/kernel.py +702 -0
- lm_deluge/tool/cua/trycua.py +989 -0
- lm_deluge/tool/prefab/web_search.py +62 -69
- {lm_deluge-0.0.83.dist-info → lm_deluge-0.0.85.dist-info}/METADATA +1 -1
- {lm_deluge-0.0.83.dist-info → lm_deluge-0.0.85.dist-info}/RECORD +28 -21
- lm_deluge/built_in_tools/anthropic/__init__.py +0 -128
- lm_deluge/built_in_tools/openai.py +0 -28
- lm_deluge/llm_tools/__init__.py +0 -25
- /lm_deluge/{built_in_tools → tool/builtin}/anthropic/bash.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/anthropic/computer_use.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/anthropic/editor.py +0 -0
- /lm_deluge/{built_in_tools → tool/builtin}/base.py +0 -0
- {lm_deluge-0.0.83.dist-info → lm_deluge-0.0.85.dist-info}/WHEEL +0 -0
- {lm_deluge-0.0.83.dist-info → lm_deluge-0.0.85.dist-info}/licenses/LICENSE +0 -0
- {lm_deluge-0.0.83.dist-info → lm_deluge-0.0.85.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Gemini built-in tools including computer use.
|
|
3
|
+
|
|
4
|
+
Gemini computer use works differently from OpenAI/Anthropic:
|
|
5
|
+
- Uses a special ComputerUse tool type in the API request
|
|
6
|
+
- Returns actions as regular function_call objects
|
|
7
|
+
- Uses normalized coordinates (0-999) that must be denormalized
|
|
8
|
+
- Function responses include screenshots as FunctionResponsePart
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import Literal
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def computer_use_gemini(
|
|
15
|
+
environment: Literal["browser", "android"] = "browser",
|
|
16
|
+
excluded_functions: list[str] | None = None,
|
|
17
|
+
) -> dict:
|
|
18
|
+
"""
|
|
19
|
+
Create a Gemini computer use tool configuration.
|
|
20
|
+
|
|
21
|
+
This returns a dict that will be specially handled when building
|
|
22
|
+
the Gemini API request.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
environment: The environment type - "browser" or "android"
|
|
26
|
+
excluded_functions: List of predefined function names to exclude.
|
|
27
|
+
Available functions:
|
|
28
|
+
- open_web_browser, wait_5_seconds, go_back, go_forward
|
|
29
|
+
- search, navigate, click_at, hover_at, type_text_at
|
|
30
|
+
- key_combination, scroll_document, scroll_at, drag_and_drop
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
A dict that will be converted to ComputerUse tool config
|
|
34
|
+
"""
|
|
35
|
+
result: dict[str, str | list[str]] = {
|
|
36
|
+
"type": "gemini_computer_use",
|
|
37
|
+
"environment": environment,
|
|
38
|
+
}
|
|
39
|
+
if excluded_functions:
|
|
40
|
+
result["excluded_predefined_functions"] = excluded_functions
|
|
41
|
+
return result
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Constants for Gemini computer use action names
# These are the predefined function names Gemini may emit; each can be
# disabled via computer_use_gemini(excluded_functions=...).
GEMINI_CU_ACTIONS = [
    "open_web_browser",
    "wait_5_seconds",
    "go_back",
    "go_forward",
    "search",
    "navigate",
    "click_at",
    "hover_at",
    "type_text_at",
    "key_combination",
    "scroll_document",
    "scroll_at",
    "drag_and_drop",
]
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
def image_generation_openai():
    """Return the tool config for OpenAI's built-in image generation tool."""
    # TODO: handle result properly
    return dict(type="image_generation")
5
|
+
|
|
6
|
+
def code_interpreter_openai(container: dict | None = None):
|
|
7
|
+
if container is None:
|
|
8
|
+
container = {"type": "auto"}
|
|
9
|
+
return {"type": "code_interpreter", "container": container}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def local_shell_openai():
    """Return the tool config for OpenAI's local shell tool."""
    return dict(type="local_shell")
|
15
|
+
|
|
16
|
+
def web_search_openai(
|
|
17
|
+
preview: bool = False,
|
|
18
|
+
user_location: dict | None = None,
|
|
19
|
+
allowed_domains: list[str] | None = None,
|
|
20
|
+
search_context_size: str | None = None,
|
|
21
|
+
):
|
|
22
|
+
"""OpenAI's built-in web search tool for the Responses API.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
preview: If True, use web_search_preview. If False (default), use
|
|
26
|
+
the GA web_search tool.
|
|
27
|
+
user_location: Optional approximate user location to refine search results.
|
|
28
|
+
Should be a dict with "type": "approximate" and an "approximate" key
|
|
29
|
+
containing any of: country (ISO code), city, region, timezone.
|
|
30
|
+
Note: Not supported for deep research models.
|
|
31
|
+
allowed_domains: Optional list of domains to restrict search results to.
|
|
32
|
+
Up to 100 URLs, without http/https prefix (e.g. "openai.com").
|
|
33
|
+
Only available with web_search (not preview).
|
|
34
|
+
search_context_size: Controls how much context from web search results
|
|
35
|
+
is provided to the model. Options: "low", "medium" (default), "high".
|
|
36
|
+
Higher values use more tokens but may improve response quality.
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
A dict representing the web search tool configuration.
|
|
40
|
+
"""
|
|
41
|
+
tool: dict = {}
|
|
42
|
+
if preview:
|
|
43
|
+
tool["type"] = "web_search_preview"
|
|
44
|
+
if user_location:
|
|
45
|
+
tool["user_location"] = user_location
|
|
46
|
+
if search_context_size:
|
|
47
|
+
tool["search_context_size"] = search_context_size
|
|
48
|
+
return tool
|
|
49
|
+
|
|
50
|
+
# GA web_search tool
|
|
51
|
+
tool["type"] = "web_search"
|
|
52
|
+
|
|
53
|
+
if user_location:
|
|
54
|
+
tool["user_location"] = user_location
|
|
55
|
+
|
|
56
|
+
if search_context_size:
|
|
57
|
+
tool["search_context_size"] = search_context_size
|
|
58
|
+
|
|
59
|
+
# Domain filtering uses a nested filters structure
|
|
60
|
+
if allowed_domains:
|
|
61
|
+
tool["filters"] = {"allowed_domains": allowed_domains}
|
|
62
|
+
|
|
63
|
+
return tool
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def computer_use_openai(
    display_width: int = 1024, display_height: int = 768, environment: str = "browser"
):
    """Return the tool config for OpenAI's computer use (preview) tool.

    Args:
        display_width: Virtual display width in pixels.
        display_height: Virtual display height in pixels.
        environment: The environment the model controls (e.g. "browser").
    """
    return dict(
        type="computer_use_preview",
        display_width=display_width,
        display_height=display_height,
        environment=environment,
    )
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Computer Use Actions (CUA) module.
|
|
3
|
+
|
|
4
|
+
This module provides a provider-agnostic abstraction for computer use actions
|
|
5
|
+
and executors that can run them on various backends.
|
|
6
|
+
|
|
7
|
+
Key components:
|
|
8
|
+
- CUAction: Union type of all possible computer use actions
|
|
9
|
+
- ComputerExecutor: Abstract base class for action executors
|
|
10
|
+
- KernelExecutor: Execute actions on Kernel's browser-as-a-service
|
|
11
|
+
- TryCUAExecutor: Execute actions on TryCUA's computer-server (desktop control)
|
|
12
|
+
|
|
13
|
+
Usage with Kernel (browser):
|
|
14
|
+
from lm_deluge.tool.cua import (
|
|
15
|
+
KernelBrowser,
|
|
16
|
+
KernelExecutor,
|
|
17
|
+
anthropic_tool_call_to_action,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Create a browser and executor
|
|
21
|
+
with KernelBrowser() as browser:
|
|
22
|
+
executor = KernelExecutor(browser.session_id)
|
|
23
|
+
|
|
24
|
+
# Convert Anthropic tool call to action
|
|
25
|
+
action = anthropic_tool_call_to_action(tool_call.arguments)
|
|
26
|
+
|
|
27
|
+
# Execute and get result
|
|
28
|
+
result = executor.execute(action)
|
|
29
|
+
|
|
30
|
+
Usage with TryCUA (desktop):
|
|
31
|
+
from lm_deluge.tool.cua import (
|
|
32
|
+
TryCUAConnection,
|
|
33
|
+
TryCUAExecutor,
|
|
34
|
+
Screenshot,
|
|
35
|
+
Click,
|
|
36
|
+
Type,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Connect to a TryCUA computer-server
|
|
40
|
+
with TryCUAConnection("ws://localhost:8000/ws") as conn:
|
|
41
|
+
executor = TryCUAExecutor(conn)
|
|
42
|
+
|
|
43
|
+
# Execute actions
|
|
44
|
+
result = executor.execute(Screenshot(kind="screenshot"))
|
|
45
|
+
executor.execute(Click(kind="click", x=100, y=200, button="left"))
|
|
46
|
+
executor.execute(Type(kind="type", text="Hello!"))
|
|
47
|
+
|
|
48
|
+
# Async version
|
|
49
|
+
async with AsyncTryCUAConnection("ws://localhost:8000/ws") as conn:
|
|
50
|
+
executor = AsyncTryCUAExecutor(conn)
|
|
51
|
+
result = await executor.execute(Screenshot(kind="screenshot"))
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
from .actions import (
|
|
55
|
+
Bash,
|
|
56
|
+
Click,
|
|
57
|
+
CUAction,
|
|
58
|
+
CursorPos,
|
|
59
|
+
DoubleClick,
|
|
60
|
+
Drag,
|
|
61
|
+
Edit,
|
|
62
|
+
GoBack,
|
|
63
|
+
GoForward,
|
|
64
|
+
HoldKey,
|
|
65
|
+
Keypress,
|
|
66
|
+
MouseDown,
|
|
67
|
+
MouseUp,
|
|
68
|
+
Move,
|
|
69
|
+
Navigate,
|
|
70
|
+
Scroll,
|
|
71
|
+
Screenshot,
|
|
72
|
+
Search,
|
|
73
|
+
TripleClick,
|
|
74
|
+
Type,
|
|
75
|
+
Wait,
|
|
76
|
+
)
|
|
77
|
+
from .base import ComputerExecutor, CUActionResult
|
|
78
|
+
from .base import Screenshot as ScreenshotResult
|
|
79
|
+
from .converters import (
|
|
80
|
+
anthropic_tool_call_to_action,
|
|
81
|
+
openai_computer_call_to_action,
|
|
82
|
+
gemini_function_call_to_action,
|
|
83
|
+
)
|
|
84
|
+
from .batch import create_computer_batch_tool
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# Lazy imports for optional dependencies
|
|
88
|
+
def __getattr__(name: str):
    """Lazily resolve executor classes whose modules have optional dependencies.

    The kernel/trycua submodules are only imported the first time one of
    their exported names is accessed on this package.
    """
    kernel_exports = (
        "KernelBrowser",
        "KernelExecutor",
        "AsyncKernelBrowser",
        "AsyncKernelExecutor",
    )
    trycua_exports = (
        "TryCUAConnection",
        "TryCUAExecutor",
        "AsyncTryCUAConnection",
        "AsyncTryCUAExecutor",
    )

    if name in kernel_exports:
        from . import kernel

        return getattr(kernel, name)

    if name in trycua_exports:
        from . import trycua

        return getattr(trycua, name)

    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
+
|
|
129
|
+
|
|
130
|
+
# Explicit public API; lazily-resolved names are flagged for pyright since
# they are not statically importable from this module.
__all__ = [
    # Actions
    "CUAction",
    "Click",
    "DoubleClick",
    "TripleClick",
    "Move",
    "Drag",
    "Scroll",
    "Keypress",
    "Type",
    "Wait",
    "Screenshot",
    "MouseDown",
    "MouseUp",
    "CursorPos",
    "HoldKey",
    "Navigate",
    "GoBack",
    "GoForward",
    "Search",
    "Bash",
    "Edit",
    # Base classes
    "ComputerExecutor",
    "CUActionResult",
    "ScreenshotResult",
    # Converters
    "anthropic_tool_call_to_action",
    "openai_computer_call_to_action",
    "gemini_function_call_to_action",
    # Batch tool
    "create_computer_batch_tool",
    # Kernel executor (lazy loaded)
    "KernelBrowser",  # pyright: ignore[reportUnsupportedDunderAll]
    "KernelExecutor",  # pyright: ignore[reportUnsupportedDunderAll]
    "AsyncKernelBrowser",  # pyright: ignore[reportUnsupportedDunderAll]
    "AsyncKernelExecutor",  # pyright: ignore[reportUnsupportedDunderAll]
    # TryCUA executor (lazy loaded)
    "TryCUAConnection",  # pyright: ignore[reportUnsupportedDunderAll]
    "TryCUAExecutor",  # pyright: ignore[reportUnsupportedDunderAll]
    "AsyncTryCUAConnection",  # pyright: ignore[reportUnsupportedDunderAll]
    "AsyncTryCUAExecutor",  # pyright: ignore[reportUnsupportedDunderAll]
]
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
from typing import Any, List, Literal, TypedDict, Union
|
|
2
|
+
|
|
3
|
+
# (x, y) pixel coordinate pair used by pointer-style actions.
Coord = tuple[int, int]


class CUActionBase(TypedDict):
    """Base shape shared by all computer-use actions."""

    kind: str | Any  # discriminator
8
|
+
|
|
9
|
+
|
|
10
|
+
class Click(CUActionBase):
    """Click a mouse button at (x, y)."""

    kind: Literal["click"]
    x: int | None  # if missing, current cursor position
    y: int | None
    button: Literal["left", "right", "middle", "back", "forward"]


class DoubleClick(CUActionBase):
    """Double-click at (x, y)."""

    kind: Literal["double_click"]
    x: int | None  # if missing, current cursor position
    y: int | None


class Move(CUActionBase):
    """Move the cursor to (x, y)."""

    kind: Literal["move"]
    x: int
    y: int


class Drag(CUActionBase):
    """Mouse-down at the start point, then drag along ``path``."""

    kind: Literal["drag"]
    start_x: int | None  # if missing, current cursor position
    start_y: int | None  # if missing, current cursor position
    path: List[Coord]  # path to drag after mousedown


class Scroll(CUActionBase):
    """Scroll by (dx, dy), optionally at position (x, y)."""

    kind: Literal["scroll"]
    x: int | None  # if not provided, current cursor position
    y: int | None  # if not provided, current cursor position
    dx: int  # scroll_x in OpenAI
    dy: int  # scroll_y in OpenAI


class Keypress(CUActionBase):
    """Press the given key(s)/combination."""

    kind: Literal["keypress"]
    keys: List[str]


class Type(CUActionBase):
    """Type literal text."""

    kind: Literal["type"]
    text: str


class Wait(CUActionBase):
    """Pause for ``ms`` milliseconds."""

    kind: Literal["wait"]
    ms: int


class Screenshot(CUActionBase):
    """Capture the current screen."""

    kind: Literal["screenshot"]


class MouseDown(CUActionBase):
    """Press (and hold) a mouse button."""

    kind: Literal["mouse_down"]
    button: Literal["left", "right", "middle", "back", "forward"]


class MouseUp(CUActionBase):
    """Release a previously pressed mouse button."""

    kind: Literal["mouse_up"]
    button: Literal["left", "right", "middle", "back", "forward"]


class CursorPos(CUActionBase):
    """Query the current cursor position."""

    kind: Literal["cursor_position"]


class HoldKey(CUActionBase):
    """Hold ``key`` down for ``ms`` milliseconds."""

    kind: Literal["hold_key"]
    key: str
    ms: int  # duration


class TripleClick(CUActionBase):
    """Triple-click at (x, y)."""

    kind: Literal["triple_click"]
    x: int | None  # if missing, current cursor position
    y: int | None
88
|
+
|
|
89
|
+
# ── Browser‑level actions ────────────────────────────────────────────
class Navigate(CUActionBase):
    """Load ``url`` in the browser."""

    kind: Literal["navigate"]
    url: str


class GoBack(CUActionBase):
    """Browser history: go back one page."""

    kind: Literal["go_back"]


class GoForward(CUActionBase):
    """Browser history: go forward one page."""

    kind: Literal["go_forward"]


class Search(CUActionBase):
    """Run a web search for ``query``."""

    kind: Literal["search"]
    query: str
|
107
|
+
|
|
108
|
+
# ── Bash / Editor (provider‑independent) ────────────────────────────
class Bash(CUActionBase):
    """Run a shell command, or restart the shell session."""

    kind: Literal["bash"]
    command: str | None
    restart: bool | None


class Edit(CUActionBase):
    """File-editor action; commands mirror Anthropic's text editor tool."""

    kind: Literal["edit"]
    command: Literal["view", "create", "str_replace", "insert", "undo_edit"]
    path: str
    # optional, keep names identical to Anthropic spec
    file_text: str | None
    view_range: List[int] | None
    old_str: str | None
    new_str: str | None
    insert_line: int | None
+
|
|
126
|
+
|
|
127
|
+
# Discriminated union of every supported computer-use action; consumers
# dispatch on the ``kind`` field.
CUAction = Union[
    Click,
    DoubleClick,
    TripleClick,
    MouseDown,
    MouseUp,
    Drag,
    Move,
    Scroll,
    Keypress,
    Type,
    HoldKey,
    Wait,
    Screenshot,
    CursorPos,
    Navigate,
    GoBack,
    GoForward,
    Search,
    Bash,
    Edit,
]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import TypedDict
|
|
3
|
+
|
|
4
|
+
from .actions import CUAction
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Screenshot(TypedDict):
    """An image captured from the target machine."""

    media_type: str  # MIME type of ``content`` (set by the executor)
    content: bytes  # raw image bytes


class CUActionResult(TypedDict):
    """What an executor returns after running a single action."""

    screenshot: Screenshot | None  # present when the action produced an image
    data: dict  # for structured metadata
|
16
|
+
|
|
17
|
+
class ComputerExecutor(abc.ABC):
    """
    A computer executor is any class that can take an action (from actions.py)
    and "execute" it. This allows us to plug any API provider (OpenAI, Anthropic)
    into any computer-use backend (BrowserBase, Kernel, Modal sandbox) by:
    - Mapping each provider's tools to some (sub)set of CUActions
    - Defining how to run each CUAction on that backend
    """

    # The class already inherits abc.ABC, so mark execute() abstract to
    # actually enforce that subclasses implement it (instantiating a
    # non-implementing subclass now fails early with TypeError instead of
    # failing later with NotImplementedError at call time).
    @abc.abstractmethod
    def execute(self, action: "CUAction") -> "CUActionResult":
        """Run ``action`` on this backend and return its result.

        Returns:
            A CUActionResult with an optional screenshot plus structured
            metadata about the executed action.
        """
        raise NotImplementedError("Subclasses must implement execute method")
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Batch tool for computer use actions.
|
|
3
|
+
|
|
4
|
+
Allows Claude to submit multiple computer actions in a single tool call,
|
|
5
|
+
executing them sequentially and returning only one screenshot at the end.
|
|
6
|
+
This dramatically reduces roundtrips for common action sequences like:
|
|
7
|
+
- Ctrl+L → type URL → Return → wait → screenshot
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import base64
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from .. import Tool
|
|
16
|
+
from .converters import anthropic_tool_call_to_action
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Define the action schema matching Anthropic's computer tool
# (JSON Schema for a single batched action; used as the "items" schema of
# the batch tool's "actions" array parameter).
ACTION_SCHEMA = {
    "type": "object",
    "properties": {
        "action": {
            "type": "string",
            "enum": [
                "screenshot",
                "key",
                "type",
                "mouse_move",
                "left_click",
                "left_click_drag",
                "right_click",
                "middle_click",
                "double_click",
                "triple_click",
                "scroll",
                "wait",
                "cursor_position",
            ],
            "description": "The action to perform",
        },
        "text": {
            "type": "string",
            "description": "For 'key' action: key combo like 'Return', 'ctrl+l'. For 'type' action: text to type.",
        },
        "coordinate": {
            "type": "array",
            "items": {"type": "integer"},
            "minItems": 2,
            "maxItems": 2,
            "description": "For click/move actions: [x, y] coordinates",
        },
        "scroll_direction": {
            "type": "string",
            "enum": ["up", "down", "left", "right"],
            "description": "For scroll action: direction to scroll",
        },
        "scroll_amount": {
            "type": "integer",
            "description": "For scroll action: number of scroll clicks",
        },
        "duration": {
            "type": "number",
            "description": "For wait action: seconds to wait",
        },
    },
    "required": ["action"],
}
69
|
+
|
|
70
|
+
|
|
71
|
+
def create_computer_batch_tool(
    executor,  # AsyncKernelExecutor or similar
    *,
    tool_name: str = "computer_batch",
    include_final_screenshot: bool = True,
) -> Tool:
    """
    Create a batch tool for computer use actions.

    This tool allows Claude to submit multiple actions in one call:
    - Actions execute sequentially
    - Only one screenshot is returned at the end (if requested)
    - Dramatically reduces API roundtrips

    Args:
        executor: The computer executor (e.g., AsyncKernelExecutor)
        tool_name: Name for the batch tool
        include_final_screenshot: Whether to always include a screenshot at the end

    Returns:
        A Tool that can be passed to the LLM

    Example:
        executor = AsyncKernelExecutor(session_id)
        batch_tool = create_computer_batch_tool(executor)

        # Claude can now call:
        # computer_batch(actions=[
        #     {"action": "key", "text": "ctrl+l"},
        #     {"action": "type", "text": "https://example.com"},
        #     {"action": "key", "text": "Return"},
        #     {"action": "wait", "duration": 2},
        #     {"action": "screenshot"}
        # ])
    """

    async def run_batch(actions: list[dict[str, Any]]) -> str | list:
        """Execute a batch of computer actions and return results."""
        from ...image import Image
        from ...prompt import Text

        results = []
        final_screenshot = None

        for action_args in actions:  # index was unused, so no enumerate
            action_name = action_args.get("action", "unknown")

            try:
                # Convert Anthropic format to CUAction
                cu_action = anthropic_tool_call_to_action(action_args)

                # Execute the action
                result = await executor.execute(cu_action)

                # Track if this was a screenshot; the latest one wins
                if result.get("screenshot"):
                    final_screenshot = result["screenshot"]
                    results.append(
                        {
                            "action": action_name,
                            "status": "ok",
                            "has_screenshot": True,
                        }
                    )
                else:
                    results.append(
                        {
                            "action": action_name,
                            "status": "ok",
                        }
                    )

            except Exception as e:
                results.append(
                    {
                        "action": action_name,
                        "status": "error",
                        "error": str(e),
                    }
                )
                # Stop on error
                break

        # If we should include a final screenshot and don't have one yet, take one
        if include_final_screenshot and final_screenshot is None:
            try:
                from .actions import Screenshot

                result = await executor.execute(Screenshot(kind="screenshot"))
                if result.get("screenshot"):
                    final_screenshot = result["screenshot"]
            except Exception:
                # Best-effort: a missing trailing screenshot is not fatal.
                pass

        # Build the response
        summary = f"Executed {len(results)} actions. "
        errors = [r for r in results if r.get("status") == "error"]
        if errors:
            summary += f"{len(errors)} failed: {errors[0].get('error', 'unknown')}"
        else:
            summary += "All succeeded."

        if final_screenshot:
            # Return Text + Image (proper ToolResultPart types)
            screenshot_bytes = final_screenshot["content"]
            b64 = base64.b64encode(screenshot_bytes).decode()
            # Use the screenshot's declared media type rather than assuming
            # PNG; executors populate Screenshot["media_type"].
            media_type = final_screenshot.get("media_type") or "image/png"
            img = Image(data=f"data:{media_type};base64,{b64}")
            return [Text(summary), img]
        else:
            # Just return text summary
            return summary

    description = """Execute multiple computer actions in a single call.
This is much faster than calling actions one at a time.
Actions run sequentially. A screenshot is taken at the end.

Common patterns:
- Navigate to URL: [{"action":"key","text":"ctrl+l"}, {"action":"type","text":"https://..."}, {"action":"key","text":"Return"}, {"action":"wait","duration":2}]
- Click and type: [{"action":"left_click","coordinate":[x,y]}, {"action":"type","text":"..."}]
- Scroll and screenshot: [{"action":"scroll","coordinate":[x,y],"scroll_direction":"down","scroll_amount":3}]

Available actions:
- screenshot: Capture the screen
- key: Press key combo (text="Return", "ctrl+l", "ctrl+a", etc.)
- type: Type text (text="hello world")
- left_click, right_click, middle_click, double_click, triple_click: Click at coordinate=[x,y]
- mouse_move: Move cursor to coordinate=[x,y]
- scroll: Scroll at coordinate=[x,y] with scroll_direction and scroll_amount
- wait: Pause for duration seconds
"""

    return Tool(
        name=tool_name,
        description=description,
        parameters={
            "actions": {
                "type": "array",
                "description": "List of actions to execute in order",
                "items": ACTION_SCHEMA,
                "minItems": 1,
            }
        },
        required=["actions"],
        run=run_batch,
    )
|