hud-python 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +13 -10
- hud/adapters/claude/adapter.py +30 -18
- hud/adapters/common/adapter.py +0 -1
- hud/adapters/common/types.py +129 -4
- hud/adapters/operator/adapter.py +23 -13
- hud/agent/base.py +5 -4
- hud/agent/claude.py +65 -13
- hud/agent/claude_plays_pokemon.py +2 -2
- hud/agent/langchain.py +8 -2
- hud/agent/operator.py +36 -11
- hud/agent/tests/test_base.py +2 -2
- hud/env/docker_client.py +26 -3
- hud/env/environment.py +86 -40
- hud/env/local_docker_client.py +50 -4
- hud/env/remote_client.py +22 -4
- hud/env/remote_docker_client.py +6 -2
- hud/gym.py +15 -4
- hud/job.py +91 -26
- hud/settings.py +6 -0
- hud/task.py +84 -6
- hud/taskset.py +63 -8
- hud/telemetry/exporter.py +4 -6
- hud/trajectory.py +3 -0
- hud/types.py +28 -2
- hud/utils/agent.py +37 -0
- hud/utils/common.py +142 -26
- hud/utils/config.py +11 -0
- hud/utils/tests/test_common.py +225 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/METADATA +9 -6
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/RECORD +34 -33
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/WHEEL +0 -0
- {hud_python-0.2.6.dist-info → hud_python-0.2.8.dist-info}/licenses/LICENSE +0 -0
hud/__init__.py
CHANGED
|
@@ -4,8 +4,6 @@ HUD SDK for interacting with the HUD evaluation platform.
|
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
-
import logging
|
|
8
|
-
|
|
9
7
|
from . import agent, env, gym, settings, task, taskset, types, utils
|
|
10
8
|
from .adapters import ResponseAction as Response
|
|
11
9
|
from .job import create_job, load_job, run_job
|
|
@@ -15,16 +13,21 @@ from .taskset import load_taskset
|
|
|
15
13
|
from .telemetry import flush, init_telemetry, trace
|
|
16
14
|
from .version import __version__
|
|
17
15
|
|
|
18
|
-
|
|
16
|
+
if settings.settings.telemetry_enabled:
|
|
17
|
+
init_telemetry()
|
|
18
|
+
|
|
19
|
+
if settings.settings.fancy_logging:
|
|
20
|
+
import logging
|
|
19
21
|
|
|
20
|
-
hud_logger = logging.getLogger("hud")
|
|
21
|
-
|
|
22
|
+
hud_logger = logging.getLogger("hud")
|
|
23
|
+
# TODO: Make this configurable
|
|
24
|
+
hud_logger.setLevel(logging.INFO)
|
|
22
25
|
|
|
23
|
-
if not hud_logger.handlers:
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
26
|
+
if not hud_logger.handlers:
|
|
27
|
+
handler = logging.StreamHandler()
|
|
28
|
+
formatter = logging.Formatter("[%(levelname)s] %(asctime)s | %(name)s | %(message)s")
|
|
29
|
+
handler.setFormatter(formatter)
|
|
30
|
+
hud_logger.addHandler(handler)
|
|
28
31
|
|
|
29
32
|
__all__ = [
|
|
30
33
|
"Response",
|
hud/adapters/claude/adapter.py
CHANGED
|
@@ -29,12 +29,16 @@ class ClaudeAdapter(Adapter):
|
|
|
29
29
|
"super_r": "win",
|
|
30
30
|
"right shift": "shift",
|
|
31
31
|
"left shift": "shift",
|
|
32
|
+
"down shift": "shift",
|
|
33
|
+
"windows": "win",
|
|
34
|
+
"page_down": "pagedown",
|
|
35
|
+
"page_up": "pageup",
|
|
32
36
|
}
|
|
33
37
|
|
|
34
|
-
def __init__(self) -> None:
|
|
38
|
+
def __init__(self, width: int = 1024, height: int = 768) -> None:
    """Set up the adapter with the screen size the agent works in.

    Args:
        width: Agent-side screen width (defaults to 1024).
        height: Agent-side screen height (defaults to 768).
    """
    super().__init__()
    # Stored after the base initialiser so these values override its defaults.
    self.agent_width, self.agent_height = width, height
|
|
38
42
|
|
|
39
43
|
def _map_key(self, key: str) -> CLAKey:
|
|
40
44
|
"""Map a key to its standardized form."""
|
|
@@ -53,12 +57,13 @@ class ClaudeAdapter(Adapter):
|
|
|
53
57
|
if "+" in data["text"]:
|
|
54
58
|
keys: list[CLAKey] = [self._map_key(k) for k in (data["text"].split("+"))]
|
|
55
59
|
assert len(keys) > 0
|
|
56
|
-
|
|
57
|
-
|
|
60
|
+
converted_action = PressAction(keys=keys)
|
|
61
|
+
else:
|
|
62
|
+
converted_action = PressAction(keys=[self._map_key(data["text"])])
|
|
58
63
|
|
|
59
64
|
elif action_type == "type":
|
|
60
65
|
assert "text" in data
|
|
61
|
-
|
|
66
|
+
converted_action = TypeAction(
|
|
62
67
|
text=data["text"],
|
|
63
68
|
enter_after=False,
|
|
64
69
|
)
|
|
@@ -69,14 +74,14 @@ class ClaudeAdapter(Adapter):
|
|
|
69
74
|
coord = data["coordinate"]
|
|
70
75
|
assert isinstance(coord, list)
|
|
71
76
|
assert len(coord) == 2
|
|
72
|
-
|
|
77
|
+
converted_action = MoveAction(point=Point(x=coord[0], y=coord[1]))
|
|
73
78
|
|
|
74
79
|
elif action_type == "left_click":
|
|
75
80
|
assert "coordinate" in data
|
|
76
81
|
coord = data["coordinate"]
|
|
77
82
|
assert isinstance(coord, list)
|
|
78
83
|
assert len(coord) == 2
|
|
79
|
-
|
|
84
|
+
converted_action = ClickAction(point=Point(x=coord[0], y=coord[1]), button="left")
|
|
80
85
|
|
|
81
86
|
elif action_type == "left_click_drag":
|
|
82
87
|
assert "coordinate" in data
|
|
@@ -93,28 +98,30 @@ class ClaudeAdapter(Adapter):
|
|
|
93
98
|
):
|
|
94
99
|
raise ValueError("Left click drag must be preceded by a move or click action")
|
|
95
100
|
else:
|
|
96
|
-
|
|
101
|
+
converted_action = DragAction(
|
|
102
|
+
path=[self.memory[-1].point, Point(x=coord[0], y=coord[1])]
|
|
103
|
+
)
|
|
97
104
|
|
|
98
105
|
elif action_type == "right_click":
|
|
99
106
|
assert "coordinate" in data
|
|
100
107
|
coord = data["coordinate"]
|
|
101
108
|
assert isinstance(coord, list)
|
|
102
109
|
assert len(coord) == 2
|
|
103
|
-
|
|
110
|
+
converted_action = ClickAction(point=Point(x=coord[0], y=coord[1]), button="right")
|
|
104
111
|
|
|
105
112
|
elif action_type == "middle_click":
|
|
106
113
|
assert "coordinate" in data
|
|
107
114
|
coord = data["coordinate"]
|
|
108
115
|
assert isinstance(coord, list)
|
|
109
116
|
assert len(coord) == 2
|
|
110
|
-
|
|
117
|
+
converted_action = ClickAction(point=Point(x=coord[0], y=coord[1]), button="middle")
|
|
111
118
|
|
|
112
119
|
elif action_type == "double_click":
|
|
113
120
|
assert "coordinate" in data
|
|
114
121
|
coord = data["coordinate"]
|
|
115
122
|
assert isinstance(coord, list)
|
|
116
123
|
assert len(coord) == 2
|
|
117
|
-
|
|
124
|
+
converted_action = ClickAction(
|
|
118
125
|
point=Point(x=coord[0], y=coord[1]), button="left", pattern=[100]
|
|
119
126
|
)
|
|
120
127
|
|
|
@@ -123,7 +130,7 @@ class ClaudeAdapter(Adapter):
|
|
|
123
130
|
coord = data["coordinate"]
|
|
124
131
|
assert isinstance(coord, list)
|
|
125
132
|
assert len(coord) == 2
|
|
126
|
-
|
|
133
|
+
converted_action = ClickAction(
|
|
127
134
|
point=Point(x=coord[0], y=coord[1]),
|
|
128
135
|
button="left",
|
|
129
136
|
pattern=[100, 100],
|
|
@@ -144,25 +151,30 @@ class ClaudeAdapter(Adapter):
|
|
|
144
151
|
else:
|
|
145
152
|
raise ValueError(f"Unsupported scroll direction: {direction}")
|
|
146
153
|
|
|
147
|
-
|
|
154
|
+
converted_action = ScrollAction(
|
|
148
155
|
point=Point(x=data["coordinate"][0], y=data["coordinate"][1]),
|
|
149
156
|
scroll=scroll,
|
|
150
157
|
)
|
|
151
158
|
|
|
152
159
|
elif action_type == "screenshot":
|
|
153
|
-
|
|
160
|
+
converted_action = ScreenshotFetch()
|
|
154
161
|
|
|
155
162
|
elif action_type == "cursor_position":
|
|
156
|
-
|
|
163
|
+
converted_action = PositionFetch()
|
|
157
164
|
|
|
158
165
|
elif action_type == "wait":
|
|
159
166
|
assert "duration" in data
|
|
160
|
-
|
|
167
|
+
converted_action = WaitAction(time=data["duration"])
|
|
161
168
|
|
|
162
169
|
elif action_type == "response":
|
|
163
|
-
|
|
170
|
+
converted_action = ResponseAction(text=data.get("text", ""))
|
|
164
171
|
|
|
165
172
|
else:
|
|
166
173
|
raise ValueError(f"Unsupported action type: {action_type}")
|
|
174
|
+
|
|
175
|
+
converted_action.reasoning = data.get("reasoning", None)
|
|
176
|
+
converted_action.logs = data.get("logs", None)
|
|
177
|
+
|
|
178
|
+
return converted_action
|
|
167
179
|
except AssertionError:
|
|
168
180
|
raise ValueError(f"Invalid action: {data}") from None
|
hud/adapters/common/adapter.py
CHANGED
hud/adapters/common/types.py
CHANGED
|
@@ -1,13 +1,40 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Annotated, Literal, TypeAlias
|
|
3
|
+
from typing import Annotated, Any, Literal, TypeAlias
|
|
4
4
|
|
|
5
5
|
from pydantic import BaseModel, Field
|
|
6
6
|
|
|
7
|
+
LogType = str | dict[str, Any] | list[str | dict[str, Any]] | None
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Helper function to format logs for display
|
|
11
|
+
def _format_logs_for_display(
|
|
12
|
+
logs: LogType | None = None,
|
|
13
|
+
reasoning: str | None = None,
|
|
14
|
+
max_log_len: int = 277,
|
|
15
|
+
) -> str:
|
|
16
|
+
log_repr = repr(logs)
|
|
17
|
+
truncated_log = log_repr[:max_log_len] + "..." if len(log_repr) > max_log_len else log_repr
|
|
18
|
+
return f" │ Reasoning: {reasoning} │ Logs: {truncated_log}"
|
|
19
|
+
|
|
7
20
|
|
|
8
21
|
# Base class for all actions
|
|
9
22
|
class CLAAction(BaseModel):
    """Base model for all actions; carries optional reasoning and logs."""

    type: str
    reasoning: str | None = None
    logs: LogType | None = None

    def __str__(self) -> str:
        """Return a generic "<Type>Action (k=v, ...)" string plus the reasoning/logs suffix."""
        # Fallback rendering for actions without their own __str__.
        # The field list leaves out type, reasoning and logs; reasoning and
        # logs are appended separately through _format_logs_for_display.
        hidden = ("type", "logs", "reasoning")
        rendered = []
        for k, v in self.model_dump().items():
            if k in hidden or v is None:
                continue
            rendered.append(f"{k}='{v}'" if isinstance(v, str) else f"{k}={v}")
        attributes = ", ".join(rendered)
        suffix = _format_logs_for_display(self.logs, self.reasoning)
        return f"{self.type.capitalize()}Action ({attributes})" + suffix
|
|
11
38
|
|
|
12
39
|
|
|
13
40
|
# Basic Point model for coordinates
|
|
@@ -16,32 +43,59 @@ class Point(BaseModel):
|
|
|
16
43
|
y: int
|
|
17
44
|
|
|
18
45
|
|
|
19
|
-
# CLICK ACTION
|
|
46
|
+
# CLICK ACTION
|
|
20
47
|
class ClickAction(CLAAction):
    """Mouse click with an optional target point, button, pattern and held keys."""

    type: Literal["click"] = "click"
    point: Point | None = None
    button: CLAButton = "left"
    # NOTE(review): presumably delays between repeated clicks — confirm units.
    pattern: list[int] | None = None
    hold_keys: list[CLAKey] | None = None

    def __str__(self) -> str:
        """Describe the click; the button is mentioned only when it is not "left"."""
        text = "💥 Click"
        if self.point:
            text += f" at ({self.point.x}, {self.point.y})"
        if self.button != "left":
            text += f" with {self.button} button"
        if self.hold_keys:
            text += f" holding {self.hold_keys}"
        return text + _format_logs_for_display(self.logs, self.reasoning)
|
|
65
|
+
|
|
27
66
|
|
|
28
67
|
# PRESS ACTION for key presses/hotkeys
|
|
29
68
|
class PressAction(CLAAction):
    """Key press / hotkey action over a list of keys."""

    type: Literal["press"] = "press"
    keys: list[CLAKey]

    def __str__(self) -> str:
        """Render the keys joined with "+" followed by the reasoning/logs suffix."""
        combo = "+".join(self.keys)
        suffix = _format_logs_for_display(self.logs, self.reasoning)
        return f"🎹 Press keys: {combo}" + suffix
|
|
76
|
+
|
|
33
77
|
|
|
34
78
|
# KEYDOWN ACTION for key presses/hotkeys
|
|
35
79
|
class KeyDownAction(CLAAction):
    """Key-down action over a list of keys."""

    type: Literal["keydown"] = "keydown"
    keys: list[CLAKey]

    def __str__(self) -> str:
        """Render the keys joined with "+" followed by the reasoning/logs suffix."""
        combo = "+".join(self.keys)
        suffix = _format_logs_for_display(self.logs, self.reasoning)
        return f"👇 KeyDown: {combo}" + suffix
|
|
87
|
+
|
|
39
88
|
|
|
40
89
|
# KEYUP ACTION for key presses/hotkeys
|
|
41
90
|
class KeyUpAction(CLAAction):
    """Key-up action over a list of keys."""

    type: Literal["keyup"] = "keyup"
    keys: list[CLAKey]

    def __str__(self) -> str:
        """Render the keys joined with "+" followed by the reasoning/logs suffix."""
        combo = "+".join(self.keys)
        suffix = _format_logs_for_display(self.logs, self.reasoning)
        return f"👆 KeyUp: {combo}" + suffix
|
|
98
|
+
|
|
45
99
|
|
|
46
100
|
# TYPE ACTION for text typing
|
|
47
101
|
class TypeAction(CLAAction):
|
|
@@ -49,6 +103,13 @@ class TypeAction(CLAAction):
|
|
|
49
103
|
text: str
|
|
50
104
|
enter_after: bool | None = False
|
|
51
105
|
|
|
106
|
+
def __str__(self) -> str:
|
|
107
|
+
action_str = f'✍️ Type: "{self.text}"'
|
|
108
|
+
if self.enter_after:
|
|
109
|
+
action_str += " (and press Enter)"
|
|
110
|
+
action_str += _format_logs_for_display(self.logs, self.reasoning)
|
|
111
|
+
return action_str
|
|
112
|
+
|
|
52
113
|
|
|
53
114
|
# SCROLL ACTION
|
|
54
115
|
class ScrollAction(CLAAction):
|
|
@@ -57,6 +118,18 @@ class ScrollAction(CLAAction):
|
|
|
57
118
|
scroll: Point | None = None
|
|
58
119
|
hold_keys: list[CLAKey] | None = None
|
|
59
120
|
|
|
121
|
+
def __str__(self) -> str:
|
|
122
|
+
parts = ["📄 Scroll"]
|
|
123
|
+
if self.point:
|
|
124
|
+
parts.append(f"at ({self.point.x}, {self.point.y})")
|
|
125
|
+
if self.scroll:
|
|
126
|
+
parts.append(f"by ({self.scroll.x}, {self.scroll.y})")
|
|
127
|
+
if self.hold_keys: # Added hold_keys for scroll
|
|
128
|
+
parts.append(f"holding {self.hold_keys}")
|
|
129
|
+
action_str = " ".join(parts)
|
|
130
|
+
action_str += _format_logs_for_display(self.logs, self.reasoning)
|
|
131
|
+
return action_str
|
|
132
|
+
|
|
60
133
|
|
|
61
134
|
# MOVE ACTION for mouse movement
|
|
62
135
|
class MoveAction(CLAAction):
|
|
@@ -64,11 +137,26 @@ class MoveAction(CLAAction):
|
|
|
64
137
|
point: Point | None = None
|
|
65
138
|
offset: Point | None = None
|
|
66
139
|
|
|
140
|
+
def __str__(self) -> str:
|
|
141
|
+
parts = ["✨ Move"]
|
|
142
|
+
if self.point:
|
|
143
|
+
parts.append(f"to ({self.point.x},{self.point.y})")
|
|
144
|
+
if self.offset:
|
|
145
|
+
parts.append(f"by ({self.offset.x},{self.offset.y})")
|
|
146
|
+
action_str = " ".join(parts)
|
|
147
|
+
action_str += _format_logs_for_display(self.logs, self.reasoning)
|
|
148
|
+
return action_str
|
|
149
|
+
|
|
67
150
|
|
|
68
151
|
# WAIT ACTION
|
|
69
152
|
class WaitAction(CLAAction):
    """Pause for a given amount of time."""

    type: Literal["wait"] = "wait"
    # NOTE(review): displayed with an "ms" unit below — confirm every adapter
    # passes milliseconds here.
    time: int

    def __str__(self) -> str:
        """Render the wait duration followed by the reasoning/logs suffix."""
        suffix = _format_logs_for_display(self.logs, self.reasoning)
        return f"💤 Wait for {self.time}ms" + suffix
|
|
72
160
|
|
|
73
161
|
|
|
74
162
|
# DRAG ACTION
|
|
@@ -78,26 +166,63 @@ class DragAction(CLAAction):
|
|
|
78
166
|
pattern: list[int] | None = None # [delay_1, delay_2, ...]
|
|
79
167
|
hold_keys: list[CLAKey] | None = None
|
|
80
168
|
|
|
169
|
+
def __str__(self) -> str:
|
|
170
|
+
parts = ["🤏 Drag"]
|
|
171
|
+
if self.path and len(self.path) > 0:
|
|
172
|
+
if len(self.path) == 1:
|
|
173
|
+
parts.append(f"at ({self.path[0].x},{self.path[0].y})")
|
|
174
|
+
else:
|
|
175
|
+
parts.append(
|
|
176
|
+
f"from ({self.path[0].x}, {self.path[0].y}) to "
|
|
177
|
+
f"({self.path[-1].x}, {self.path[-1].y})"
|
|
178
|
+
)
|
|
179
|
+
if self.hold_keys: # Added hold_keys for drag
|
|
180
|
+
parts.append(f"holding {self.hold_keys}")
|
|
181
|
+
action_str = " ".join(parts)
|
|
182
|
+
action_str += _format_logs_for_display(self.logs, self.reasoning)
|
|
183
|
+
return action_str
|
|
184
|
+
|
|
81
185
|
|
|
82
186
|
# RESPONSE ACTION from agent
|
|
83
187
|
class ResponseAction(CLAAction):
    """Final textual response from the agent."""

    type: Literal["response"] = "response"
    text: str  # The final textual response from the agent

    def __str__(self) -> str:
        """Render a preview of the response (at most 50 characters) plus the suffix."""
        # Shorten only when the text really exceeds the 50-character preview.
        # Previously a text of exactly 50 characters was cut to 47 + "...",
        # dropping content without making the preview any shorter.
        if len(self.text) > 50:
            displayed_text = self.text[:47] + "..."
        else:
            displayed_text = self.text
        action_str = f'💬 Response: "{displayed_text}"'
        action_str += _format_logs_for_display(self.logs, self.reasoning)
        return action_str
|
|
196
|
+
|
|
87
197
|
|
|
88
198
|
# SCREENSHOT ACTION
|
|
89
199
|
class ScreenshotFetch(CLAAction):
    """Action of type "screenshot"; it has no fields of its own."""

    type: Literal["screenshot"] = "screenshot"

    def __str__(self) -> str:
        """Render a fixed label followed by the reasoning/logs suffix."""
        return "📸 Screenshot" + _format_logs_for_display(self.logs, self.reasoning)
|
|
206
|
+
|
|
92
207
|
|
|
93
208
|
class PositionFetch(CLAAction):
    """Action of type "position"; it has no fields of its own."""

    type: Literal["position"] = "position"

    def __str__(self) -> str:
        """Render a fixed label followed by the reasoning/logs suffix."""
        return "📍 Position" + _format_logs_for_display(self.logs, self.reasoning)
|
|
215
|
+
|
|
96
216
|
|
|
97
217
|
class CustomAction(CLAAction):
    """Free-form action identified by its ``action`` string."""

    type: Literal["custom"] = "custom"
    action: str

    def __str__(self) -> str:
        """Render the custom action name followed by the reasoning/logs suffix."""
        suffix = _format_logs_for_display(self.logs, self.reasoning)
        return f"⚙️ Custom: {self.action}" + suffix
|
|
225
|
+
|
|
101
226
|
|
|
102
227
|
# Union of all possible actions
|
|
103
228
|
CLA = Annotated[
|
hud/adapters/operator/adapter.py
CHANGED
|
@@ -26,6 +26,7 @@ class OperatorAdapter(Adapter):
|
|
|
26
26
|
"arrowdown": "down",
|
|
27
27
|
"arrowleft": "left",
|
|
28
28
|
"arrowright": "right",
|
|
29
|
+
"cmd": "ctrl",
|
|
29
30
|
}
|
|
30
31
|
|
|
31
32
|
BUTTON_MAP: ClassVar[dict[str, CLAButton]] = {"wheel": "middle"}
|
|
@@ -49,46 +50,55 @@ class OperatorAdapter(Adapter):
|
|
|
49
50
|
x, y = data.get("x", 0), data.get("y", 0)
|
|
50
51
|
button = data.get("button", "left")
|
|
51
52
|
button = self.BUTTON_MAP.get(button, button)
|
|
52
|
-
|
|
53
|
+
if button is None:
|
|
54
|
+
button = "left"
|
|
55
|
+
converted_action = ClickAction(point=Point(x=x, y=y), button=button)
|
|
53
56
|
|
|
54
57
|
elif action_type == "double_click":
|
|
55
58
|
x, y = data.get("x", 0), data.get("y", 0)
|
|
56
|
-
|
|
59
|
+
converted_action = ClickAction(point=Point(x=x, y=y), button="left", pattern=[100])
|
|
57
60
|
|
|
58
61
|
elif action_type == "scroll":
|
|
59
|
-
x, y = data.get("x", 0), data.get("y", 0)
|
|
60
|
-
scroll_x = data.get("scroll_x", 0)
|
|
61
|
-
scroll_y = data.get("scroll_y", 0)
|
|
62
|
-
|
|
62
|
+
x, y = int(data.get("x", 0)), int(data.get("y", 0))
|
|
63
|
+
scroll_x = int(data.get("scroll_x", 0))
|
|
64
|
+
scroll_y = int(data.get("scroll_y", 0))
|
|
65
|
+
converted_action = ScrollAction(
|
|
66
|
+
point=Point(x=x, y=y), scroll=Point(x=scroll_x, y=scroll_y)
|
|
67
|
+
)
|
|
63
68
|
|
|
64
69
|
elif action_type == "type":
|
|
65
70
|
text = data.get("text", "")
|
|
66
|
-
|
|
71
|
+
converted_action = TypeAction(text=text, enter_after=False)
|
|
67
72
|
|
|
68
73
|
elif action_type == "wait":
|
|
69
74
|
ms = data.get("ms", 1000)
|
|
70
|
-
|
|
75
|
+
converted_action = WaitAction(time=ms)
|
|
71
76
|
|
|
72
77
|
elif action_type == "move":
|
|
73
78
|
x, y = data.get("x", 0), data.get("y", 0)
|
|
74
|
-
|
|
79
|
+
converted_action = MoveAction(point=Point(x=x, y=y))
|
|
75
80
|
|
|
76
81
|
elif action_type == "keypress":
|
|
77
82
|
keys = data.get("keys", [])
|
|
78
|
-
|
|
83
|
+
converted_action = PressAction(keys=[self._map_key(k) for k in keys])
|
|
79
84
|
|
|
80
85
|
elif action_type == "drag":
|
|
81
86
|
path = data.get("path", [])
|
|
82
87
|
points = [Point(x=p.get("x", 0), y=p.get("y", 0)) for p in path]
|
|
83
|
-
|
|
88
|
+
converted_action = DragAction(path=points)
|
|
84
89
|
|
|
85
90
|
elif action_type == "screenshot":
|
|
86
|
-
|
|
91
|
+
converted_action = ScreenshotFetch()
|
|
87
92
|
|
|
88
93
|
elif action_type == "response":
|
|
89
|
-
|
|
94
|
+
converted_action = ResponseAction(text=data.get("text", ""))
|
|
90
95
|
else:
|
|
91
96
|
raise ValueError(f"Unsupported action type: {action_type}")
|
|
92
97
|
|
|
98
|
+
converted_action.reasoning = data.get("reasoning", "")
|
|
99
|
+
converted_action.logs = data.get("logs", "")
|
|
100
|
+
|
|
101
|
+
return converted_action
|
|
102
|
+
|
|
93
103
|
except Exception as e:
|
|
94
104
|
raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e
|
hud/agent/base.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
|
-
from typing import Sequence, TypeVar, Generic
|
|
2
|
+
from typing import Any, Sequence, TypeVar, Generic
|
|
3
3
|
|
|
4
4
|
from hud.adapters import Adapter, CLA
|
|
5
5
|
from hud.types import Gym
|
|
@@ -31,6 +31,7 @@ class Agent(Generic[ClientT, ActionT], ABC):
|
|
|
31
31
|
self,
|
|
32
32
|
client: ClientT | None = None,
|
|
33
33
|
adapter: Adapter | None = None,
|
|
34
|
+
name: str | None = None,
|
|
34
35
|
):
|
|
35
36
|
"""
|
|
36
37
|
Initialize the agent.
|
|
@@ -41,6 +42,7 @@ class Agent(Generic[ClientT, ActionT], ABC):
|
|
|
41
42
|
"""
|
|
42
43
|
self.client = client
|
|
43
44
|
self.adapter = adapter
|
|
45
|
+
self.name = name
|
|
44
46
|
|
|
45
47
|
def preprocess(self, observation: Observation) -> Observation:
|
|
46
48
|
"""
|
|
@@ -70,9 +72,9 @@ class Agent(Generic[ClientT, ActionT], ABC):
|
|
|
70
72
|
observation: The preprocessed observation
|
|
71
73
|
|
|
72
74
|
Returns:
|
|
73
|
-
tuple[list[ActionT], bool]: A tuple containing the list of raw actions
|
|
75
|
+
tuple[list[ActionT], bool]: A tuple containing the list of raw actions,
|
|
74
76
|
boolean indicating if the agent believes it has
|
|
75
|
-
completed the task
|
|
77
|
+
completed the task.
|
|
76
78
|
"""
|
|
77
79
|
pass
|
|
78
80
|
|
|
@@ -82,7 +84,6 @@ class Agent(Generic[ClientT, ActionT], ABC):
|
|
|
82
84
|
|
|
83
85
|
Args:
|
|
84
86
|
actions: The raw actions from the model
|
|
85
|
-
|
|
86
87
|
Returns:
|
|
87
88
|
Sequence[CLA]: The actions converted to HUD format
|
|
88
89
|
"""
|
hud/agent/claude.py
CHANGED
|
@@ -1,13 +1,15 @@
|
|
|
1
|
+
import copy
|
|
1
2
|
import logging
|
|
2
3
|
from typing import Any, cast
|
|
3
4
|
|
|
4
|
-
from anthropic import AsyncAnthropic
|
|
5
|
+
from anthropic import AsyncAnthropic, BadRequestError
|
|
5
6
|
from anthropic.types.beta import (
|
|
6
7
|
BetaMessageParam,
|
|
7
8
|
BetaToolResultBlockParam,
|
|
8
9
|
BetaToolComputerUse20250124Param,
|
|
9
10
|
BetaTextBlockParam,
|
|
10
11
|
BetaImageBlockParam,
|
|
12
|
+
BetaCacheControlEphemeralParam,
|
|
11
13
|
)
|
|
12
14
|
|
|
13
15
|
from hud.adapters import Adapter
|
|
@@ -16,6 +18,7 @@ from hud.adapters.claude import ClaudeAdapter
|
|
|
16
18
|
from hud.types import Gym
|
|
17
19
|
from hud.utils.common import Observation
|
|
18
20
|
from hud.settings import settings
|
|
21
|
+
from hud.adapters.common.types import LogType
|
|
19
22
|
|
|
20
23
|
logger = logging.getLogger(__name__)
|
|
21
24
|
|
|
@@ -63,6 +66,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
63
66
|
model: str = "claude-3-7-sonnet-20250219",
|
|
64
67
|
max_tokens: int = 4096,
|
|
65
68
|
max_iterations: int = 10,
|
|
69
|
+
name: str | None = None,
|
|
66
70
|
):
|
|
67
71
|
"""
|
|
68
72
|
Initialize the ClaudeAgent.
|
|
@@ -73,6 +77,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
73
77
|
model: The Claude model to use
|
|
74
78
|
max_tokens: Maximum tokens for Claude's response
|
|
75
79
|
max_iterations: Maximum number of iterations for the agent
|
|
80
|
+
name: The name of the agent
|
|
76
81
|
"""
|
|
77
82
|
# Initialize client if not provided
|
|
78
83
|
if client is None:
|
|
@@ -88,7 +93,10 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
88
93
|
|
|
89
94
|
adapter = adapter or ClaudeAdapter()
|
|
90
95
|
|
|
91
|
-
|
|
96
|
+
if name is None:
|
|
97
|
+
name = model
|
|
98
|
+
|
|
99
|
+
super().__init__(client=client, adapter=adapter, name=name)
|
|
92
100
|
|
|
93
101
|
self.model = model
|
|
94
102
|
self.max_tokens = max_tokens
|
|
@@ -115,12 +123,15 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
115
123
|
observation: The preprocessed observation
|
|
116
124
|
|
|
117
125
|
Returns:
|
|
118
|
-
tuple[list[Any], bool]: A tuple containing the list of raw actions
|
|
119
|
-
boolean indicating if the agent believes the task is complete
|
|
126
|
+
tuple[list[Any], bool]: A tuple containing the list of raw actions and a
|
|
127
|
+
boolean indicating if the agent believes the task is complete; reasoning and logs are attached to each action.
|
|
120
128
|
"""
|
|
121
129
|
if not self.client:
|
|
122
130
|
raise ValueError("Client is required")
|
|
123
131
|
|
|
132
|
+
if not observation.text and not observation.screenshot:
|
|
133
|
+
raise ValueError("Observation must contain either text or screenshot")
|
|
134
|
+
|
|
124
135
|
# Prepare the user content for Claude
|
|
125
136
|
user_content: list[BetaImageBlockParam | BetaTextBlockParam | BetaToolResultBlockParam] = []
|
|
126
137
|
|
|
@@ -159,15 +170,44 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
159
170
|
)
|
|
160
171
|
)
|
|
161
172
|
|
|
162
|
-
# Call Claude API using async client
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
173
|
+
# Call Claude API using async client, truncating 50 messages at a time if needed
|
|
174
|
+
while True:
|
|
175
|
+
# first, make a copy and add prompt caching to the last message
|
|
176
|
+
messages_cached = copy.deepcopy(self.messages)
|
|
177
|
+
# Mark last user message with cache control for prompt caching
|
|
178
|
+
last_msg = messages_cached[-1]
|
|
179
|
+
if last_msg.get("role") == "user":
|
|
180
|
+
last_content = last_msg["content"]
|
|
181
|
+
if isinstance(last_content, list):
|
|
182
|
+
for block in last_content:
|
|
183
|
+
if (
|
|
184
|
+
not block["type"] == "thinking"
|
|
185
|
+
and not block["type"] == "redacted_thinking"
|
|
186
|
+
):
|
|
187
|
+
cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
|
|
188
|
+
block["cache_control"] = cache_control
|
|
189
|
+
|
|
190
|
+
try:
|
|
191
|
+
response = await self.client.beta.messages.create(
|
|
192
|
+
model=self.model,
|
|
193
|
+
max_tokens=self.max_tokens,
|
|
194
|
+
messages=messages_cached,
|
|
195
|
+
tools=[COMPUTER_TOOL],
|
|
196
|
+
betas=["computer-use-2025-01-24"],
|
|
197
|
+
tool_choice={"type": "auto", "disable_parallel_tool_use": True},
|
|
198
|
+
)
|
|
199
|
+
except BadRequestError as e:
|
|
200
|
+
if e.message.startswith("prompt is too long"):
|
|
201
|
+
logger.warning(
|
|
202
|
+
f"Prompt is too long, removing the first 50 messages except for the first user message: {e.message}"
|
|
203
|
+
)
|
|
204
|
+
self.messages = [self.messages[0]] + self.messages[50:]
|
|
205
|
+
continue
|
|
206
|
+
else:
|
|
207
|
+
raise e
|
|
208
|
+
|
|
209
|
+
# break out of the while loop if we get a response
|
|
210
|
+
break
|
|
171
211
|
|
|
172
212
|
# Add Claude's response to the conversation history
|
|
173
213
|
response_content = response.content
|
|
@@ -216,4 +256,16 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
216
256
|
# logger.info("No tool use and no final text block found.")
|
|
217
257
|
# Keep done = True, actions remains empty
|
|
218
258
|
|
|
259
|
+
reasoning = ""
|
|
260
|
+
for block in response_content:
|
|
261
|
+
if block.type == "thinking":
|
|
262
|
+
reasoning += f"Thinking: {block.thinking}\n"
|
|
263
|
+
elif block.type == "text":
|
|
264
|
+
reasoning += block.text
|
|
265
|
+
|
|
266
|
+
# add reasoning to the actions
|
|
267
|
+
for action in actions:
|
|
268
|
+
action["reasoning"] = reasoning
|
|
269
|
+
action["logs"] = response.model_dump()
|
|
270
|
+
|
|
219
271
|
return actions, done
|