hud-python 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

hud/__init__.py CHANGED
@@ -4,8 +4,6 @@ HUD SDK for interacting with the HUD evaluation platform.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- import logging
8
-
9
7
  from . import agent, env, gym, settings, task, taskset, types, utils
10
8
  from .adapters import ResponseAction as Response
11
9
  from .job import create_job, load_job, run_job
@@ -15,16 +13,21 @@ from .taskset import load_taskset
15
13
  from .telemetry import flush, init_telemetry, trace
16
14
  from .version import __version__
17
15
 
18
- init_telemetry()
16
+ if settings.settings.telemetry_enabled:
17
+ init_telemetry()
18
+
19
+ if settings.settings.fancy_logging:
20
+ import logging
19
21
 
20
- hud_logger = logging.getLogger("hud")
21
- hud_logger.setLevel(logging.INFO)
22
+ hud_logger = logging.getLogger("hud")
23
+ # TODO: Make this configurable
24
+ hud_logger.setLevel(logging.INFO)
22
25
 
23
- if not hud_logger.handlers:
24
- handler = logging.StreamHandler()
25
- formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
26
- handler.setFormatter(formatter)
27
- hud_logger.addHandler(handler)
26
+ if not hud_logger.handlers:
27
+ handler = logging.StreamHandler()
28
+ formatter = logging.Formatter("[%(levelname)s] %(asctime)s | %(name)s | %(message)s")
29
+ handler.setFormatter(formatter)
30
+ hud_logger.addHandler(handler)
28
31
 
29
32
  __all__ = [
30
33
  "Response",
@@ -29,12 +29,16 @@ class ClaudeAdapter(Adapter):
29
29
  "super_r": "win",
30
30
  "right shift": "shift",
31
31
  "left shift": "shift",
32
+ "down shift": "shift",
33
+ "windows": "win",
34
+ "page_down": "pagedown",
35
+ "page_up": "pageup",
32
36
  }
33
37
 
34
- def __init__(self) -> None:
38
+ def __init__(self, width: int = 1024, height: int = 768) -> None:
35
39
  super().__init__()
36
- self.agent_width = 1024 # Claude's preferred width
37
- self.agent_height = 768 # Claude's preferred height
40
+ self.agent_width = width # Claude's preferred width
41
+ self.agent_height = height # Claude's preferred height
38
42
 
39
43
  def _map_key(self, key: str) -> CLAKey:
40
44
  """Map a key to its standardized form."""
@@ -53,12 +57,13 @@ class ClaudeAdapter(Adapter):
53
57
  if "+" in data["text"]:
54
58
  keys: list[CLAKey] = [self._map_key(k) for k in (data["text"].split("+"))]
55
59
  assert len(keys) > 0
56
- return PressAction(keys=keys)
57
- return PressAction(keys=[self._map_key(data["text"])])
60
+ converted_action = PressAction(keys=keys)
61
+ else:
62
+ converted_action = PressAction(keys=[self._map_key(data["text"])])
58
63
 
59
64
  elif action_type == "type":
60
65
  assert "text" in data
61
- return TypeAction(
66
+ converted_action = TypeAction(
62
67
  text=data["text"],
63
68
  enter_after=False,
64
69
  )
@@ -69,14 +74,14 @@ class ClaudeAdapter(Adapter):
69
74
  coord = data["coordinate"]
70
75
  assert isinstance(coord, list)
71
76
  assert len(coord) == 2
72
- return MoveAction(point=Point(x=coord[0], y=coord[1]))
77
+ converted_action = MoveAction(point=Point(x=coord[0], y=coord[1]))
73
78
 
74
79
  elif action_type == "left_click":
75
80
  assert "coordinate" in data
76
81
  coord = data["coordinate"]
77
82
  assert isinstance(coord, list)
78
83
  assert len(coord) == 2
79
- return ClickAction(point=Point(x=coord[0], y=coord[1]), button="left")
84
+ converted_action = ClickAction(point=Point(x=coord[0], y=coord[1]), button="left")
80
85
 
81
86
  elif action_type == "left_click_drag":
82
87
  assert "coordinate" in data
@@ -93,28 +98,30 @@ class ClaudeAdapter(Adapter):
93
98
  ):
94
99
  raise ValueError("Left click drag must be preceded by a move or click action")
95
100
  else:
96
- return DragAction(path=[self.memory[-1].point, Point(x=coord[0], y=coord[1])])
101
+ converted_action = DragAction(
102
+ path=[self.memory[-1].point, Point(x=coord[0], y=coord[1])]
103
+ )
97
104
 
98
105
  elif action_type == "right_click":
99
106
  assert "coordinate" in data
100
107
  coord = data["coordinate"]
101
108
  assert isinstance(coord, list)
102
109
  assert len(coord) == 2
103
- return ClickAction(point=Point(x=coord[0], y=coord[1]), button="right")
110
+ converted_action = ClickAction(point=Point(x=coord[0], y=coord[1]), button="right")
104
111
 
105
112
  elif action_type == "middle_click":
106
113
  assert "coordinate" in data
107
114
  coord = data["coordinate"]
108
115
  assert isinstance(coord, list)
109
116
  assert len(coord) == 2
110
- return ClickAction(point=Point(x=coord[0], y=coord[1]), button="middle")
117
+ converted_action = ClickAction(point=Point(x=coord[0], y=coord[1]), button="middle")
111
118
 
112
119
  elif action_type == "double_click":
113
120
  assert "coordinate" in data
114
121
  coord = data["coordinate"]
115
122
  assert isinstance(coord, list)
116
123
  assert len(coord) == 2
117
- return ClickAction(
124
+ converted_action = ClickAction(
118
125
  point=Point(x=coord[0], y=coord[1]), button="left", pattern=[100]
119
126
  )
120
127
 
@@ -123,7 +130,7 @@ class ClaudeAdapter(Adapter):
123
130
  coord = data["coordinate"]
124
131
  assert isinstance(coord, list)
125
132
  assert len(coord) == 2
126
- return ClickAction(
133
+ converted_action = ClickAction(
127
134
  point=Point(x=coord[0], y=coord[1]),
128
135
  button="left",
129
136
  pattern=[100, 100],
@@ -144,25 +151,30 @@ class ClaudeAdapter(Adapter):
144
151
  else:
145
152
  raise ValueError(f"Unsupported scroll direction: {direction}")
146
153
 
147
- return ScrollAction(
154
+ converted_action = ScrollAction(
148
155
  point=Point(x=data["coordinate"][0], y=data["coordinate"][1]),
149
156
  scroll=scroll,
150
157
  )
151
158
 
152
159
  elif action_type == "screenshot":
153
- return ScreenshotFetch()
160
+ converted_action = ScreenshotFetch()
154
161
 
155
162
  elif action_type == "cursor_position":
156
- return PositionFetch()
163
+ converted_action = PositionFetch()
157
164
 
158
165
  elif action_type == "wait":
159
166
  assert "duration" in data
160
- return WaitAction(time=data["duration"])
167
+ converted_action = WaitAction(time=data["duration"])
161
168
 
162
169
  elif action_type == "response":
163
- return ResponseAction(text=data.get("text", ""))
170
+ converted_action = ResponseAction(text=data.get("text", ""))
164
171
 
165
172
  else:
166
173
  raise ValueError(f"Unsupported action type: {action_type}")
174
+
175
+ converted_action.reasoning = data.get("reasoning", None)
176
+ converted_action.logs = data.get("logs", None)
177
+
178
+ return converted_action
167
179
  except AssertionError:
168
180
  raise ValueError(f"Invalid action: {data}") from None
@@ -164,5 +164,4 @@ class Adapter:
164
164
  def adapt_list(self, actions: list[Any]) -> list[CLA]:
165
165
  if not isinstance(actions, list):
166
166
  raise ValueError("Please provide a list of actions")
167
-
168
167
  return [self.adapt(action) for action in actions]
@@ -1,13 +1,40 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Annotated, Literal, TypeAlias
3
+ from typing import Annotated, Any, Literal, TypeAlias
4
4
 
5
5
  from pydantic import BaseModel, Field
6
6
 
7
+ LogType = str | dict[str, Any] | list[str | dict[str, Any]] | None
8
+
9
+
10
+ # Helper function to format logs for display
11
+ def _format_logs_for_display(
12
+ logs: LogType | None = None,
13
+ reasoning: str | None = None,
14
+ max_log_len: int = 277,
15
+ ) -> str:
16
+ log_repr = repr(logs)
17
+ truncated_log = log_repr[:max_log_len] + "..." if len(log_repr) > max_log_len else log_repr
18
+ return f" │ Reasoning: {reasoning} │ Logs: {truncated_log}"
19
+
7
20
 
8
21
  # Base class for all actions
9
22
  class CLAAction(BaseModel):
10
23
  type: str
24
+ reasoning: str | None = None
25
+ logs: LogType | None = None
26
+
27
+ def __str__(self) -> str:
28
+ # Basic representation for actions that don't have a specific override
29
+ # This base __str__ will NOT include logs by default, subclasses should handle it.
30
+ attributes = ", ".join(
31
+ f"{k}='{v}'" if isinstance(v, str) else f"{k}={v}"
32
+ for k, v in self.model_dump().items()
33
+ if k != "type" and v is not None and k != "logs" and k != "reasoning"
34
+ )
35
+ action_str = f"{self.type.capitalize()}Action ({attributes})"
36
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
37
+ return action_str
11
38
 
12
39
 
13
40
  # Basic Point model for coordinates
@@ -16,32 +43,59 @@ class Point(BaseModel):
16
43
  y: int
17
44
 
18
45
 
19
- # CLICK ACTION (supports extra options)
46
+ # CLICK ACTION
20
47
  class ClickAction(CLAAction):
21
48
  type: Literal["click"] = "click"
22
49
  point: Point | None = None
23
50
  button: CLAButton = "left"
24
- pattern: list[int] | None = None # [delay_1, delay_2, ...]
51
+ pattern: list[int] | None = None
25
52
  hold_keys: list[CLAKey] | None = None
26
53
 
54
+ def __str__(self) -> str:
55
+ parts = ["💥 Click"]
56
+ if self.point:
57
+ parts.append(f"at ({self.point.x}, {self.point.y})")
58
+ if self.button != "left":
59
+ parts.append(f"with {self.button} button")
60
+ if self.hold_keys:
61
+ parts.append(f"holding {self.hold_keys}")
62
+ action_str = " ".join(parts)
63
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
64
+ return action_str
65
+
27
66
 
28
67
  # PRESS ACTION for key presses/hotkeys
29
68
  class PressAction(CLAAction):
30
69
  type: Literal["press"] = "press"
31
70
  keys: list[CLAKey]
32
71
 
72
+ def __str__(self) -> str:
73
+ action_str = f"🎹 Press keys: {'+'.join(self.keys)}"
74
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
75
+ return action_str
76
+
33
77
 
34
78
  # KEYDOWN ACTION for key presses/hotkeys
35
79
  class KeyDownAction(CLAAction):
36
80
  type: Literal["keydown"] = "keydown"
37
81
  keys: list[CLAKey]
38
82
 
83
+ def __str__(self) -> str:
84
+ action_str = f"👇 KeyDown: {'+'.join(self.keys)}"
85
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
86
+ return action_str
87
+
39
88
 
40
89
  # KEYUP ACTION for key presses/hotkeys
41
90
  class KeyUpAction(CLAAction):
42
91
  type: Literal["keyup"] = "keyup"
43
92
  keys: list[CLAKey]
44
93
 
94
+ def __str__(self) -> str:
95
+ action_str = f"👆 KeyUp: {'+'.join(self.keys)}"
96
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
97
+ return action_str
98
+
45
99
 
46
100
  # TYPE ACTION for text typing
47
101
  class TypeAction(CLAAction):
@@ -49,6 +103,13 @@ class TypeAction(CLAAction):
49
103
  text: str
50
104
  enter_after: bool | None = False
51
105
 
106
+ def __str__(self) -> str:
107
+ action_str = f'✍️ Type: "{self.text}"'
108
+ if self.enter_after:
109
+ action_str += " (and press Enter)"
110
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
111
+ return action_str
112
+
52
113
 
53
114
  # SCROLL ACTION
54
115
  class ScrollAction(CLAAction):
@@ -57,6 +118,18 @@ class ScrollAction(CLAAction):
57
118
  scroll: Point | None = None
58
119
  hold_keys: list[CLAKey] | None = None
59
120
 
121
+ def __str__(self) -> str:
122
+ parts = ["📄 Scroll"]
123
+ if self.point:
124
+ parts.append(f"at ({self.point.x}, {self.point.y})")
125
+ if self.scroll:
126
+ parts.append(f"by ({self.scroll.x}, {self.scroll.y})")
127
+ if self.hold_keys: # Added hold_keys for scroll
128
+ parts.append(f"holding {self.hold_keys}")
129
+ action_str = " ".join(parts)
130
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
131
+ return action_str
132
+
60
133
 
61
134
  # MOVE ACTION for mouse movement
62
135
  class MoveAction(CLAAction):
@@ -64,11 +137,26 @@ class MoveAction(CLAAction):
64
137
  point: Point | None = None
65
138
  offset: Point | None = None
66
139
 
140
+ def __str__(self) -> str:
141
+ parts = ["✨ Move"]
142
+ if self.point:
143
+ parts.append(f"to ({self.point.x},{self.point.y})")
144
+ if self.offset:
145
+ parts.append(f"by ({self.offset.x},{self.offset.y})")
146
+ action_str = " ".join(parts)
147
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
148
+ return action_str
149
+
67
150
 
68
151
  # WAIT ACTION
69
152
  class WaitAction(CLAAction):
70
153
  type: Literal["wait"] = "wait"
71
- time: int # in milliseconds
154
+ time: int
155
+
156
+ def __str__(self) -> str:
157
+ action_str = f"💤 Wait for {self.time}ms"
158
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
159
+ return action_str
72
160
 
73
161
 
74
162
  # DRAG ACTION
@@ -78,26 +166,63 @@ class DragAction(CLAAction):
78
166
  pattern: list[int] | None = None # [delay_1, delay_2, ...]
79
167
  hold_keys: list[CLAKey] | None = None
80
168
 
169
+ def __str__(self) -> str:
170
+ parts = ["🤏 Drag"]
171
+ if self.path and len(self.path) > 0:
172
+ if len(self.path) == 1:
173
+ parts.append(f"at ({self.path[0].x},{self.path[0].y})")
174
+ else:
175
+ parts.append(
176
+ f"from ({self.path[0].x}, {self.path[0].y}) to "
177
+ f"({self.path[-1].x}, {self.path[-1].y})"
178
+ )
179
+ if self.hold_keys: # Added hold_keys for drag
180
+ parts.append(f"holding {self.hold_keys}")
181
+ action_str = " ".join(parts)
182
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
183
+ return action_str
184
+
81
185
 
82
186
  # RESPONSE ACTION from agent
83
187
  class ResponseAction(CLAAction):
84
188
  type: Literal["response"] = "response"
85
189
  text: str # The final textual response from the agent
86
190
 
191
+ def __str__(self) -> str:
192
+ displayed_text = self.text if len(self.text) < 50 else self.text[:47] + "..."
193
+ action_str = f'💬 Response: "{displayed_text}"'
194
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
195
+ return action_str
196
+
87
197
 
88
198
  # SCREENSHOT ACTION
89
199
  class ScreenshotFetch(CLAAction):
90
200
  type: Literal["screenshot"] = "screenshot"
91
201
 
202
+ def __str__(self) -> str:
203
+ action_str = "📸 Screenshot"
204
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
205
+ return action_str
206
+
92
207
 
93
208
  class PositionFetch(CLAAction):
94
209
  type: Literal["position"] = "position"
95
210
 
211
+ def __str__(self) -> str:
212
+ action_str = "📍 Position"
213
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
214
+ return action_str
215
+
96
216
 
97
217
  class CustomAction(CLAAction):
98
218
  type: Literal["custom"] = "custom"
99
219
  action: str
100
220
 
221
+ def __str__(self) -> str:
222
+ action_str = f"⚙️ Custom: {self.action}"
223
+ action_str += _format_logs_for_display(self.logs, self.reasoning)
224
+ return action_str
225
+
101
226
 
102
227
  # Union of all possible actions
103
228
  CLA = Annotated[
@@ -26,6 +26,7 @@ class OperatorAdapter(Adapter):
26
26
  "arrowdown": "down",
27
27
  "arrowleft": "left",
28
28
  "arrowright": "right",
29
+ "cmd": "ctrl",
29
30
  }
30
31
 
31
32
  BUTTON_MAP: ClassVar[dict[str, CLAButton]] = {"wheel": "middle"}
@@ -49,46 +50,55 @@ class OperatorAdapter(Adapter):
49
50
  x, y = data.get("x", 0), data.get("y", 0)
50
51
  button = data.get("button", "left")
51
52
  button = self.BUTTON_MAP.get(button, button)
52
- return ClickAction(point=Point(x=x, y=y), button=button)
53
+ if button is None:
54
+ button = "left"
55
+ converted_action = ClickAction(point=Point(x=x, y=y), button=button)
53
56
 
54
57
  elif action_type == "double_click":
55
58
  x, y = data.get("x", 0), data.get("y", 0)
56
- return ClickAction(point=Point(x=x, y=y), button="left", pattern=[100])
59
+ converted_action = ClickAction(point=Point(x=x, y=y), button="left", pattern=[100])
57
60
 
58
61
  elif action_type == "scroll":
59
- x, y = data.get("x", 0), data.get("y", 0)
60
- scroll_x = data.get("scroll_x", 0)
61
- scroll_y = data.get("scroll_y", 0)
62
- return ScrollAction(point=Point(x=x, y=y), scroll=Point(x=scroll_x, y=scroll_y))
62
+ x, y = int(data.get("x", 0)), int(data.get("y", 0))
63
+ scroll_x = int(data.get("scroll_x", 0))
64
+ scroll_y = int(data.get("scroll_y", 0))
65
+ converted_action = ScrollAction(
66
+ point=Point(x=x, y=y), scroll=Point(x=scroll_x, y=scroll_y)
67
+ )
63
68
 
64
69
  elif action_type == "type":
65
70
  text = data.get("text", "")
66
- return TypeAction(text=text, enter_after=False)
71
+ converted_action = TypeAction(text=text, enter_after=False)
67
72
 
68
73
  elif action_type == "wait":
69
74
  ms = data.get("ms", 1000)
70
- return WaitAction(time=ms)
75
+ converted_action = WaitAction(time=ms)
71
76
 
72
77
  elif action_type == "move":
73
78
  x, y = data.get("x", 0), data.get("y", 0)
74
- return MoveAction(point=Point(x=x, y=y))
79
+ converted_action = MoveAction(point=Point(x=x, y=y))
75
80
 
76
81
  elif action_type == "keypress":
77
82
  keys = data.get("keys", [])
78
- return PressAction(keys=[self._map_key(k) for k in keys])
83
+ converted_action = PressAction(keys=[self._map_key(k) for k in keys])
79
84
 
80
85
  elif action_type == "drag":
81
86
  path = data.get("path", [])
82
87
  points = [Point(x=p.get("x", 0), y=p.get("y", 0)) for p in path]
83
- return DragAction(path=points)
88
+ converted_action = DragAction(path=points)
84
89
 
85
90
  elif action_type == "screenshot":
86
- return ScreenshotFetch()
91
+ converted_action = ScreenshotFetch()
87
92
 
88
93
  elif action_type == "response":
89
- return ResponseAction(text=data.get("text", ""))
94
+ converted_action = ResponseAction(text=data.get("text", ""))
90
95
  else:
91
96
  raise ValueError(f"Unsupported action type: {action_type}")
92
97
 
98
+ converted_action.reasoning = data.get("reasoning", "")
99
+ converted_action.logs = data.get("logs", "")
100
+
101
+ return converted_action
102
+
93
103
  except Exception as e:
94
104
  raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e
hud/agent/base.py CHANGED
@@ -1,5 +1,5 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import Sequence, TypeVar, Generic
2
+ from typing import Any, Sequence, TypeVar, Generic
3
3
 
4
4
  from hud.adapters import Adapter, CLA
5
5
  from hud.types import Gym
@@ -31,6 +31,7 @@ class Agent(Generic[ClientT, ActionT], ABC):
31
31
  self,
32
32
  client: ClientT | None = None,
33
33
  adapter: Adapter | None = None,
34
+ name: str | None = None,
34
35
  ):
35
36
  """
36
37
  Initialize the agent.
@@ -41,6 +42,7 @@ class Agent(Generic[ClientT, ActionT], ABC):
41
42
  """
42
43
  self.client = client
43
44
  self.adapter = adapter
45
+ self.name = name
44
46
 
45
47
  def preprocess(self, observation: Observation) -> Observation:
46
48
  """
@@ -70,9 +72,9 @@ class Agent(Generic[ClientT, ActionT], ABC):
70
72
  observation: The preprocessed observation
71
73
 
72
74
  Returns:
73
- tuple[list[ActionT], bool]: A tuple containing the list of raw actions and a
75
+ tuple[list[ActionT], bool]: A tuple containing the list of raw actions,
74
76
  boolean indicating if the agent believes it has
75
- completed the task
77
+ completed the task.
76
78
  """
77
79
  pass
78
80
 
@@ -82,7 +84,6 @@ class Agent(Generic[ClientT, ActionT], ABC):
82
84
 
83
85
  Args:
84
86
  actions: The raw actions from the model
85
-
86
87
  Returns:
87
88
  Sequence[CLA]: The actions converted to HUD format
88
89
  """
hud/agent/claude.py CHANGED
@@ -1,13 +1,15 @@
1
+ import copy
1
2
  import logging
2
3
  from typing import Any, cast
3
4
 
4
- from anthropic import AsyncAnthropic
5
+ from anthropic import AsyncAnthropic, BadRequestError
5
6
  from anthropic.types.beta import (
6
7
  BetaMessageParam,
7
8
  BetaToolResultBlockParam,
8
9
  BetaToolComputerUse20250124Param,
9
10
  BetaTextBlockParam,
10
11
  BetaImageBlockParam,
12
+ BetaCacheControlEphemeralParam,
11
13
  )
12
14
 
13
15
  from hud.adapters import Adapter
@@ -16,6 +18,7 @@ from hud.adapters.claude import ClaudeAdapter
16
18
  from hud.types import Gym
17
19
  from hud.utils.common import Observation
18
20
  from hud.settings import settings
21
+ from hud.adapters.common.types import LogType
19
22
 
20
23
  logger = logging.getLogger(__name__)
21
24
 
@@ -63,6 +66,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
63
66
  model: str = "claude-3-7-sonnet-20250219",
64
67
  max_tokens: int = 4096,
65
68
  max_iterations: int = 10,
69
+ name: str | None = None,
66
70
  ):
67
71
  """
68
72
  Initialize the ClaudeAgent.
@@ -73,6 +77,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
73
77
  model: The Claude model to use
74
78
  max_tokens: Maximum tokens for Claude's response
75
79
  max_iterations: Maximum number of iterations for the agent
80
+ name: The name of the agent
76
81
  """
77
82
  # Initialize client if not provided
78
83
  if client is None:
@@ -88,7 +93,10 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
88
93
 
89
94
  adapter = adapter or ClaudeAdapter()
90
95
 
91
- super().__init__(client=client, adapter=adapter)
96
+ if name is None:
97
+ name = model
98
+
99
+ super().__init__(client=client, adapter=adapter, name=name)
92
100
 
93
101
  self.model = model
94
102
  self.max_tokens = max_tokens
@@ -115,12 +123,15 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
115
123
  observation: The preprocessed observation
116
124
 
117
125
  Returns:
118
- tuple[list[Any], bool]: A tuple containing the list of raw actions and a
119
- boolean indicating if the agent believes the task is complete
126
+ tuple[list[Any], bool, list[str | dict[str, Any]] | None]: A tuple containing the list of raw actions,
127
+ boolean indicating if the agent believes the task is complete, and a list of strings or dictionaries of logs.
120
128
  """
121
129
  if not self.client:
122
130
  raise ValueError("Client is required")
123
131
 
132
+ if not observation.text and not observation.screenshot:
133
+ raise ValueError("Observation must contain either text or screenshot")
134
+
124
135
  # Prepare the user content for Claude
125
136
  user_content: list[BetaImageBlockParam | BetaTextBlockParam | BetaToolResultBlockParam] = []
126
137
 
@@ -159,15 +170,44 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
159
170
  )
160
171
  )
161
172
 
162
- # Call Claude API using async client
163
- response = await self.client.beta.messages.create(
164
- model=self.model,
165
- max_tokens=self.max_tokens,
166
- messages=self.messages,
167
- tools=[COMPUTER_TOOL],
168
- betas=["computer-use-2025-01-24"],
169
- tool_choice={"type": "auto", "disable_parallel_tool_use": True},
170
- )
173
+ # Call Claude API using async client, truncating 50 messages at a time if needed
174
+ while True:
175
+ # first, make a copy and add prompt caching to the last message
176
+ messages_cached = copy.deepcopy(self.messages)
177
+ # Mark last user message with cache control for prompt caching
178
+ last_msg = messages_cached[-1]
179
+ if last_msg.get("role") == "user":
180
+ last_content = last_msg["content"]
181
+ if isinstance(last_content, list):
182
+ for block in last_content:
183
+ if (
184
+ not block["type"] == "thinking"
185
+ and not block["type"] == "redacted_thinking"
186
+ ):
187
+ cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
188
+ block["cache_control"] = cache_control
189
+
190
+ try:
191
+ response = await self.client.beta.messages.create(
192
+ model=self.model,
193
+ max_tokens=self.max_tokens,
194
+ messages=messages_cached,
195
+ tools=[COMPUTER_TOOL],
196
+ betas=["computer-use-2025-01-24"],
197
+ tool_choice={"type": "auto", "disable_parallel_tool_use": True},
198
+ )
199
+ except BadRequestError as e:
200
+ if e.message.startswith("prompt is too long"):
201
+ logger.warning(
202
+ f"Prompt is too long, removing the first 50 messages except for the first user message: {e.message}"
203
+ )
204
+ self.messages = [self.messages[0]] + self.messages[50:]
205
+ continue
206
+ else:
207
+ raise e
208
+
209
+ # break out of the while loop if we get a response
210
+ break
171
211
 
172
212
  # Add Claude's response to the conversation history
173
213
  response_content = response.content
@@ -216,4 +256,16 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
216
256
  # logger.info("No tool use and no final text block found.")
217
257
  # Keep done = True, actions remains empty
218
258
 
259
+ reasoning = ""
260
+ for block in response_content:
261
+ if block.type == "thinking":
262
+ reasoning += f"Thinking: {block.thinking}\n"
263
+ elif block.type == "text":
264
+ reasoning += block.text
265
+
266
+ # add reasoning to the actions
267
+ for action in actions:
268
+ action["reasoning"] = reasoning
269
+ action["logs"] = response.model_dump()
270
+
219
271
  return actions, done