hud-python 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hud-python has been flagged for review.
- hud/__init__.py +16 -12
- hud/adapters/__init__.py +4 -2
- hud/adapters/claude/adapter.py +9 -2
- hud/adapters/common/adapter.py +11 -10
- hud/adapters/common/types.py +34 -13
- hud/adapters/operator/__init__.py +5 -0
- hud/adapters/operator/adapter.py +97 -0
- hud/agent/__init__.py +7 -0
- hud/agent/base.py +109 -0
- hud/agent/claude.py +207 -0
- hud/agent/operator.py +208 -0
- hud/env/__init__.py +11 -0
- hud/env/client.py +35 -0
- hud/env/docker_client.py +306 -0
- hud/env/environment.py +354 -0
- hud/env/local_docker_client.py +251 -0
- hud/env/remote_client.py +185 -0
- hud/env/remote_docker_client.py +221 -0
- hud/evaluators/__init__.py +10 -0
- hud/evaluators/base.py +31 -0
- hud/evaluators/inspect.py +29 -0
- hud/evaluators/judge.py +213 -0
- hud/evaluators/match.py +163 -0
- hud/evaluators/remote.py +78 -0
- hud/gym.py +101 -15
- hud/job.py +185 -0
- hud/server/__init__.py +2 -2
- hud/server/requests.py +87 -0
- hud/settings.py +13 -2
- hud/task.py +144 -0
- hud/taskset.py +103 -0
- hud/trajectory.py +90 -0
- hud/types.py +65 -0
- hud/utils/__init__.py +4 -2
- hud/utils/common.py +96 -0
- hud/utils/config.py +91 -4
- hud/utils/telemetry.py +67 -0
- hud_python-0.2.1.dist-info/METADATA +181 -0
- hud_python-0.2.1.dist-info/RECORD +44 -0
- {hud_python-0.1.5.dist-info → hud_python-0.2.1.dist-info}/licenses/LICENSE +1 -1
- hud/client.py +0 -200
- hud/environment.py +0 -318
- hud/run.py +0 -208
- hud_python-0.1.5.dist-info/METADATA +0 -125
- hud_python-0.1.5.dist-info/RECORD +0 -21
- {hud_python-0.1.5.dist-info → hud_python-0.2.1.dist-info}/WHEEL +0 -0
hud/__init__.py
CHANGED
@@ -4,19 +4,23 @@ HUD Gym SDK - A Python SDK for interacting with HUD environments.
 
 from __future__ import annotations
 
-from …
-from …
-from …
-from hud.run import Run
+from . import agent, env, gym, settings, task, taskset, types, utils
+from .job import create_job, job, load_job
+from .taskset import load_taskset
 
-__version__ = "0.1.5"
+__version__ = "0.2.1"
 
 __all__ = [
-    "…
-    "…
-    "…
-    "…
-    "…
-    "…
-    "…
+    "agent",
+    "create_job",
+    "env",
+    "gym",
+    "job",
+    "load_job",
+    "load_taskset",
+    "settings",
+    "task",
+    "taskset",
+    "types",
+    "utils",
 ]
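For orientation, the package root now exposes submodules plus job/taskset helpers instead of the removed hud.run / hud.client entry points. A minimal sketch of the new surface (only names visible in this diff are used; the helpers' signatures are not shown here, so the commented-out calls are assumptions):

import hud

print(hud.__version__)  # "0.2.1"

# Submodules are re-exported at the top level
hud.gym, hud.env, hud.agent, hud.taskset

# Job and taskset helpers are now top-level functions; their exact
# signatures are not visible in this diff, so treat these as hypothetical:
# taskset = await hud.load_taskset("...")
# new_job = await hud.create_job("...")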
hud/adapters/__init__.py
CHANGED
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
-from .…
+from .claude import ClaudeAdapter
+from .common import CLA, Adapter
+from .operator import OperatorAdapter
 
-__all__ = ["Adapter"]
+__all__ = ["CLA", "Adapter", "ClaudeAdapter", "OperatorAdapter"]
hud/adapters/claude/adapter.py
CHANGED
@@ -13,6 +13,7 @@ from hud.adapters.common.types import (
     Point,
     PositionFetch,
     PressAction,
+    ResponseAction,
     ScreenshotFetch,
     ScrollAction,
     TypeAction,
@@ -21,7 +22,10 @@ from hud.adapters.common.types import (
 
 
 class ClaudeAdapter(Adapter):
-    KEY_MAP: ClassVar[dict[str, CLAKey]] = {…
+    KEY_MAP: ClassVar[dict[str, CLAKey]] = {
+        "Return": "enter",
+        "Super": "win",
+    }
 
     def __init__(self) -> None:
         super().__init__()
@@ -31,7 +35,6 @@ class ClaudeAdapter(Adapter):
     def _map_key(self, key: str) -> CLAKey:
         """Map a key to its standardized form."""
         return self.KEY_MAP.get(key, key.lower())  # type: ignore
-
     def convert(self, data: Any) -> CLA:
         try:
             action_type = data.get("action")
@@ -152,6 +155,10 @@ class ClaudeAdapter(Adapter):
             elif action_type == "wait":
                 assert "duration" in data
                 return WaitAction(time=data["duration"])
+
+            elif action_type == "response":
+                return ResponseAction(text=data.get("text", ""))
+
             else:
                 raise ValueError(f"Unsupported action type: {action_type}")
         except AssertionError:
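With the new branch, a plain text reply from Claude flows through the adapter like any other action. A quick sketch of convert() on the two dict shapes handled above (keys taken directly from the branches in this hunk):

from hud.adapters.claude import ClaudeAdapter

adapter = ClaudeAdapter()

wait = adapter.convert({"action": "wait", "duration": 500})
reply = adapter.convert({"action": "response", "text": "Done."})

print(type(wait).__name__, type(reply).__name__)  # WaitAction ResponseAction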
hud/adapters/common/adapter.py
CHANGED
@@ -2,16 +2,16 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 
+import numpy as np
 from PIL import Image
 from pydantic import TypeAdapter, ValidationError
 
 from .types import CLA
 
 if TYPE_CHECKING:
-
-    from typing_extensions import TypeAlias, TypeIs
+    from typing_extensions import TypeIs
 
-…
+ImageType = np.ndarray[Any, Any] | Image.Image | str | None
 
 
 def _is_numpy_array(observation: Any) -> TypeIs[np.ndarray]:
@@ -61,7 +61,7 @@
             observation: Image data, which can be:
                 - numpy array
                 - PIL Image
-                - base64 string (PNG)
+                - base64 string (PNG) # TODO: JPG
 
         Returns:
             Base64-encoded string of the resized image (PNG format)
@@ -146,7 +146,7 @@
 
         return processed_action
 
-    def adapt(self, action: Any) -> …
+    def adapt(self, action: Any) -> CLA:
         # any preprocessing steps
         action = self.preprocess(action)
 
@@ -154,14 +154,15 @@
         action = self.convert(action)
         self.memory.append(action)
 
-        # convert to json
+        # convert to json and apply coordinate rescaling
         action_dict = self.json(action)
-
-        # apply coordinate rescaling
         rescaled_action = self.postprocess_action(action_dict)
-        return rescaled_action
 
-…
+        # convert back to CLA
+        return TypeAdapter(CLA).validate_python(rescaled_action)
+
+    def adapt_list(self, actions: list[Any]) -> list[CLA]:
         if not isinstance(actions, list):
             raise ValueError("Please provide a list of actions")
+
         return [self.adapt(action) for action in actions]
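adapt() now finishes by re-validating the rescaled dict against the CLA union, so callers get typed models back instead of raw dicts. A sketch of adapt_list() using coordinate-free actions (so the default rescaling dimensions don't matter), in the Claude dict format:

from hud.adapters import ClaudeAdapter

adapter = ClaudeAdapter()

# Each action runs preprocess -> convert -> json -> rescale -> CLA validation
actions = adapter.adapt_list([
    {"action": "wait", "duration": 500},
    {"action": "response", "text": "Task finished."},
])
print([a.type for a in actions])  # ['wait', 'response']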
hud/adapters/common/types.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Annotated, Literal…
+from typing import Annotated, Literal
 
 from pydantic import BaseModel, Field
 
@@ -32,10 +32,23 @@ class PressAction(CLAAction):
     keys: list[CLAKey]
 
 
+# KEYDOWN ACTION for key presses/hotkeys
+class KeyDownAction(CLAAction):
+    type: Literal["keydown"] = "keydown"
+    keys: list[CLAKey]
+
+
+# KEYUP ACTION for key presses/hotkeys
+class KeyUpAction(CLAAction):
+    type: Literal["keyup"] = "keyup"
+    keys: list[CLAKey]
+
+
 # TYPE ACTION for text typing
 class TypeAction(CLAAction):
     type: Literal["type"] = "type"
     text: str
+    selector: str | None = None
     enter_after: bool | None = False
 
 
@@ -69,6 +82,12 @@ class DragAction(CLAAction):
     hold_keys: list[CLAKey] | None = None
 
 
+# RESPONSE ACTION from agent
+class ResponseAction(CLAAction):
+    type: Literal["response"] = "response"
+    text: str  # The final textual response from the agent
+
+
 # SCREENSHOT ACTION
 class ScreenshotFetch(CLAAction):
     type: Literal["screenshot"] = "screenshot"
@@ -82,20 +101,22 @@ class CustomAction(CLAAction):
     type: Literal["custom"] = "custom"
     action: str
 
+
 # Union of all possible actions
 CLA = Annotated[
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
+    ClickAction
+    | PressAction
+    | KeyDownAction
+    | KeyUpAction
+    | TypeAction
+    | ResponseAction
+    | ScrollAction
+    | MoveAction
+    | WaitAction
+    | DragAction
+    | CustomAction
+    | ScreenshotFetch
+    | PositionFetch,
     Field(discriminator="type"),
 ]
 
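Since CLA is a discriminated union on the "type" field, pydantic can parse arbitrary dicts straight into the right model; this is exactly what the new TypeAdapter(CLA).validate_python call in adapter.py relies on. A sketch (field shapes inferred from this diff; "ctrl" is assumed to be a valid CLAKey):

from pydantic import TypeAdapter
from hud.adapters.common.types import CLA, ClickAction, KeyDownAction

cla = TypeAdapter(CLA)

# The "type" value selects exactly one concrete model
click = cla.validate_python({"type": "click", "point": {"x": 10, "y": 20}, "button": "left"})
keydown = cla.validate_python({"type": "keydown", "keys": ["ctrl"]})  # key name assumed

assert isinstance(click, ClickAction)
assert isinstance(keydown, KeyDownAction)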
hud/adapters/operator/adapter.py
ADDED

@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+from typing import Any, ClassVar
+
+from hud.adapters.common import CLA, Adapter
+from hud.adapters.common.types import (
+    CLAKey,
+    ClickAction,
+    DragAction,
+    MoveAction,
+    Point,
+    PressAction,
+    ResponseAction,
+    ScreenshotFetch,
+    ScrollAction,
+    TypeAction,
+    WaitAction,
+)
+
+
+class OperatorAdapter(Adapter):
+    KEY_MAP: ClassVar[dict[str, CLAKey]] = {
+        "Return": "enter",
+        "ArrowUp": "up",
+        "ArrowDown": "down",
+        "ArrowLeft": "left",
+        "ArrowRight": "right",
+    }
+
+    def __init__(self) -> None:
+        super().__init__()
+        # OpenAI Computer Use default dimensions
+        self.agent_width = 1024
+        self.agent_height = 768
+
+    def _map_key(self, key: str) -> CLAKey:
+        """Map a key to its standardized form."""
+        return self.KEY_MAP.get(key, key.lower())  # type: ignore
+
+    def convert(self, data: Any) -> CLA:
+        """Convert a Computer Use action to a HUD action"""
+        try:
+            action_type = data.get("type")
+
+            if action_type == "click":
+                x, y = data.get("x", 0), data.get("y", 0)
+                button = data.get("button", "left")
+                return ClickAction(point=Point(x=x, y=y), button=button)
+
+            elif action_type == "double_click":
+                x, y = data.get("x", 0), data.get("y", 0)
+                return ClickAction(
+                    point=Point(x=x, y=y),
+                    button="left",
+                    pattern=[100]
+                )
+
+            elif action_type == "scroll":
+                x, y = data.get("x", 0), data.get("y", 0)
+                scroll_x = data.get("scroll_x", 0)
+                scroll_y = data.get("scroll_y", 0)
+                return ScrollAction(
+                    point=Point(x=x, y=y),
+                    scroll=Point(x=scroll_x, y=scroll_y)
+                )
+
+            elif action_type == "type":
+                text = data.get("text", "")
+                return TypeAction(text=text, enter_after=False)
+
+            elif action_type == "wait":
+                ms = data.get("ms", 1000)
+                return WaitAction(time=ms)
+
+            elif action_type == "move":
+                x, y = data.get("x", 0), data.get("y", 0)
+                return MoveAction(point=Point(x=x, y=y))
+
+            elif action_type == "keypress":
+                keys = data.get("keys", [])
+                return PressAction(keys=[self._map_key(k) for k in keys])
+
+            elif action_type == "drag":
+                path = data.get("path", [])
+                points = [Point(x=p.get("x", 0), y=p.get("y", 0)) for p in path]
+                return DragAction(path=points)
+
+            elif action_type == "screenshot":
+                return ScreenshotFetch()
+
+            elif action_type == "response":
+                return ResponseAction(text=data.get("text", ""))
+            else:
+                raise ValueError(f"Unsupported action type: {action_type}")
+
+        except Exception as e:
+            raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e
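Usage mirrors ClaudeAdapter, except raw actions are keyed by "type" rather than "action". A sketch grounded in the branches above:

from hud.adapters.operator import OperatorAdapter

adapter = OperatorAdapter()

click = adapter.convert({"type": "click", "x": 100, "y": 200, "button": "left"})
keys = adapter.convert({"type": "keypress", "keys": ["ArrowUp", "Return"]})

print(click.point)  # Point(x=100, y=200)
print(keys.keys)    # ['up', 'enter'], normalized through KEY_MAP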
hud/agent/__init__.py
ADDED
hud/agent/base.py
ADDED
@@ -0,0 +1,109 @@
+from abc import ABC, abstractmethod
+from typing import Sequence, TypeVar, Generic
+
+from hud.adapters import Adapter, CLA
+from hud.env.environment import Observation
+
+# Generic type for different client types (Anthropic, OpenAI, etc.)
+ClientT = TypeVar('ClientT')
+ActionT = TypeVar('ActionT')
+
+class Agent(Generic[ClientT, ActionT], ABC):
+    """
+    Base class for all agents.
+
+    Implements a three-stage prediction process:
+    1. preprocess - Prepare observation data (e.g., rescale screenshot)
+    2. fetch_response - Make API calls to get model response
+    3. postprocess - Convert model actions to HUD format
+
+    Subclasses only need to implement the fetch_response method.
+    """
+
+    def __init__(self, client: ClientT | None = None, adapter: Adapter | None = None):
+        """
+        Initialize the agent.
+
+        Args:
+            client: The client to use for API calls
+            adapter: The adapter to use for preprocessing and postprocessing
+        """
+        self.client = client
+        self.adapter = adapter
+
+    def preprocess(self, observation: Observation) -> Observation:
+        """
+        Preprocess the observation before sending to the model.
+
+        Args:
+            observation: The raw observation from the environment
+
+        Returns:
+            Observation: The processed observation ready for the model
+        """
+        if not self.adapter or not observation.screenshot:
+            return observation
+
+        # Create a new observation with the rescaled screenshot
+        processed_obs = Observation(
+            text=observation.text,
+            screenshot=self.adapter.rescale(observation.screenshot)
+        )
+        return processed_obs
+
+    @abstractmethod
+    async def fetch_response(self, observation: Observation) -> tuple[list[ActionT], bool]:
+        """
+        Fetch a response from the model based on the observation.
+
+        Args:
+            observation: The preprocessed observation
+
+        Returns:
+            tuple[list[ActionT], bool]: A tuple containing the list of raw actions and a
+                                        boolean indicating if the agent believes it has
+                                        completed the task
+        """
+        pass
+
+    def postprocess(self, actions: list[ActionT]) -> list[CLA]:
+        """
+        Convert model actions to HUD actions.
+
+        Args:
+            actions: The raw actions from the model
+
+        Returns:
+            Sequence[CLA]: The actions converted to HUD format
+        """
+        if not self.adapter:
+            raise ValueError("Cannot postprocess actions without an adapter")
+
+        return self.adapter.adapt_list(actions)
+
+    async def predict(self, observation: Observation) -> tuple[list[CLA] | list[ActionT], bool]:
+        """
+        Predict the next action based on the observation.
+
+        Implements the full three-stage prediction process.
+
+        Args:
+            observation: The observation from the environment
+
+        Returns:
+            tuple[list[CLA] | list[ActionT], bool]: A tuple containing the list of actions and a boolean
+                                                    indicating if the agent believes it has completed the task
+        """
+        # Stage 1: Preprocess the observation
+        processed_obs = self.preprocess(observation)
+
+        # Stage 2: Fetch response from the model
+        actions, done = await self.fetch_response(processed_obs)
+
+        # Stage 3: Postprocess the actions if we have an adapter
+        if self.adapter and actions:
+            hud_actions = self.postprocess(actions)
+            return hud_actions, done
+
+        # If no adapter, return actions as is
+        return actions, done
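Because predict() owns the preprocess/fetch/postprocess plumbing, a new provider integration only has to implement fetch_response. A toy subclass as a sketch (the echo behavior is purely illustrative):

from hud.agent.base import Agent
from hud.env.environment import Observation

class EchoAgent(Agent[None, dict]):
    """Minimal agent: emits one raw action in Claude's dict format, then reports done."""

    async def fetch_response(self, observation: Observation) -> tuple[list[dict], bool]:
        if observation.text:
            return [{"action": "response", "text": f"Echo: {observation.text}"}], True
        return [], True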
hud/agent/claude.py
ADDED
@@ -0,0 +1,207 @@
+import logging
+import os
+from typing import Any, cast
+
+from anthropic import AsyncAnthropic
+from anthropic.types.beta import (
+    BetaMessageParam,
+    BetaToolResultBlockParam,
+    BetaToolComputerUse20250124Param,
+    BetaTextBlockParam,
+    BetaImageBlockParam,
+)
+
+from hud.adapters import Adapter
+from hud.agent.base import Agent
+from hud.adapters.claude import ClaudeAdapter
+from hud.env.environment import Observation
+from hud.settings import settings
+
+logger = logging.getLogger(__name__)
+
+def base64_to_content_block(base64: str) -> BetaImageBlockParam:
+    return {
+        "type": "image",
+        "source": {
+            "type": "base64",
+            "media_type": "image/png",
+            "data": base64
+        }
+    }
+
+def text_to_content_block(text: str) -> BetaTextBlockParam:
+    return {
+        "type": "text",
+        "text": text
+    }
+
+def tool_use_content_block(tool_use_id: str, content: list[BetaTextBlockParam | BetaImageBlockParam]) -> BetaToolResultBlockParam:
+    return {
+        "type": "tool_result",
+        "tool_use_id": tool_use_id,
+        "content": content
+    }
+
+# Claude's Computer Use Tool definition
+COMPUTER_TOOL: BetaToolComputerUse20250124Param = {
+    "type": "computer_20250124",
+    "name": "computer",
+    "display_width_px": 1024,
+    "display_height_px": 768
+}
+
+class ClaudeAgent(Agent[AsyncAnthropic, Any]):
+    """
+    An agent implementation using Anthropic's Claude API with Computer Use.
+
+    This agent interacts with HUD environments using Claude's Computer Use API
+    through the ClaudeAdapter which converts actions to the format expected by HUD.
+    """
+
+    def __init__(
+        self,
+        client: AsyncAnthropic | None = None,
+        adapter: Adapter | None = None,
+        model: str = "claude-3-7-sonnet-20250219",
+        max_tokens: int = 4096,
+        max_iterations: int = 10,
+    ):
+        """
+        Initialize the ClaudeAgent.
+
+        Args:
+            client: The AsyncAnthropic client for API calls (optional, created automatically if not provided)
+            adapter: The adapter to use for preprocessing and postprocessing
+            model: The Claude model to use
+            max_tokens: Maximum tokens for Claude's response
+            max_iterations: Maximum number of iterations for the agent
+        """
+        # Initialize client if not provided
+        if client is None:
+            # Get API key from settings
+            api_key = settings.anthropic_api_key
+            if not api_key:
+                raise ValueError("Anthropic API key not found in settings or environment variables. Set ANTHROPIC_API_KEY.")
+
+            # Create client
+            client = AsyncAnthropic(api_key=api_key)
+
+        adapter = adapter or ClaudeAdapter()
+
+        super().__init__(client=client, adapter=adapter)
+
+        self.model = model
+        self.max_tokens = max_tokens
+        self.max_iterations = max_iterations
+
+        # Default dimensions - will be updated if adapter is provided
+        self.width_px = 1024
+        self.height_px = 768
+
+        # Update dimensions if adapter is provided
+        if self.adapter:
+            self.width_px = self.adapter.agent_width
+            self.height_px = self.adapter.agent_height
+
+        # Message history
+        self.messages: list[BetaMessageParam] = []
+        self.pending_computer_use_tool_id = None
+
+    async def fetch_response(self, observation: Observation) -> tuple[list[Any], bool]:
+        """
+        Fetch a response from Claude based on the observation.
+
+        Args:
+            observation: The preprocessed observation
+
+        Returns:
+            tuple[list[Any], bool]: A tuple containing the list of raw actions and a
+                                    boolean indicating if the agent believes the task is complete
+        """
+        if not self.client:
+            raise ValueError("Client is required")
+
+        # Prepare the user content for Claude
+        user_content: list[BetaImageBlockParam | BetaTextBlockParam | BetaToolResultBlockParam] = []
+
+        # Add text instruction if present
+        if observation.text:
+            logger.info("Adding text to user content: %s", observation.text)
+            user_content.append(text_to_content_block(str(observation.text)))
+
+        # Add screenshot if present
+        if observation.screenshot:
+            logger.info("Adding screenshot to user content")
+            if not self.pending_computer_use_tool_id:
+                logger.info("Adding screenshot to user content, no tool id")
+                user_content.append(base64_to_content_block(observation.screenshot))
+            else:
+                logger.info("Adding screenshot to user content, tool id: %s", self.pending_computer_use_tool_id)
+                user_content.append(
+                    tool_use_content_block(
+                        self.pending_computer_use_tool_id,
+                        [base64_to_content_block(observation.screenshot)]
+                    )
+                )
+                self.pending_computer_use_tool_id = None
+
+        # Add the user content to the messages
+        self.messages.append(cast(BetaMessageParam, {
+            "role": "user",
+            "content": user_content,
+        }))
+
+        # Call Claude API using async client
+        response = await self.client.beta.messages.create(
+            model=self.model,
+            max_tokens=self.max_tokens,
+            messages=self.messages,
+            tools=[COMPUTER_TOOL],
+            betas=["computer-use-2025-01-24"],
+            tool_choice={"type": "auto", "disable_parallel_tool_use": True}
+        )
+
+        # Add Claude's response to the conversation history
+        response_content = response.content
+        self.messages.append(cast(BetaMessageParam, {
+            "role": "assistant",
+            "content": response_content,
+        }))
+
+        # Process tool use
+        actions: list[Any] = []
+        done = True  # Assume we're done unless we find a tool use
+
+        for block in response_content:
+            logger.info("Processing block: %s", block)
+            if block.type == "tool_use":
+                logger.info("Processing tool use: %s", block)
+                assert block.name == "computer"
+
+                # Store the raw action
+                actions.append(block.input)
+                self.pending_computer_use_tool_id = block.id
+
+                # If we found a tool use, we're not done
+                done = False
+                break
+
+        # If no tool use action was found, check for a final text response
+        if not actions and done:
+            final_text_response = ""
+            for block in response_content:
+                if block.type == "text":
+                    final_text_response += block.text
+
+            if final_text_response.strip():
+                logger.info(f"No tool use found. Using final text as response: {final_text_response}")
+                actions = [{
+                    "action": "response",
+                    "text": final_text_response.strip()
+                }]
+                # Keep done = True
+            else:
+                logger.info("No tool use and no final text block found.")
+                # Keep done = True, actions remains empty
+
+        return actions, done
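A typical driver loop over predict(), sketched here; the environment side (how Observations are produced and how actions are applied) lives in hud.env and is not shown in this hunk, so those steps are placeholders:

import asyncio
from hud.agent.claude import ClaudeAgent

async def run(task_prompt: str) -> None:
    # ClaudeAgent builds its own AsyncAnthropic client from settings.anthropic_api_key
    agent = ClaudeAgent()
    obs = ...  # an Observation(text=task_prompt, screenshot=...) from the environment
    for _ in range(agent.max_iterations):
        actions, done = await agent.predict(obs)
        if done:
            break
        obs = ...  # apply `actions` via the environment and observe again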