hud-python 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +1 -1
- hud/adapters/claude/adapter.py +9 -1
- hud/adapters/common/types.py +7 -0
- hud/adapters/operator/adapter.py +4 -0
- hud/agent/claude.py +22 -2
- hud/agent/operator.py +35 -17
- hud/env/docker_client.py +1 -1
- hud/env/environment.py +182 -9
- hud/env/local_docker_client.py +3 -1
- hud/task.py +41 -30
- hud/taskset.py +8 -0
- hud/utils/common.py +28 -1
- hud/utils/config.py +1 -92
- {hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/METADATA +19 -26
- {hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/RECORD +17 -17
- {hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/WHEEL +0 -0
- {hud_python-0.2.0.dist-info → hud_python-0.2.1.dist-info}/licenses/LICENSE +0 -0
hud/__init__.py
CHANGED
hud/adapters/claude/adapter.py
CHANGED
|
@@ -13,6 +13,7 @@ from hud.adapters.common.types import (
|
|
|
13
13
|
Point,
|
|
14
14
|
PositionFetch,
|
|
15
15
|
PressAction,
|
|
16
|
+
ResponseAction,
|
|
16
17
|
ScreenshotFetch,
|
|
17
18
|
ScrollAction,
|
|
18
19
|
TypeAction,
|
|
@@ -21,7 +22,10 @@ from hud.adapters.common.types import (
|
|
|
21
22
|
|
|
22
23
|
|
|
23
24
|
class ClaudeAdapter(Adapter):
|
|
24
|
-
KEY_MAP: ClassVar[dict[str, CLAKey]] = {
|
|
25
|
+
KEY_MAP: ClassVar[dict[str, CLAKey]] = {
|
|
26
|
+
"Return": "enter",
|
|
27
|
+
"Super": "win",
|
|
28
|
+
}
|
|
25
29
|
|
|
26
30
|
def __init__(self) -> None:
|
|
27
31
|
super().__init__()
|
|
@@ -151,6 +155,10 @@ class ClaudeAdapter(Adapter):
|
|
|
151
155
|
elif action_type == "wait":
|
|
152
156
|
assert "duration" in data
|
|
153
157
|
return WaitAction(time=data["duration"])
|
|
158
|
+
|
|
159
|
+
elif action_type == "response":
|
|
160
|
+
return ResponseAction(text=data.get("text", ""))
|
|
161
|
+
|
|
154
162
|
else:
|
|
155
163
|
raise ValueError(f"Unsupported action type: {action_type}")
|
|
156
164
|
except AssertionError:
|
hud/adapters/common/types.py
CHANGED
|
@@ -82,6 +82,12 @@ class DragAction(CLAAction):
|
|
|
82
82
|
hold_keys: list[CLAKey] | None = None
|
|
83
83
|
|
|
84
84
|
|
|
85
|
+
# RESPONSE ACTION from agent
|
|
86
|
+
class ResponseAction(CLAAction):
|
|
87
|
+
type: Literal["response"] = "response"
|
|
88
|
+
text: str # The final textual response from the agent
|
|
89
|
+
|
|
90
|
+
|
|
85
91
|
# SCREENSHOT ACTION
|
|
86
92
|
class ScreenshotFetch(CLAAction):
|
|
87
93
|
type: Literal["screenshot"] = "screenshot"
|
|
@@ -103,6 +109,7 @@ CLA = Annotated[
|
|
|
103
109
|
| KeyDownAction
|
|
104
110
|
| KeyUpAction
|
|
105
111
|
| TypeAction
|
|
112
|
+
| ResponseAction
|
|
106
113
|
| ScrollAction
|
|
107
114
|
| MoveAction
|
|
108
115
|
| WaitAction
|
hud/adapters/operator/adapter.py
CHANGED
|
@@ -10,6 +10,7 @@ from hud.adapters.common.types import (
|
|
|
10
10
|
MoveAction,
|
|
11
11
|
Point,
|
|
12
12
|
PressAction,
|
|
13
|
+
ResponseAction,
|
|
13
14
|
ScreenshotFetch,
|
|
14
15
|
ScrollAction,
|
|
15
16
|
TypeAction,
|
|
@@ -86,6 +87,9 @@ class OperatorAdapter(Adapter):
|
|
|
86
87
|
|
|
87
88
|
elif action_type == "screenshot":
|
|
88
89
|
return ScreenshotFetch()
|
|
90
|
+
|
|
91
|
+
elif action_type == "response":
|
|
92
|
+
return ResponseAction(text=data.get("text", ""))
|
|
89
93
|
else:
|
|
90
94
|
raise ValueError(f"Unsupported action type: {action_type}")
|
|
91
95
|
|
hud/agent/claude.py
CHANGED
|
@@ -11,7 +11,7 @@ from anthropic.types.beta import (
|
|
|
11
11
|
BetaImageBlockParam,
|
|
12
12
|
)
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
from hud.adapters import Adapter
|
|
15
15
|
from hud.agent.base import Agent
|
|
16
16
|
from hud.adapters.claude import ClaudeAdapter
|
|
17
17
|
from hud.env.environment import Observation
|
|
@@ -61,7 +61,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
61
61
|
def __init__(
|
|
62
62
|
self,
|
|
63
63
|
client: AsyncAnthropic | None = None,
|
|
64
|
-
adapter:
|
|
64
|
+
adapter: Adapter | None = None,
|
|
65
65
|
model: str = "claude-3-7-sonnet-20250219",
|
|
66
66
|
max_tokens: int = 4096,
|
|
67
67
|
max_iterations: int = 10,
|
|
@@ -85,6 +85,8 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
85
85
|
|
|
86
86
|
# Create client
|
|
87
87
|
client = AsyncAnthropic(api_key=api_key)
|
|
88
|
+
|
|
89
|
+
adapter = adapter or ClaudeAdapter()
|
|
88
90
|
|
|
89
91
|
super().__init__(client=client, adapter=adapter)
|
|
90
92
|
|
|
@@ -184,4 +186,22 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
|
|
|
184
186
|
done = False
|
|
185
187
|
break
|
|
186
188
|
|
|
189
|
+
# If no tool use action was found, check for a final text response
|
|
190
|
+
if not actions and done:
|
|
191
|
+
final_text_response = ""
|
|
192
|
+
for block in response_content:
|
|
193
|
+
if block.type == "text":
|
|
194
|
+
final_text_response += block.text
|
|
195
|
+
|
|
196
|
+
if final_text_response.strip():
|
|
197
|
+
logger.info(f"No tool use found. Using final text as response: {final_text_response}")
|
|
198
|
+
actions = [{
|
|
199
|
+
"action": "response",
|
|
200
|
+
"text": final_text_response.strip()
|
|
201
|
+
}]
|
|
202
|
+
# Keep done = True
|
|
203
|
+
else:
|
|
204
|
+
logger.info("No tool use and no final text block found.")
|
|
205
|
+
# Keep done = True, actions remains empty
|
|
206
|
+
|
|
187
207
|
return actions, done
|
hud/agent/operator.py
CHANGED
|
@@ -9,9 +9,11 @@ from openai.types.responses import (
|
|
|
9
9
|
ResponseInputParam,
|
|
10
10
|
ResponseInputItemParam,
|
|
11
11
|
ResponseOutputMessage,
|
|
12
|
-
ResponseComputerToolCall
|
|
12
|
+
ResponseComputerToolCall,
|
|
13
|
+
ResponseOutputText
|
|
13
14
|
)
|
|
14
15
|
|
|
16
|
+
from hud.adapters import Adapter
|
|
15
17
|
from hud.agent.base import Agent
|
|
16
18
|
from hud.adapters.operator import OperatorAdapter
|
|
17
19
|
from hud.env.environment import Observation
|
|
@@ -32,7 +34,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
|
|
|
32
34
|
client: OpenAI | None = None,
|
|
33
35
|
model: str = "computer-use-preview",
|
|
34
36
|
environment: Literal["windows", "mac", "linux", "browser"] = "windows",
|
|
35
|
-
adapter:
|
|
37
|
+
adapter: Adapter | None = None,
|
|
36
38
|
max_iterations: int = 8
|
|
37
39
|
):
|
|
38
40
|
"""
|
|
@@ -54,6 +56,8 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
|
|
|
54
56
|
|
|
55
57
|
# Create synchronous client
|
|
56
58
|
client = OpenAI(api_key=api_key)
|
|
59
|
+
|
|
60
|
+
adapter = adapter or OperatorAdapter()
|
|
57
61
|
|
|
58
62
|
super().__init__(client=client, adapter=adapter)
|
|
59
63
|
|
|
@@ -74,7 +78,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
|
|
|
74
78
|
self.last_response_id = None
|
|
75
79
|
self.pending_call_id = None
|
|
76
80
|
self.initial_prompt = None
|
|
77
|
-
|
|
81
|
+
|
|
78
82
|
async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
|
|
79
83
|
"""
|
|
80
84
|
Fetch a response from the model based on the observation.
|
|
@@ -158,33 +162,47 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
|
|
|
158
162
|
# Store the response ID for the next call
|
|
159
163
|
self.last_response_id = response.id
|
|
160
164
|
|
|
161
|
-
# Process the response to extract
|
|
165
|
+
# Process the response to extract actions or final text
|
|
162
166
|
actions = []
|
|
163
|
-
done = True # Assume
|
|
164
|
-
|
|
165
|
-
|
|
167
|
+
done = True # Assume done unless a computer call is found
|
|
168
|
+
final_text_response = ""
|
|
169
|
+
|
|
170
|
+
# Check for computer calls first
|
|
166
171
|
computer_calls = [
|
|
167
172
|
item for item in response.output
|
|
168
173
|
if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
|
|
169
174
|
]
|
|
170
175
|
|
|
171
176
|
if computer_calls:
|
|
172
|
-
#
|
|
177
|
+
# If computer calls exist, process them and set done=False
|
|
173
178
|
done = False
|
|
174
|
-
|
|
175
|
-
# Process all computer calls
|
|
176
179
|
for computer_call in computer_calls:
|
|
177
180
|
self.pending_call_id = computer_call.call_id
|
|
178
181
|
action = computer_call.action
|
|
179
|
-
actions.append(action.model_dump())
|
|
180
|
-
|
|
181
|
-
# Log the action
|
|
182
|
+
actions.append(action.model_dump()) # Convert Pydantic model to dict
|
|
182
183
|
logger.info(f"Computer call action: {action}")
|
|
183
184
|
else:
|
|
184
|
-
#
|
|
185
|
-
logger.info("No computer call found
|
|
185
|
+
# No computer calls, check for a final text message
|
|
186
|
+
logger.info("No computer call found. Checking for final message.")
|
|
187
|
+
logger.info(response.output)
|
|
186
188
|
for item in response.output:
|
|
187
189
|
if isinstance(item, ResponseOutputMessage) and item.type == "message":
|
|
188
|
-
|
|
189
|
-
|
|
190
|
+
# Extract text from content blocks within the message
|
|
191
|
+
full_text = "".join([c.text for c in item.content if isinstance(c, ResponseOutputText)])
|
|
192
|
+
if full_text:
|
|
193
|
+
final_text_response = full_text
|
|
194
|
+
logger.info(f"Final text message: {final_text_response}")
|
|
195
|
+
break # Stop after finding the first text message
|
|
196
|
+
|
|
197
|
+
# If we found final text, package it as a 'response' action
|
|
198
|
+
if final_text_response:
|
|
199
|
+
actions = [{
|
|
200
|
+
"type": "response",
|
|
201
|
+
"text": final_text_response
|
|
202
|
+
}]
|
|
203
|
+
# Keep done = True
|
|
204
|
+
else:
|
|
205
|
+
logger.info("No computer calls and no final text message found.")
|
|
206
|
+
# Keep done = True, actions remains empty
|
|
207
|
+
|
|
190
208
|
return actions, done
|
hud/env/docker_client.py
CHANGED
|
@@ -215,7 +215,7 @@ class DockerClient(Client):
|
|
|
215
215
|
raise ValueError("Could not find package name in pyproject.toml")
|
|
216
216
|
logger.info("Installing %s in /root/controller", self._package_name)
|
|
217
217
|
result = await self.execute(
|
|
218
|
-
["bash", "-c", "cd /root/controller && pip install -e ."],
|
|
218
|
+
["bash", "-c", "cd /root/controller && pip install -e . --break-system-packages"],
|
|
219
219
|
timeout=60,
|
|
220
220
|
)
|
|
221
221
|
if result["stdout"]:
|
hud/env/environment.py
CHANGED
|
@@ -10,14 +10,13 @@ from pydantic import BaseModel
|
|
|
10
10
|
from hud.env.client import Client
|
|
11
11
|
from hud.env.remote_client import RemoteClient
|
|
12
12
|
from hud.task import Task
|
|
13
|
-
from hud.utils import
|
|
14
|
-
from hud.utils.config import REMOTE_EVALUATE,
|
|
15
|
-
|
|
16
|
-
if TYPE_CHECKING:
|
|
17
|
-
from hud.adapters.common import CLA
|
|
13
|
+
from hud.utils.common import HudStyleConfig, HudStyleConfigs
|
|
14
|
+
from hud.utils.config import REMOTE_EVALUATE, REMOTE_FUNCTION_PREFIX, REMOTE_SETUP, expand_config
|
|
18
15
|
|
|
19
16
|
logger = logging.getLogger("hud.environment")
|
|
20
17
|
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from hud.adapters.common import CLA
|
|
21
20
|
|
|
22
21
|
class Observation(BaseModel):
|
|
23
22
|
"""
|
|
@@ -46,6 +45,9 @@ class Environment(BaseModel):
|
|
|
46
45
|
task: Task | None = None
|
|
47
46
|
build_data: dict[str, Any]
|
|
48
47
|
|
|
48
|
+
# final response
|
|
49
|
+
final_response: str | None = None
|
|
50
|
+
|
|
49
51
|
async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]:
|
|
50
52
|
# Execute each config and collect results
|
|
51
53
|
configs_all = [configs] if not isinstance(configs, list) else configs
|
|
@@ -76,7 +78,7 @@ class Environment(BaseModel):
|
|
|
76
78
|
config: The configuration to use for the setup
|
|
77
79
|
"""
|
|
78
80
|
if isinstance(self.client, RemoteClient):
|
|
79
|
-
await self._invoke_all(create_remote_config(self
|
|
81
|
+
await self._invoke_all(create_remote_config(self, config, REMOTE_SETUP))
|
|
80
82
|
else:
|
|
81
83
|
if config is not None:
|
|
82
84
|
await self._invoke_all(config)
|
|
@@ -97,7 +99,7 @@ class Environment(BaseModel):
|
|
|
97
99
|
"""
|
|
98
100
|
if isinstance(self.client, RemoteClient):
|
|
99
101
|
results = await self._invoke_all(
|
|
100
|
-
create_remote_config(self
|
|
102
|
+
create_remote_config(self, config, REMOTE_EVALUATE))
|
|
101
103
|
else:
|
|
102
104
|
if config is not None:
|
|
103
105
|
results = await self._invoke_all(config)
|
|
@@ -143,9 +145,14 @@ class Environment(BaseModel):
|
|
|
143
145
|
"""
|
|
144
146
|
if actions is None or len(actions) == 0:
|
|
145
147
|
actions = []
|
|
146
|
-
|
|
148
|
+
args = [[action.model_dump() for action in actions]]
|
|
149
|
+
|
|
150
|
+
# TODO: Move this into the server side
|
|
151
|
+
if self._maybe_store_response(actions):
|
|
152
|
+
return Observation(text=self.final_response), 0, False, {}
|
|
153
|
+
|
|
147
154
|
result, stdout, stderr = await self.client.invoke(
|
|
148
|
-
HudStyleConfig(function="step", args=
|
|
155
|
+
HudStyleConfig(function="step", args=args)
|
|
149
156
|
)
|
|
150
157
|
if stdout:
|
|
151
158
|
logger.info("Step produced stdout: %s", stdout.decode())
|
|
@@ -156,6 +163,21 @@ class Environment(BaseModel):
|
|
|
156
163
|
observation = Observation.model_validate(result["observation"], strict=True)
|
|
157
164
|
|
|
158
165
|
return observation, 0, False, {}
|
|
166
|
+
|
|
167
|
+
def _maybe_store_response(self, actions: list[CLA]) -> bool:
|
|
168
|
+
"""Store the final response into the environment.
|
|
169
|
+
|
|
170
|
+
Args:
|
|
171
|
+
actions: The action(s) to check
|
|
172
|
+
|
|
173
|
+
Returns:
|
|
174
|
+
bool: True if the response was submitted, False otherwise
|
|
175
|
+
"""
|
|
176
|
+
if len(actions) > 0 and actions[-1].type == "response":
|
|
177
|
+
self.final_response = actions[-1].text
|
|
178
|
+
return True
|
|
179
|
+
return False
|
|
180
|
+
|
|
159
181
|
|
|
160
182
|
async def get_urls(self) -> dict[str, Any]:
|
|
161
183
|
"""Get URLs for the environment.
|
|
@@ -179,3 +201,154 @@ class Environment(BaseModel):
|
|
|
179
201
|
This should release any resources and clean up the environment.
|
|
180
202
|
"""
|
|
181
203
|
await self.client.close()
|
|
204
|
+
|
|
205
|
+
def create_remote_config(
|
|
206
|
+
env: Environment | None = None,
|
|
207
|
+
config: HudStyleConfigs | None = None,
|
|
208
|
+
function: str | None = None,
|
|
209
|
+
) -> list[HudStyleConfig]:
|
|
210
|
+
"""
|
|
211
|
+
Create a remote configuration for setup or evaluate, determining the final
|
|
212
|
+
function call structure based on the provided task or explicit config.
|
|
213
|
+
|
|
214
|
+
This function orchestrates how setup and evaluate steps defined in a Task
|
|
215
|
+
or passed directly are prepared for remote execution via `env._invoke_all`.
|
|
216
|
+
|
|
217
|
+
Args:
|
|
218
|
+
env: Environment object, potentially containing a task definition.
|
|
219
|
+
Used to access `env.task` and `env.final_response`.
|
|
220
|
+
config: Direct configuration override (e.g., passed to `env.evaluate(config=...)`).
|
|
221
|
+
Can be in various HudStyleConfigs formats.
|
|
222
|
+
function: The top-level function context, typically "setup" or "evaluate".
|
|
223
|
+
|
|
224
|
+
Returns:
|
|
225
|
+
list[HudStyleConfig]: A list containing a single HudStyleConfig object
|
|
226
|
+
ready for remote invocation via `client.invoke`.
|
|
227
|
+
The specific function/arguments are chosen based on this priority:
|
|
228
|
+
1. Explicit `config` parameter (if provided).
|
|
229
|
+
2. Specific `task` attribute (e.g., `task.evaluate`).
|
|
230
|
+
3. General `task.config` dictionary.
|
|
231
|
+
4. Default private function using `task.id`
|
|
232
|
+
(e.g., `private_evaluate(task.id)`).
|
|
233
|
+
5. Base `function` name with minimal/default arguments.
|
|
234
|
+
|
|
235
|
+
Logic & Examples (Assuming `function="evaluate"` for examples):
|
|
236
|
+
|
|
237
|
+
1) Explicit `config` provided: The `config` is expanded and becomes the `args`
|
|
238
|
+
for the top-level `function` call. If the environment has a final_response,
|
|
239
|
+
it's appended to these args.
|
|
240
|
+
- Example Input:
|
|
241
|
+
`env` (with `final_response="Paris"`)
|
|
242
|
+
`config=("contains_text", "Paris")`
|
|
243
|
+
`function="evaluate"`
|
|
244
|
+
- Example Output:
|
|
245
|
+
`[HudStyleConfig(function='evaluate', args=[
|
|
246
|
+
HudStyleConfig(function='contains_text', args=['Paris', 'Paris'])
|
|
247
|
+
])]`
|
|
248
|
+
|
|
249
|
+
2) No explicit `config`, Task has the attribute (e.g., `task.evaluate`):
|
|
250
|
+
The Task's attribute value (e.g., `task.evaluate`) is expanded and becomes the `args`
|
|
251
|
+
for the top-level `function` call. Task ID is added if present. `final_response` is
|
|
252
|
+
appended if present.
|
|
253
|
+
- Example Input:
|
|
254
|
+
`env` (`task=Task(id="t1", evaluate=("check_answer",), ...)`, `final_response="42"`)
|
|
255
|
+
`config=None`
|
|
256
|
+
`function="evaluate"`
|
|
257
|
+
- Example Output:
|
|
258
|
+
`[HudStyleConfig(function='evaluate', args=[HudStyleConfig(function='check_answer',
|
|
259
|
+
args=['42'], id='t1')])]`
|
|
260
|
+
|
|
261
|
+
3) No explicit `config`, no specific Task attribute, Task has `task.config`:
|
|
262
|
+
The `task.config` dictionary becomes the single argument for the top-level
|
|
263
|
+
`function` call. Task ID is added to the config dict if present. `final_response` is
|
|
264
|
+
appended if present.
|
|
265
|
+
- Example Input:
|
|
266
|
+
`env` (with `task=Task(id="t2", config={"expected": "val"}, ...)`)
|
|
267
|
+
`config=None`
|
|
268
|
+
`function="evaluate"`
|
|
269
|
+
- Example Output:
|
|
270
|
+
`[HudStyleConfig(function='evaluate', args=[{"expected": "val", "id": "t2"}])]`
|
|
271
|
+
|
|
272
|
+
4) No explicit `config`, no specific Task attribute, no `task.config`, Task has `task.id`:
|
|
273
|
+
Calls a private function (`private_<function>`) on the remote end, passing
|
|
274
|
+
the `task.id` as the only argument.
|
|
275
|
+
- Example Input:
|
|
276
|
+
`env` (with `task=Task(id="t3", ...)`)
|
|
277
|
+
`config=None`
|
|
278
|
+
`function="evaluate"`
|
|
279
|
+
- Example Output:
|
|
280
|
+
`[HudStyleConfig(function='private_evaluate', args=['t3'])]`
|
|
281
|
+
|
|
282
|
+
5) No explicit `config` and no relevant Task info:
|
|
283
|
+
Calls the top-level `function` with empty args.
|
|
284
|
+
- Example Input:
|
|
285
|
+
`env` (with `task=Task(...)`)
|
|
286
|
+
`config=None`
|
|
287
|
+
`function="evaluate"`
|
|
288
|
+
- Example Output:
|
|
289
|
+
`[HudStyleConfig(function='evaluate', args=[])]`
|
|
290
|
+
"""
|
|
291
|
+
# If no function provided, just expand the config and return it directly
|
|
292
|
+
if function is None:
|
|
293
|
+
if config:
|
|
294
|
+
return expand_config(config)
|
|
295
|
+
raise ValueError("Either function or config must be provided")
|
|
296
|
+
|
|
297
|
+
# Case 1: Explicit config provided
|
|
298
|
+
if config:
|
|
299
|
+
expanded_configs = expand_config(config)
|
|
300
|
+
if env and env.final_response:
|
|
301
|
+
# Ensure args is a list before appending
|
|
302
|
+
if not isinstance(expanded_configs[0].args, list):
|
|
303
|
+
expanded_configs[0].args = [expanded_configs[0].args]
|
|
304
|
+
expanded_configs[0].args.append(env.final_response) # for remote responses
|
|
305
|
+
return [HudStyleConfig(function=function, args=expanded_configs)]
|
|
306
|
+
|
|
307
|
+
# Otherwise, use the environment's task
|
|
308
|
+
task = env.task if env else None
|
|
309
|
+
|
|
310
|
+
# Must have a task for the remaining cases
|
|
311
|
+
if task is None:
|
|
312
|
+
raise ValueError("Either task or config must be provided")
|
|
313
|
+
|
|
314
|
+
# Case 2: Task has the specified function attribute
|
|
315
|
+
task_config = getattr(task, function, None)
|
|
316
|
+
if task_config:
|
|
317
|
+
expanded_configs = expand_config(task_config)
|
|
318
|
+
if task.id:
|
|
319
|
+
expanded_configs[0].id = task.id # for remote IDs
|
|
320
|
+
elif env and env.final_response:
|
|
321
|
+
# Ensure args is a list before appending
|
|
322
|
+
if not isinstance(expanded_configs[0].args, list):
|
|
323
|
+
expanded_configs[0].args = [expanded_configs[0].args]
|
|
324
|
+
expanded_configs[0].args.append(env.final_response) # for remote responses
|
|
325
|
+
return [HudStyleConfig(function=function, args=expanded_configs)]
|
|
326
|
+
|
|
327
|
+
# Case 3: Check for task.config
|
|
328
|
+
if hasattr(task, "config") and task.config:
|
|
329
|
+
# Ensure task.config is a dictionary before adding id
|
|
330
|
+
final_args = task.config.copy() if isinstance(task.config, dict) else {}
|
|
331
|
+
if task.id:
|
|
332
|
+
final_args["id"] = task.id # for remote IDs
|
|
333
|
+
if env and env.final_response:
|
|
334
|
+
# Append response, ensuring args exists and is a list
|
|
335
|
+
if "args" not in final_args:
|
|
336
|
+
final_args["args"] = []
|
|
337
|
+
if not isinstance(final_args["args"], list):
|
|
338
|
+
final_args["args"] = [final_args["args"]]
|
|
339
|
+
final_args["args"].append(env.final_response)
|
|
340
|
+
return [HudStyleConfig(function=function, args=[final_args])]
|
|
341
|
+
|
|
342
|
+
# Case 4: Use task.id
|
|
343
|
+
if task.id:
|
|
344
|
+
args_list = [task.id]
|
|
345
|
+
if env and env.final_response:
|
|
346
|
+
args_list.append(env.final_response) # Append final response
|
|
347
|
+
return [HudStyleConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=args_list)]
|
|
348
|
+
|
|
349
|
+
# Case 5: No valid configuration found
|
|
350
|
+
args_list = []
|
|
351
|
+
if env and env.final_response:
|
|
352
|
+
args_list.append(env.final_response)
|
|
353
|
+
return [HudStyleConfig(function=function, args=args_list)]
|
|
354
|
+
|
hud/env/local_docker_client.py
CHANGED
|
@@ -25,7 +25,9 @@ class LocalDockerClient(DockerClient):
|
|
|
25
25
|
"""
|
|
26
26
|
|
|
27
27
|
@classmethod
|
|
28
|
-
async def create(cls, dockerfile: str, ports: list[int] | None = None) -> tuple[
|
|
28
|
+
async def create(cls, dockerfile: str, ports: list[int] | None = None) -> tuple[
|
|
29
|
+
LocalDockerClient, dict[str, Any]
|
|
30
|
+
]:
|
|
29
31
|
"""
|
|
30
32
|
Creates a Docker environment client from a dockerfile.
|
|
31
33
|
|
hud/task.py
CHANGED
|
@@ -5,8 +5,7 @@ from typing import TYPE_CHECKING, Any
|
|
|
5
5
|
from pydantic import BaseModel
|
|
6
6
|
|
|
7
7
|
from hud.types import CustomGym, Gym
|
|
8
|
-
from hud.utils import HudStyleConfig
|
|
9
|
-
from hud.utils.config import HudStyleConfigs
|
|
8
|
+
from hud.utils.common import HudStyleConfig, HudStyleConfigs
|
|
10
9
|
|
|
11
10
|
if TYPE_CHECKING:
|
|
12
11
|
from inspect_ai.dataset import Sample
|
|
@@ -35,7 +34,7 @@ class Task(BaseModel):
|
|
|
35
34
|
|
|
36
35
|
The setup and evaluate configurations can be in several formats:
|
|
37
36
|
- String (function name): "chrome.maximize"
|
|
38
|
-
-
|
|
37
|
+
- Tuple (function with args): ("chrome.activate_tab", 5)
|
|
39
38
|
- Dict: {"function": "chrome.navigate", "args": ["https://example.com"]}
|
|
40
39
|
- List of the above: ["chrome.maximize", {"function": "chrome.navigate", "args": ["https://example.com"]}]
|
|
41
40
|
|
|
@@ -68,15 +67,15 @@ class Task(BaseModel):
|
|
|
68
67
|
@classmethod
|
|
69
68
|
def from_inspect_sample(cls, sample: Sample) -> Task:
|
|
70
69
|
"""Create a Task from an Inspect dataset sample.
|
|
71
|
-
|
|
72
|
-
|
|
70
|
+
Automatically detects if a CustomGym (docker) or QA Gym is needed based on sample.sandbox.
|
|
71
|
+
Configures evaluation using 'response_includes' or 'match_all' based on sample.target.
|
|
73
72
|
|
|
74
73
|
Args:
|
|
75
74
|
sample: An Inspect dataset Sample object
|
|
76
75
|
|
|
77
76
|
Returns:
|
|
78
77
|
Task instance
|
|
79
|
-
|
|
78
|
+
|
|
80
79
|
The Inspect Sample has these fields:
|
|
81
80
|
- input (str | list[ChatMessage]): The input to be submitted to the model
|
|
82
81
|
- choices (list[str] | None): Optional multiple choice answer list
|
|
@@ -87,10 +86,8 @@ class Task(BaseModel):
|
|
|
87
86
|
- files (dict[str, str] | None): Optional files that go with the sample
|
|
88
87
|
- setup (str | None): Optional setup script to run for sample
|
|
89
88
|
"""
|
|
90
|
-
# Extract the input as prompt
|
|
91
89
|
prompt = sample.input
|
|
92
|
-
if isinstance(prompt, list):
|
|
93
|
-
# Convert chat message list to a string representation
|
|
90
|
+
if isinstance(prompt, list):
|
|
94
91
|
prompt_parts = []
|
|
95
92
|
for message in prompt:
|
|
96
93
|
role = message.role
|
|
@@ -98,36 +95,50 @@ class Task(BaseModel):
|
|
|
98
95
|
prompt_parts.append(f"{role.capitalize()}: {content}")
|
|
99
96
|
prompt = "\n\n".join(prompt_parts)
|
|
100
97
|
|
|
101
|
-
|
|
98
|
+
evaluate_config = None
|
|
99
|
+
if sample.target:
|
|
100
|
+
if isinstance(sample.target, str):
|
|
101
|
+
evaluate_config = ("response_includes", [sample.target])
|
|
102
|
+
elif isinstance(sample.target, list):
|
|
103
|
+
evaluate_config = ("match_all", sample.target)
|
|
104
|
+
|
|
105
|
+
task_gym: Gym | None = None
|
|
106
|
+
task_setup: HudStyleConfigs | None = None
|
|
107
|
+
|
|
102
108
|
sandbox = sample.sandbox
|
|
103
109
|
dockerfile = None
|
|
110
|
+
use_qa_gym = True
|
|
111
|
+
|
|
104
112
|
if sandbox:
|
|
105
113
|
if isinstance(sandbox, str):
|
|
106
|
-
if sandbox
|
|
107
|
-
|
|
114
|
+
if sandbox == "docker":
|
|
115
|
+
dockerfile = UBUNTU_DOCKERFILE
|
|
116
|
+
use_qa_gym = False
|
|
108
117
|
elif isinstance(sandbox, tuple) and len(sandbox) == 2:
|
|
109
118
|
sandbox_type, sandbox_config = sandbox
|
|
110
|
-
if sandbox_type
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
119
|
+
if sandbox_type == "docker":
|
|
120
|
+
dockerfile = sandbox_config
|
|
121
|
+
use_qa_gym = False
|
|
122
|
+
|
|
123
|
+
if use_qa_gym:
|
|
124
|
+
task_gym = "qa"
|
|
125
|
+
task_setup = None
|
|
126
|
+
else:
|
|
127
|
+
task_gym = CustomGym(
|
|
128
|
+
dockerfile=dockerfile or UBUNTU_DOCKERFILE,
|
|
129
|
+
location="local",
|
|
130
|
+
)
|
|
131
|
+
task_setup = [x for x in convert_inspect_setup(sample.setup)] if sample.setup else None
|
|
132
|
+
# TODO: Handle sample.files for CustomGym case if needed
|
|
133
|
+
|
|
120
134
|
|
|
121
135
|
return cls(
|
|
122
|
-
id=
|
|
136
|
+
id=None,
|
|
123
137
|
prompt=prompt,
|
|
124
|
-
setup=
|
|
138
|
+
setup=task_setup,
|
|
125
139
|
metadata=sample.metadata,
|
|
126
140
|
choices=sample.choices,
|
|
127
|
-
|
|
128
|
-
gym=
|
|
141
|
+
evaluate=evaluate_config,
|
|
142
|
+
gym=task_gym,
|
|
143
|
+
# files=sample.files, # TODO: Decide how/if to handle files
|
|
129
144
|
)
|
|
130
|
-
|
|
131
|
-
def convert_sdk01(self) -> None:
|
|
132
|
-
self.setup = [HudStyleConfig(function="reset", args=[{"task_id": self.id}])]
|
|
133
|
-
self.evaluate = [HudStyleConfig(function="evaluate", args=[])]
|
hud/taskset.py
CHANGED
|
@@ -9,6 +9,8 @@ from hud.settings import settings
|
|
|
9
9
|
from hud.task import Task
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Iterator
|
|
13
|
+
|
|
12
14
|
from inspect_ai.dataset import Dataset
|
|
13
15
|
|
|
14
16
|
|
|
@@ -49,6 +51,12 @@ class TaskSet(BaseModel):
|
|
|
49
51
|
"""
|
|
50
52
|
return len(self.tasks)
|
|
51
53
|
|
|
54
|
+
def __iter__(self) -> Iterator[Task]:
|
|
55
|
+
"""
|
|
56
|
+
Returns an iterator over the tasks in the taskset.
|
|
57
|
+
"""
|
|
58
|
+
return iter(self.tasks)
|
|
59
|
+
|
|
52
60
|
|
|
53
61
|
async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
|
|
54
62
|
"""
|
hud/utils/common.py
CHANGED
|
@@ -3,16 +3,43 @@ from __future__ import annotations
|
|
|
3
3
|
import io
|
|
4
4
|
import logging
|
|
5
5
|
import tarfile
|
|
6
|
-
from typing import TYPE_CHECKING, TypedDict
|
|
6
|
+
from typing import TYPE_CHECKING, Any, TypedDict
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel
|
|
7
9
|
|
|
8
10
|
from hud.server.requests import make_request
|
|
9
11
|
from hud.settings import settings
|
|
10
12
|
|
|
11
13
|
if TYPE_CHECKING:
|
|
14
|
+
from collections.abc import Iterator
|
|
12
15
|
from pathlib import Path
|
|
13
16
|
|
|
14
17
|
logger = logging.getLogger("hud.utils.common")
|
|
15
18
|
|
|
19
|
+
class HudStyleConfig(BaseModel):
|
|
20
|
+
function: str # Format: "x.y.z"
|
|
21
|
+
args: list[Any] # Must be json serializable
|
|
22
|
+
|
|
23
|
+
id: str | None = None # Optional id for remote execution
|
|
24
|
+
|
|
25
|
+
def __len__(self) -> int:
|
|
26
|
+
return len(self.args)
|
|
27
|
+
|
|
28
|
+
def __getitem__(self, index: int) -> Any:
|
|
29
|
+
return self.args[index]
|
|
30
|
+
|
|
31
|
+
def __iter__(self) -> Iterator[Any]:
|
|
32
|
+
return iter(self.args)
|
|
33
|
+
|
|
34
|
+
def __str__(self) -> str:
|
|
35
|
+
return f"{self.function}: {', '.join(str(arg) for arg in self.args)}"
|
|
36
|
+
|
|
37
|
+
# Type alias for the shorthand config, which just converts to function name and args
|
|
38
|
+
ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
|
|
39
|
+
|
|
40
|
+
# Type alias for multiple config formats
|
|
41
|
+
HudStyleConfigs = ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | dict[str, Any] | str
|
|
42
|
+
|
|
16
43
|
class ExecuteResult(TypedDict):
|
|
17
44
|
"""
|
|
18
45
|
Result of an execute command.
|
hud/utils/config.py
CHANGED
|
@@ -2,14 +2,8 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import re
|
|
5
|
-
from typing import TYPE_CHECKING, Any
|
|
6
5
|
|
|
7
|
-
from
|
|
8
|
-
|
|
9
|
-
if TYPE_CHECKING:
|
|
10
|
-
from collections.abc import Iterator
|
|
11
|
-
|
|
12
|
-
from hud.task import Task
|
|
6
|
+
from hud.utils.common import HudStyleConfig, HudStyleConfigs
|
|
13
7
|
|
|
14
8
|
logger = logging.getLogger("hud.utils.config")
|
|
15
9
|
|
|
@@ -17,30 +11,6 @@ REMOTE_FUNCTION_PREFIX = "private_"
|
|
|
17
11
|
REMOTE_SETUP = "setup"
|
|
18
12
|
REMOTE_EVALUATE = "evaluate"
|
|
19
13
|
|
|
20
|
-
class HudStyleConfig(BaseModel):
|
|
21
|
-
function: str # Format: "x.y.z"
|
|
22
|
-
args: list[Any] # Must be json serializable
|
|
23
|
-
|
|
24
|
-
id: str | None = None # Optional id for remote execution
|
|
25
|
-
|
|
26
|
-
def __len__(self) -> int:
|
|
27
|
-
return len(self.args)
|
|
28
|
-
|
|
29
|
-
def __getitem__(self, index: int) -> Any:
|
|
30
|
-
return self.args[index]
|
|
31
|
-
|
|
32
|
-
def __iter__(self) -> Iterator[Any]:
|
|
33
|
-
return iter(self.args)
|
|
34
|
-
|
|
35
|
-
def __str__(self) -> str:
|
|
36
|
-
return f"{self.function}: {', '.join(str(arg) for arg in self.args)}"
|
|
37
|
-
|
|
38
|
-
# Type alias for the shorthand config, which just converts to function name and args
|
|
39
|
-
ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
|
|
40
|
-
|
|
41
|
-
# Type alias for multiple config formats
|
|
42
|
-
HudStyleConfigs = ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | dict[str, Any] | str
|
|
43
|
-
|
|
44
14
|
def _is_valid_python_name(name: str) -> bool:
|
|
45
15
|
"""Check if a string is a valid Python identifier."""
|
|
46
16
|
return bool(re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", name))
|
|
@@ -122,64 +92,3 @@ def expand_config(config: HudStyleConfigs) -> list[HudStyleConfig]:
|
|
|
122
92
|
error_msg = f"Unknown configuration type: {type(config)}"
|
|
123
93
|
logger.error(error_msg)
|
|
124
94
|
raise ValueError(error_msg)
|
|
125
|
-
|
|
126
|
-
def create_remote_config(
|
|
127
|
-
task: Task | None = None,
|
|
128
|
-
config: HudStyleConfigs | None = None,
|
|
129
|
-
function: str | None = None,
|
|
130
|
-
) -> list[HudStyleConfig]:
|
|
131
|
-
"""
|
|
132
|
-
Create a configuration based on provided inputs.
|
|
133
|
-
|
|
134
|
-
Args:
|
|
135
|
-
task: Task object with configuration
|
|
136
|
-
config: Direct configuration (expanded or not)
|
|
137
|
-
function: Function name to use
|
|
138
|
-
|
|
139
|
-
Returns:
|
|
140
|
-
list[HudStyleConfig]: List of standardized configurations
|
|
141
|
-
|
|
142
|
-
Logic:
|
|
143
|
-
1) If explicit config: expand and return HudStyleConfig with func of the function,
|
|
144
|
-
and args of expanded config
|
|
145
|
-
2) If task has the specified function defined: use that
|
|
146
|
-
3) If no task function: check for task._config and use that
|
|
147
|
-
4) If no _config: use task.id and create private_[function]
|
|
148
|
-
"""
|
|
149
|
-
# If no function provided, just expand the config and return it directly
|
|
150
|
-
if function is None:
|
|
151
|
-
if config:
|
|
152
|
-
return expand_config(config)
|
|
153
|
-
raise ValueError("Either function or config must be provided")
|
|
154
|
-
|
|
155
|
-
# Case 1: Explicit config provided
|
|
156
|
-
if config:
|
|
157
|
-
expanded_configs = expand_config(config)
|
|
158
|
-
return [HudStyleConfig(function=function, args=expanded_configs)]
|
|
159
|
-
|
|
160
|
-
# Must have a task for the remaining cases
|
|
161
|
-
if task is None:
|
|
162
|
-
raise ValueError("Either task or config must be provided")
|
|
163
|
-
|
|
164
|
-
# Case 2: Task has the specified function attribute
|
|
165
|
-
task_config = getattr(task, function, None)
|
|
166
|
-
if task_config and len(task_config) > 0:
|
|
167
|
-
expanded_configs = expand_config(task_config)
|
|
168
|
-
if task.id:
|
|
169
|
-
expanded_configs[0].id = task.id # for remote IDs
|
|
170
|
-
return [HudStyleConfig(function=function, args=expanded_configs)]
|
|
171
|
-
|
|
172
|
-
# Case 3: Check for _config
|
|
173
|
-
if hasattr(task, "config") and task.config:
|
|
174
|
-
if task.id:
|
|
175
|
-
task.config["id"] = task.id # for remote IDs
|
|
176
|
-
return [HudStyleConfig(function=function, args=[task.config])]
|
|
177
|
-
|
|
178
|
-
# Case 4: Use task.id
|
|
179
|
-
if task.id:
|
|
180
|
-
return [HudStyleConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=[task.id])]
|
|
181
|
-
|
|
182
|
-
# No valid configuration found
|
|
183
|
-
#logger.warning("No valid configuration found for function: %s", function)
|
|
184
|
-
return [HudStyleConfig(function=function, args=[])]
|
|
185
|
-
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: SDK for the HUD evaluation platform.
|
|
5
|
-
Project-URL: Homepage, https://github.com/
|
|
6
|
-
Project-URL: Bug Tracker, https://github.com/
|
|
5
|
+
Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
|
|
7
7
|
Project-URL: Documentation, https://hud.so
|
|
8
8
|
Author-email: Human Union Data SDK <founders@hud.so>
|
|
9
9
|
License: MIT License
|
|
@@ -57,7 +57,7 @@ Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
|
|
|
57
57
|
Requires-Dist: ruff==0.9.8; extra == 'dev'
|
|
58
58
|
Description-Content-Type: text/markdown
|
|
59
59
|
|
|
60
|
-
# HUD
|
|
60
|
+
# HUD
|
|
61
61
|
|
|
62
62
|
A Python SDK for creating, evaluating, and benchmarking agent interactions with web browsers and OS environments.
|
|
63
63
|
|
|
@@ -86,21 +86,20 @@ export HUD_API_KEY=your_api_key_here
|
|
|
86
86
|
pip install hud-python
|
|
87
87
|
```
|
|
88
88
|
|
|
89
|
-
### Simple Browser Example with
|
|
89
|
+
### Simple Browser Example with Claude Computer Use
|
|
90
90
|
|
|
91
91
|
> This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
|
|
92
92
|
|
|
93
|
+
Make sure you have defined `ANTHROPIC_API_KEY` in your environment variables to run Claude.
|
|
94
|
+
|
|
93
95
|
```python
|
|
94
|
-
import os
|
|
95
96
|
import asyncio
|
|
96
97
|
from hud import gym, job
|
|
97
98
|
from hud.task import Task
|
|
98
|
-
from hud.
|
|
99
|
-
from hud.agent import OperatorAgent
|
|
99
|
+
from hud.agent import ClaudeAgent
|
|
100
100
|
|
|
101
101
|
@job("test-run")
|
|
102
102
|
async def main():
|
|
103
|
-
# Define a simple task
|
|
104
103
|
task = Task(
|
|
105
104
|
prompt="Insert the text 'capybara' into the search bar",
|
|
106
105
|
gym="hud-browser",
|
|
@@ -108,26 +107,20 @@ async def main():
|
|
|
108
107
|
evaluate=("contains_text", "capybara")
|
|
109
108
|
)
|
|
110
109
|
|
|
111
|
-
# Create environment
|
|
110
|
+
# Create environment using the gym module
|
|
112
111
|
env = await gym.make(task)
|
|
113
112
|
|
|
114
|
-
# Get URLs and display live view (optional)
|
|
115
|
-
# urls = await env.get_urls()
|
|
116
|
-
# stream(urls["live_url"])
|
|
117
|
-
|
|
118
113
|
# Initialize the agent (API key is loaded automatically)
|
|
119
|
-
agent =
|
|
114
|
+
agent = ClaudeAgent()
|
|
120
115
|
|
|
121
|
-
# Agent loop
|
|
122
|
-
obs, _ = env.reset()
|
|
116
|
+
# Agent loop with predict and step functions
|
|
117
|
+
obs, _ = await env.reset() # Gets first observation
|
|
123
118
|
for i in range(5):
|
|
124
119
|
actions, done = await agent.predict(obs)
|
|
125
120
|
if done:
|
|
126
121
|
break
|
|
127
122
|
|
|
128
123
|
obs, reward, terminated, info = await env.step(actions)
|
|
129
|
-
if terminated:
|
|
130
|
-
break
|
|
131
124
|
|
|
132
125
|
# Evaluate and close
|
|
133
126
|
result = await env.evaluate()
|
|
@@ -143,26 +136,26 @@ if __name__ == "__main__":
|
|
|
143
136
|
|
|
144
137
|
Explore the core concepts and features of the SDK:
|
|
145
138
|
|
|
146
|
-
* **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios.
|
|
139
|
+
* **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
|
|
147
140
|
* **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
|
|
148
141
|
* **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
|
|
149
142
|
* **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
|
|
150
143
|
* **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
|
|
151
144
|
* **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
|
|
152
145
|
* **Advanced Topics**:
|
|
146
|
+
* **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
|
|
153
147
|
* **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
|
|
154
148
|
* **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
|
|
155
|
-
* **[CLA Action Details](/advanced/cla-details)**: Dive deeper into the standardized action format.
|
|
156
149
|
|
|
157
150
|
* **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
|
|
158
151
|
|
|
159
152
|
## [Examples](examples/)
|
|
160
153
|
|
|
161
|
-
We
|
|
154
|
+
We recommend you first take a look at the example notebooks showing how to use the HUD SDK:
|
|
162
155
|
|
|
163
156
|
1. [Browser Basics](examples/browser_use.ipynb) - Simple browser interaction with live view
|
|
164
157
|
2. [Task Design](examples/tasks.ipynb) - Creating and customizing tasks
|
|
165
|
-
3. [OSWorld](examples/osworld.ipynb) -
|
|
158
|
+
3. [OSWorld](examples/osworld.ipynb) - Running the OSWorld benchmark
|
|
166
159
|
4. [Local Development](examples/local.ipynb) - Setting up local custom environments
|
|
167
160
|
|
|
168
161
|
## Documentation
|
|
@@ -180,9 +173,9 @@ If you use this SDK in your research, please cite it as follows:
|
|
|
180
173
|
```bibtex
|
|
181
174
|
@software{hud2025agentevalplatform,
|
|
182
175
|
author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
|
|
183
|
-
title = {{HUD: An Evaluation Platform for
|
|
184
|
-
date = {2025-
|
|
185
|
-
url = {https://github.com/
|
|
176
|
+
title = {{HUD: An Evaluation Platform for Agents}},
|
|
177
|
+
date = {2025-04},
|
|
178
|
+
url = {https://github.com/hud-evals/hud-sdk},
|
|
186
179
|
langid = {en}
|
|
187
180
|
}
|
|
188
181
|
```
|
|
@@ -1,28 +1,28 @@
|
|
|
1
|
-
hud/__init__.py,sha256=
|
|
1
|
+
hud/__init__.py,sha256=HFL1iwPhLZd7z--2QADzipur68XlekwGrOzU2vWL-Vw,464
|
|
2
2
|
hud/gym.py,sha256=cKjIuJS7A0vJx4K7fctpUjIEv8TkW5x6aB_PRrODrDY,3651
|
|
3
3
|
hud/job.py,sha256=E4RN1CkppRQVy46RWCUDjNIyhMa7lNlFfCgpky2vKFk,5463
|
|
4
4
|
hud/settings.py,sha256=rv8TiZx4wmBzIoEEkOzoywC0nt8UZXlHxIa_LW4tWAg,1346
|
|
5
|
-
hud/task.py,sha256=
|
|
6
|
-
hud/taskset.py,sha256=
|
|
5
|
+
hud/task.py,sha256=aNbHMlO7r1cm5DcO0QLU1SZ7EawOFw9W6DZwTNy72-4,5383
|
|
6
|
+
hud/taskset.py,sha256=xDPBXeDm4AlSOwl-MM98lN0x6PmGV8t9jv7sNyS_u0c,2426
|
|
7
7
|
hud/trajectory.py,sha256=PA-sE2iyt2BctO2Dex-2ZaRmS95AkEXTicZjHCVCYqE,3749
|
|
8
8
|
hud/types.py,sha256=fJZnzK3j3mq7G0gO5TbqRaN92qT4xAb4jUNOXIX8ZZ0,2395
|
|
9
9
|
hud/adapters/__init__.py,sha256=0RNQgrzBCkhNBq1Q7JRESN1WfUVLs_99fR5g1re3APs,207
|
|
10
10
|
hud/adapters/claude/__init__.py,sha256=i7QEF-29FLb9qxp1eYtXs-adIk_tG54tL-9g6d3xodk,100
|
|
11
|
-
hud/adapters/claude/adapter.py,sha256=
|
|
11
|
+
hud/adapters/claude/adapter.py,sha256=x0qQglWsg7n8DJ_NacsymlUQBnkpqNVguUlkQRpYX-A,5955
|
|
12
12
|
hud/adapters/common/__init__.py,sha256=BjdZWJVs_AKtpFrt-tNsdQRjnz7D97DFEQirJ-r0mp8,118
|
|
13
13
|
hud/adapters/common/adapter.py,sha256=ls-gXtg1N_SQc211rkDb3LL511HNZv6etm1nx2ZtrkQ,5808
|
|
14
|
-
hud/adapters/common/types.py,sha256=
|
|
14
|
+
hud/adapters/common/types.py,sha256=APxGEmoePwjF7OYXAKqBTVT73PJTFV0eBmbURbaT5xk,5091
|
|
15
15
|
hud/adapters/operator/__init__.py,sha256=31vTRs268_TOLd-TeQRKau5bDYy78wxCNpJFhD5_l8U,104
|
|
16
|
-
hud/adapters/operator/adapter.py,sha256=
|
|
16
|
+
hud/adapters/operator/adapter.py,sha256=svHgjCdUeMyfgfGzRO3ItGWTKGkm3tmldO2zfjX_sGI,3301
|
|
17
17
|
hud/agent/__init__.py,sha256=cI3bqfmG2_Lwzn2RjrxV0X9qIxCRDiffwd1UaWToct4,238
|
|
18
18
|
hud/agent/base.py,sha256=RThJ_h4A3oU23zyvvKtxY2a_YM03Vd1XYDXdY3bAf8g,3881
|
|
19
|
-
hud/agent/claude.py,sha256=
|
|
20
|
-
hud/agent/operator.py,sha256=
|
|
19
|
+
hud/agent/claude.py,sha256=tbDKAzGCLJPnUnHc8eV-zZmj3ZG6QQx0ukWKoO4Ekec,7445
|
|
20
|
+
hud/agent/operator.py,sha256=44t19TzcCrS1N3-rnD25ZLXx5s4Io8On27LomALuugs,8185
|
|
21
21
|
hud/env/__init__.py,sha256=BHFY_N0kEI142pjWtMyqUb3BGnoiekY8evRCIbSbO2w,271
|
|
22
22
|
hud/env/client.py,sha256=SPR6ct6NFxmIrgIi3K8tEC-vnqOmCbCBtuT81PaVjuY,869
|
|
23
|
-
hud/env/docker_client.py,sha256=
|
|
24
|
-
hud/env/environment.py,sha256=
|
|
25
|
-
hud/env/local_docker_client.py,sha256=
|
|
23
|
+
hud/env/docker_client.py,sha256=56_u3Ri4NulGcBumAg-7-KilmFmBKthOwEIM5bOLOZc,10418
|
|
24
|
+
hud/env/environment.py,sha256=Xyq4KQO9aWYPwZ0uESAetB5EEZgmlEnZVc7sA0DLz2c,13706
|
|
25
|
+
hud/env/local_docker_client.py,sha256=TCD9z1qjafxjwAWLatAL8d587_ioMDHjs8T5cBgusr8,7789
|
|
26
26
|
hud/env/remote_client.py,sha256=iJiwueuf98xOx0_Y2ltu_63BwKIKNvohhim73Goq74E,5804
|
|
27
27
|
hud/env/remote_docker_client.py,sha256=FwaO7NyygDt9oe3pDD7PwUS21pxzc465mwcXk-Cx-60,6838
|
|
28
28
|
hud/evaluators/__init__.py,sha256=XophB666xPnurhQ_ygfW44h0Jh0BQGCgUzCXEOG2Q1g,158
|
|
@@ -34,11 +34,11 @@ hud/evaluators/remote.py,sha256=NVUJJvrpGQj2eL-aFxzTEnAWW7iuSI9eDWtar54dc6E,2174
|
|
|
34
34
|
hud/server/__init__.py,sha256=cxDKTwMdGzhj7bYajtejN8XCt7K8Xq3eKB2No0qBpoY,169
|
|
35
35
|
hud/server/requests.py,sha256=s8LZZYWT1wl7lPu2vwRaYPZs9_gjKwSg3LZLvS5-s6E,9085
|
|
36
36
|
hud/utils/__init__.py,sha256=LnoI2tQUnd-mQ4eg-gpJJgmHBBIhggJ6c9ap7MBgrfs,260
|
|
37
|
-
hud/utils/common.py,sha256=
|
|
38
|
-
hud/utils/config.py,sha256=
|
|
37
|
+
hud/utils/common.py,sha256=XJZ-hKJkeaNmelG2QD5ybi9FpZQS1ErA40fAYzUSHVE,2742
|
|
38
|
+
hud/utils/config.py,sha256=ePi3GDo8mDUnOZ5G5HyMprqGRvxrxCMfixGNuTOA8rQ,3266
|
|
39
39
|
hud/utils/telemetry.py,sha256=md7AuKxtDqsONMeeTOHen1XpmNds8CbXROX_PnkDxFc,1993
|
|
40
40
|
hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
|
-
hud_python-0.2.
|
|
42
|
-
hud_python-0.2.
|
|
43
|
-
hud_python-0.2.
|
|
44
|
-
hud_python-0.2.
|
|
41
|
+
hud_python-0.2.1.dist-info/METADATA,sha256=f2lyqGmu9L7_zgCOqrhZ6ZX1JUU6Z0e92bRTfmojSqQ,7219
|
|
42
|
+
hud_python-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
43
|
+
hud_python-0.2.1.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
|
|
44
|
+
hud_python-0.2.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|