hud-python 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

hud/__init__.py CHANGED
@@ -8,7 +8,7 @@ from . import agent, env, gym, settings, task, taskset, types, utils
8
8
  from .job import create_job, job, load_job
9
9
  from .taskset import load_taskset
10
10
 
11
- __version__ = "0.2.0"
11
+ __version__ = "0.2.1"
12
12
 
13
13
  __all__ = [
14
14
  "agent",
@@ -13,6 +13,7 @@ from hud.adapters.common.types import (
13
13
  Point,
14
14
  PositionFetch,
15
15
  PressAction,
16
+ ResponseAction,
16
17
  ScreenshotFetch,
17
18
  ScrollAction,
18
19
  TypeAction,
@@ -21,7 +22,10 @@ from hud.adapters.common.types import (
21
22
 
22
23
 
23
24
  class ClaudeAdapter(Adapter):
24
- KEY_MAP: ClassVar[dict[str, CLAKey]] = {"Return": "enter"}
25
+ KEY_MAP: ClassVar[dict[str, CLAKey]] = {
26
+ "Return": "enter",
27
+ "Super": "win",
28
+ }
25
29
 
26
30
  def __init__(self) -> None:
27
31
  super().__init__()
@@ -151,6 +155,10 @@ class ClaudeAdapter(Adapter):
151
155
  elif action_type == "wait":
152
156
  assert "duration" in data
153
157
  return WaitAction(time=data["duration"])
158
+
159
+ elif action_type == "response":
160
+ return ResponseAction(text=data.get("text", ""))
161
+
154
162
  else:
155
163
  raise ValueError(f"Unsupported action type: {action_type}")
156
164
  except AssertionError:
@@ -82,6 +82,12 @@ class DragAction(CLAAction):
82
82
  hold_keys: list[CLAKey] | None = None
83
83
 
84
84
 
85
+ # RESPONSE ACTION from agent
86
+ class ResponseAction(CLAAction):
87
+ type: Literal["response"] = "response"
88
+ text: str # The final textual response from the agent
89
+
90
+
85
91
  # SCREENSHOT ACTION
86
92
  class ScreenshotFetch(CLAAction):
87
93
  type: Literal["screenshot"] = "screenshot"
@@ -103,6 +109,7 @@ CLA = Annotated[
103
109
  | KeyDownAction
104
110
  | KeyUpAction
105
111
  | TypeAction
112
+ | ResponseAction
106
113
  | ScrollAction
107
114
  | MoveAction
108
115
  | WaitAction
@@ -10,6 +10,7 @@ from hud.adapters.common.types import (
10
10
  MoveAction,
11
11
  Point,
12
12
  PressAction,
13
+ ResponseAction,
13
14
  ScreenshotFetch,
14
15
  ScrollAction,
15
16
  TypeAction,
@@ -86,6 +87,9 @@ class OperatorAdapter(Adapter):
86
87
 
87
88
  elif action_type == "screenshot":
88
89
  return ScreenshotFetch()
90
+
91
+ elif action_type == "response":
92
+ return ResponseAction(text=data.get("text", ""))
89
93
  else:
90
94
  raise ValueError(f"Unsupported action type: {action_type}")
91
95
 
hud/agent/claude.py CHANGED
@@ -11,7 +11,7 @@ from anthropic.types.beta import (
11
11
  BetaImageBlockParam,
12
12
  )
13
13
 
14
-
14
+ from hud.adapters import Adapter
15
15
  from hud.agent.base import Agent
16
16
  from hud.adapters.claude import ClaudeAdapter
17
17
  from hud.env.environment import Observation
@@ -61,7 +61,7 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
61
61
  def __init__(
62
62
  self,
63
63
  client: AsyncAnthropic | None = None,
64
- adapter: ClaudeAdapter | None = None,
64
+ adapter: Adapter | None = None,
65
65
  model: str = "claude-3-7-sonnet-20250219",
66
66
  max_tokens: int = 4096,
67
67
  max_iterations: int = 10,
@@ -85,6 +85,8 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
85
85
 
86
86
  # Create client
87
87
  client = AsyncAnthropic(api_key=api_key)
88
+
89
+ adapter = adapter or ClaudeAdapter()
88
90
 
89
91
  super().__init__(client=client, adapter=adapter)
90
92
 
@@ -184,4 +186,22 @@ class ClaudeAgent(Agent[AsyncAnthropic, Any]):
184
186
  done = False
185
187
  break
186
188
 
189
+ # If no tool use action was found, check for a final text response
190
+ if not actions and done:
191
+ final_text_response = ""
192
+ for block in response_content:
193
+ if block.type == "text":
194
+ final_text_response += block.text
195
+
196
+ if final_text_response.strip():
197
+ logger.info(f"No tool use found. Using final text as response: {final_text_response}")
198
+ actions = [{
199
+ "action": "response",
200
+ "text": final_text_response.strip()
201
+ }]
202
+ # Keep done = True
203
+ else:
204
+ logger.info("No tool use and no final text block found.")
205
+ # Keep done = True, actions remains empty
206
+
187
207
  return actions, done
hud/agent/operator.py CHANGED
@@ -9,9 +9,11 @@ from openai.types.responses import (
9
9
  ResponseInputParam,
10
10
  ResponseInputItemParam,
11
11
  ResponseOutputMessage,
12
- ResponseComputerToolCall
12
+ ResponseComputerToolCall,
13
+ ResponseOutputText
13
14
  )
14
15
 
16
+ from hud.adapters import Adapter
15
17
  from hud.agent.base import Agent
16
18
  from hud.adapters.operator import OperatorAdapter
17
19
  from hud.env.environment import Observation
@@ -32,7 +34,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
32
34
  client: OpenAI | None = None,
33
35
  model: str = "computer-use-preview",
34
36
  environment: Literal["windows", "mac", "linux", "browser"] = "windows",
35
- adapter: OperatorAdapter | None = None,
37
+ adapter: Adapter | None = None,
36
38
  max_iterations: int = 8
37
39
  ):
38
40
  """
@@ -54,6 +56,8 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
54
56
 
55
57
  # Create synchronous client
56
58
  client = OpenAI(api_key=api_key)
59
+
60
+ adapter = adapter or OperatorAdapter()
57
61
 
58
62
  super().__init__(client=client, adapter=adapter)
59
63
 
@@ -74,7 +78,7 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
74
78
  self.last_response_id = None
75
79
  self.pending_call_id = None
76
80
  self.initial_prompt = None
77
-
81
+
78
82
  async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
79
83
  """
80
84
  Fetch a response from the model based on the observation.
@@ -158,33 +162,47 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
158
162
  # Store the response ID for the next call
159
163
  self.last_response_id = response.id
160
164
 
161
- # Process the response to extract computer calls
165
+ # Process the response to extract actions or final text
162
166
  actions = []
163
- done = True # Assume we're done unless we find a computer call
164
-
165
- # Loop through all items in the output to find computer_call items
167
+ done = True # Assume done unless a computer call is found
168
+ final_text_response = ""
169
+
170
+ # Check for computer calls first
166
171
  computer_calls = [
167
172
  item for item in response.output
168
173
  if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
169
174
  ]
170
175
 
171
176
  if computer_calls:
172
- # Extract the computer calls and mark that we're not done
177
+ # If computer calls exist, process them and set done=False
173
178
  done = False
174
-
175
- # Process all computer calls
176
179
  for computer_call in computer_calls:
177
180
  self.pending_call_id = computer_call.call_id
178
181
  action = computer_call.action
179
- actions.append(action.model_dump())
180
-
181
- # Log the action
182
+ actions.append(action.model_dump()) # Convert Pydantic model to dict
182
183
  logger.info(f"Computer call action: {action}")
183
184
  else:
184
- # If there are no computer calls, print some debug info
185
- logger.info("No computer call found in the response. Either complete or error.")
185
+ # No computer calls, check for a final text message
186
+ logger.info("No computer call found. Checking for final message.")
187
+ logger.info(response.output)
186
188
  for item in response.output:
187
189
  if isinstance(item, ResponseOutputMessage) and item.type == "message":
188
- logger.info(f"Message: {item.content}")
189
-
190
+ # Extract text from content blocks within the message
191
+ full_text = "".join([c.text for c in item.content if isinstance(c, ResponseOutputText)])
192
+ if full_text:
193
+ final_text_response = full_text
194
+ logger.info(f"Final text message: {final_text_response}")
195
+ break # Stop after finding the first text message
196
+
197
+ # If we found final text, package it as a 'response' action
198
+ if final_text_response:
199
+ actions = [{
200
+ "type": "response",
201
+ "text": final_text_response
202
+ }]
203
+ # Keep done = True
204
+ else:
205
+ logger.info("No computer calls and no final text message found.")
206
+ # Keep done = True, actions remains empty
207
+
190
208
  return actions, done
hud/env/docker_client.py CHANGED
@@ -215,7 +215,7 @@ class DockerClient(Client):
215
215
  raise ValueError("Could not find package name in pyproject.toml")
216
216
  logger.info("Installing %s in /root/controller", self._package_name)
217
217
  result = await self.execute(
218
- ["bash", "-c", "cd /root/controller && pip install -e ."],
218
+ ["bash", "-c", "cd /root/controller && pip install -e . --break-system-packages"],
219
219
  timeout=60,
220
220
  )
221
221
  if result["stdout"]:
hud/env/environment.py CHANGED
@@ -10,14 +10,13 @@ from pydantic import BaseModel
10
10
  from hud.env.client import Client
11
11
  from hud.env.remote_client import RemoteClient
12
12
  from hud.task import Task
13
- from hud.utils import HudStyleConfigs, expand_config
14
- from hud.utils.config import REMOTE_EVALUATE, REMOTE_SETUP, HudStyleConfig, create_remote_config
15
-
16
- if TYPE_CHECKING:
17
- from hud.adapters.common import CLA
13
+ from hud.utils.common import HudStyleConfig, HudStyleConfigs
14
+ from hud.utils.config import REMOTE_EVALUATE, REMOTE_FUNCTION_PREFIX, REMOTE_SETUP, expand_config
18
15
 
19
16
  logger = logging.getLogger("hud.environment")
20
17
 
18
+ if TYPE_CHECKING:
19
+ from hud.adapters.common import CLA
21
20
 
22
21
  class Observation(BaseModel):
23
22
  """
@@ -46,6 +45,9 @@ class Environment(BaseModel):
46
45
  task: Task | None = None
47
46
  build_data: dict[str, Any]
48
47
 
48
+ # final response
49
+ final_response: str | None = None
50
+
49
51
  async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]:
50
52
  # Execute each config and collect results
51
53
  configs_all = [configs] if not isinstance(configs, list) else configs
@@ -76,7 +78,7 @@ class Environment(BaseModel):
76
78
  config: The configuration to use for the setup
77
79
  """
78
80
  if isinstance(self.client, RemoteClient):
79
- await self._invoke_all(create_remote_config(self.task, config, REMOTE_SETUP))
81
+ await self._invoke_all(create_remote_config(self, config, REMOTE_SETUP))
80
82
  else:
81
83
  if config is not None:
82
84
  await self._invoke_all(config)
@@ -97,7 +99,7 @@ class Environment(BaseModel):
97
99
  """
98
100
  if isinstance(self.client, RemoteClient):
99
101
  results = await self._invoke_all(
100
- create_remote_config(self.task, config, REMOTE_EVALUATE))
102
+ create_remote_config(self, config, REMOTE_EVALUATE))
101
103
  else:
102
104
  if config is not None:
103
105
  results = await self._invoke_all(config)
@@ -143,9 +145,14 @@ class Environment(BaseModel):
143
145
  """
144
146
  if actions is None or len(actions) == 0:
145
147
  actions = []
146
-
148
+ args = [[action.model_dump() for action in actions]]
149
+
150
+ # TODO: Move this into the server side
151
+ if self._maybe_store_response(actions):
152
+ return Observation(text=self.final_response), 0, False, {}
153
+
147
154
  result, stdout, stderr = await self.client.invoke(
148
- HudStyleConfig(function="step", args=[[action.model_dump() for action in actions]])
155
+ HudStyleConfig(function="step", args=args)
149
156
  )
150
157
  if stdout:
151
158
  logger.info("Step produced stdout: %s", stdout.decode())
@@ -156,6 +163,21 @@ class Environment(BaseModel):
156
163
  observation = Observation.model_validate(result["observation"], strict=True)
157
164
 
158
165
  return observation, 0, False, {}
166
+
167
+ def _maybe_store_response(self, actions: list[CLA]) -> bool:
168
+ """Store the final response into the environment.
169
+
170
+ Args:
171
+ actions: The action(s) to check
172
+
173
+ Returns:
174
+ bool: True if the response was submitted, False otherwise
175
+ """
176
+ if len(actions) > 0 and actions[-1].type == "response":
177
+ self.final_response = actions[-1].text
178
+ return True
179
+ return False
180
+
159
181
 
160
182
  async def get_urls(self) -> dict[str, Any]:
161
183
  """Get URLs for the environment.
@@ -179,3 +201,154 @@ class Environment(BaseModel):
179
201
  This should release any resources and clean up the environment.
180
202
  """
181
203
  await self.client.close()
204
+
205
+ def create_remote_config(
206
+ env: Environment | None = None,
207
+ config: HudStyleConfigs | None = None,
208
+ function: str | None = None,
209
+ ) -> list[HudStyleConfig]:
210
+ """
211
+ Create a remote configuration for setup or evaluate, determining the final
212
+ function call structure based on the provided task or explicit config.
213
+
214
+ This function orchestrates how setup and evaluate steps defined in a Task
215
+ or passed directly are prepared for remote execution via `env._invoke_all`.
216
+
217
+ Args:
218
+ env: Environment object, potentially containing a task definition.
219
+ Used to access `env.task` and `env.final_response`.
220
+ config: Direct configuration override (e.g., passed to `env.evaluate(config=...)`).
221
+ Can be in various HudStyleConfigs formats.
222
+ function: The top-level function context, typically "setup" or "evaluate".
223
+
224
+ Returns:
225
+ list[HudStyleConfig]: A list containing a single HudStyleConfig object
226
+ ready for remote invocation via `client.invoke`.
227
+ The specific function/arguments are chosen based on this priority:
228
+ 1. Explicit `config` parameter (if provided).
229
+ 2. Specific `task` attribute (e.g., `task.evaluate`).
230
+ 3. General `task.config` dictionary.
231
+ 4. Default private function using `task.id`
232
+ (e.g., `private_evaluate(task.id)`).
233
+ 5. Base `function` name with minimal/default arguments.
234
+
235
+ Logic & Examples (Assuming `function="evaluate"` for examples):
236
+
237
+ 1) Explicit `config` provided: The `config` is expanded and becomes the `args`
238
+ for the top-level `function` call. If the environment has a final_response,
239
+ it's appended to these args.
240
+ - Example Input:
241
+ `env` (with `final_response="Paris"`)
242
+ `config=("contains_text", "Paris")`
243
+ `function="evaluate"`
244
+ - Example Output:
245
+ `[HudStyleConfig(function='evaluate', args=[
246
+ HudStyleConfig(function='contains_text', args=['Paris', 'Paris'])
247
+ ])]`
248
+
249
+ 2) No explicit `config`, Task has the attribute (e.g., `task.evaluate`):
250
+ The Task's attribute value (e.g., `task.evaluate`) is expanded and becomes the `args`
251
+ for the top-level `function` call. Task ID is added if present. `final_response` is
252
+ appended if present.
253
+ - Example Input:
254
+ `env` (`task=Task(id="t1", evaluate=("check_answer",), ...)`, `final_response="42"`)
255
+ `config=None`
256
+ `function="evaluate"`
257
+ - Example Output:
258
+ `[HudStyleConfig(function='evaluate', args=[HudStyleConfig(function='check_answer',
259
+ args=['42'], id='t1')])]`
260
+
261
+ 3) No explicit `config`, no specific Task attribute, Task has `task.config`:
262
+ The `task.config` dictionary becomes the single argument for the top-level
263
+ `function` call. Task ID is added to the config dict if present. `final_response` is
264
+ appended if present.
265
+ - Example Input:
266
+ `env` (with `task=Task(id="t2", config={"expected": "val"}, ...)`)
267
+ `config=None`
268
+ `function="evaluate"`
269
+ - Example Output:
270
+ `[HudStyleConfig(function='evaluate', args=[{"expected": "val", "id": "t2"}])]`
271
+
272
+ 4) No explicit `config`, no specific Task attribute, no `task.config`, Task has `task.id`:
273
+ Calls a private function (`private_<function>`) on the remote end, passing
274
+ the `task.id` as the only argument.
275
+ - Example Input:
276
+ `env` (with `task=Task(id="t3", ...)`)
277
+ `config=None`
278
+ `function="evaluate"`
279
+ - Example Output:
280
+ `[HudStyleConfig(function='private_evaluate', args=['t3'])]`
281
+
282
+ 5) No explicit `config` and no relevant Task info:
283
+ Calls the top-level `function` with empty args.
284
+ - Example Input:
285
+ `env` (with `task=Task(...)`)
286
+ `config=None`
287
+ `function="evaluate"`
288
+ - Example Output:
289
+ `[HudStyleConfig(function='evaluate', args=[])]`
290
+ """
291
+ # If no function provided, just expand the config and return it directly
292
+ if function is None:
293
+ if config:
294
+ return expand_config(config)
295
+ raise ValueError("Either function or config must be provided")
296
+
297
+ # Case 1: Explicit config provided
298
+ if config:
299
+ expanded_configs = expand_config(config)
300
+ if env and env.final_response:
301
+ # Ensure args is a list before appending
302
+ if not isinstance(expanded_configs[0].args, list):
303
+ expanded_configs[0].args = [expanded_configs[0].args]
304
+ expanded_configs[0].args.append(env.final_response) # for remote responses
305
+ return [HudStyleConfig(function=function, args=expanded_configs)]
306
+
307
+ # Otherwise, use the environment's task
308
+ task = env.task if env else None
309
+
310
+ # Must have a task for the remaining cases
311
+ if task is None:
312
+ raise ValueError("Either task or config must be provided")
313
+
314
+ # Case 2: Task has the specified function attribute
315
+ task_config = getattr(task, function, None)
316
+ if task_config:
317
+ expanded_configs = expand_config(task_config)
318
+ if task.id:
319
+ expanded_configs[0].id = task.id # for remote IDs
320
+ elif env and env.final_response:
321
+ # Ensure args is a list before appending
322
+ if not isinstance(expanded_configs[0].args, list):
323
+ expanded_configs[0].args = [expanded_configs[0].args]
324
+ expanded_configs[0].args.append(env.final_response) # for remote responses
325
+ return [HudStyleConfig(function=function, args=expanded_configs)]
326
+
327
+ # Case 3: Check for task.config
328
+ if hasattr(task, "config") and task.config:
329
+ # Ensure task.config is a dictionary before adding id
330
+ final_args = task.config.copy() if isinstance(task.config, dict) else {}
331
+ if task.id:
332
+ final_args["id"] = task.id # for remote IDs
333
+ if env and env.final_response:
334
+ # Append response, ensuring args exists and is a list
335
+ if "args" not in final_args:
336
+ final_args["args"] = []
337
+ if not isinstance(final_args["args"], list):
338
+ final_args["args"] = [final_args["args"]]
339
+ final_args["args"].append(env.final_response)
340
+ return [HudStyleConfig(function=function, args=[final_args])]
341
+
342
+ # Case 4: Use task.id
343
+ if task.id:
344
+ args_list = [task.id]
345
+ if env and env.final_response:
346
+ args_list.append(env.final_response) # Append final response
347
+ return [HudStyleConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=args_list)]
348
+
349
+ # Case 5: No valid configuration found
350
+ args_list = []
351
+ if env and env.final_response:
352
+ args_list.append(env.final_response)
353
+ return [HudStyleConfig(function=function, args=args_list)]
354
+
@@ -25,7 +25,9 @@ class LocalDockerClient(DockerClient):
25
25
  """
26
26
 
27
27
  @classmethod
28
- async def create(cls, dockerfile: str, ports: list[int] | None = None) -> tuple[LocalDockerClient, dict[str, Any]]:
28
+ async def create(cls, dockerfile: str, ports: list[int] | None = None) -> tuple[
29
+ LocalDockerClient, dict[str, Any]
30
+ ]:
29
31
  """
30
32
  Creates a Docker environment client from a dockerfile.
31
33
 
hud/task.py CHANGED
@@ -5,8 +5,7 @@ from typing import TYPE_CHECKING, Any
5
5
  from pydantic import BaseModel
6
6
 
7
7
  from hud.types import CustomGym, Gym
8
- from hud.utils import HudStyleConfig
9
- from hud.utils.config import HudStyleConfigs
8
+ from hud.utils.common import HudStyleConfig, HudStyleConfigs
10
9
 
11
10
  if TYPE_CHECKING:
12
11
  from inspect_ai.dataset import Sample
@@ -35,7 +34,7 @@ class Task(BaseModel):
35
34
 
36
35
  The setup and evaluate configurations can be in several formats:
37
36
  - String (function name): "chrome.maximize"
38
- - String (function with args): "chrome.activate_tab 5"
37
+ - Tuple (function with args): ("chrome.activate_tab", 5)
39
38
  - Dict: {"function": "chrome.navigate", "args": ["https://example.com"]}
40
39
  - List of the above: ["chrome.maximize", {"function": "chrome.navigate", "args": ["https://example.com"]}]
41
40
 
@@ -68,15 +67,15 @@ class Task(BaseModel):
68
67
  @classmethod
69
68
  def from_inspect_sample(cls, sample: Sample) -> Task:
70
69
  """Create a Task from an Inspect dataset sample.
71
- The task's sandbox is a local ubuntu container using the standard controller.
72
- Files will be copied to the user directory
70
+ Automatically detects if a CustomGym (docker) or QA Gym is needed based on sample.sandbox.
71
+ Configures evaluation using 'response_includes' or 'match_all' based on sample.target.
73
72
 
74
73
  Args:
75
74
  sample: An Inspect dataset Sample object
76
75
 
77
76
  Returns:
78
77
  Task instance
79
-
78
+
80
79
  The Inspect Sample has these fields:
81
80
  - input (str | list[ChatMessage]): The input to be submitted to the model
82
81
  - choices (list[str] | None): Optional multiple choice answer list
@@ -87,10 +86,8 @@ class Task(BaseModel):
87
86
  - files (dict[str, str] | None): Optional files that go with the sample
88
87
  - setup (str | None): Optional setup script to run for sample
89
88
  """
90
- # Extract the input as prompt
91
89
  prompt = sample.input
92
- if isinstance(prompt, list): # Handle ChatMessage format
93
- # Convert chat message list to a string representation
90
+ if isinstance(prompt, list):
94
91
  prompt_parts = []
95
92
  for message in prompt:
96
93
  role = message.role
@@ -98,36 +95,50 @@ class Task(BaseModel):
98
95
  prompt_parts.append(f"{role.capitalize()}: {content}")
99
96
  prompt = "\n\n".join(prompt_parts)
100
97
 
101
- # Map sandbox from Inspect to our envspec
98
+ evaluate_config = None
99
+ if sample.target:
100
+ if isinstance(sample.target, str):
101
+ evaluate_config = ("response_includes", [sample.target])
102
+ elif isinstance(sample.target, list):
103
+ evaluate_config = ("match_all", sample.target)
104
+
105
+ task_gym: Gym | None = None
106
+ task_setup: HudStyleConfigs | None = None
107
+
102
108
  sandbox = sample.sandbox
103
109
  dockerfile = None
110
+ use_qa_gym = True
111
+
104
112
  if sandbox:
105
113
  if isinstance(sandbox, str):
106
- if sandbox != "docker":
107
- raise ValueError("docker is the only supported sandbox")
114
+ if sandbox == "docker":
115
+ dockerfile = UBUNTU_DOCKERFILE
116
+ use_qa_gym = False
108
117
  elif isinstance(sandbox, tuple) and len(sandbox) == 2:
109
118
  sandbox_type, sandbox_config = sandbox
110
- if sandbox_type != "docker":
111
- raise ValueError("docker is the only supported sandbox")
112
- dockerfile = sandbox_config
113
- else:
114
- raise ValueError("Invalid sandbox configuration")
115
-
116
- gym = CustomGym(
117
- dockerfile=dockerfile or UBUNTU_DOCKERFILE,
118
- location="local",
119
- )
119
+ if sandbox_type == "docker":
120
+ dockerfile = sandbox_config
121
+ use_qa_gym = False
122
+
123
+ if use_qa_gym:
124
+ task_gym = "qa"
125
+ task_setup = None
126
+ else:
127
+ task_gym = CustomGym(
128
+ dockerfile=dockerfile or UBUNTU_DOCKERFILE,
129
+ location="local",
130
+ )
131
+ task_setup = [x for x in convert_inspect_setup(sample.setup)] if sample.setup else None
132
+ # TODO: Handle sample.files for CustomGym case if needed
133
+
120
134
 
121
135
  return cls(
122
- id=str(sample.id) if sample.id else None,
136
+ id=None,
123
137
  prompt=prompt,
124
- setup=[x for x in convert_inspect_setup(sample.setup)] if sample.setup else [],
138
+ setup=task_setup,
125
139
  metadata=sample.metadata,
126
140
  choices=sample.choices,
127
- target=sample.target,
128
- gym=gym,
141
+ evaluate=evaluate_config,
142
+ gym=task_gym,
143
+ # files=sample.files, # TODO: Decide how/if to handle files
129
144
  )
130
-
131
- def convert_sdk01(self) -> None:
132
- self.setup = [HudStyleConfig(function="reset", args=[{"task_id": self.id}])]
133
- self.evaluate = [HudStyleConfig(function="evaluate", args=[])]
hud/taskset.py CHANGED
@@ -9,6 +9,8 @@ from hud.settings import settings
9
9
  from hud.task import Task
10
10
 
11
11
  if TYPE_CHECKING:
12
+ from collections.abc import Iterator
13
+
12
14
  from inspect_ai.dataset import Dataset
13
15
 
14
16
 
@@ -49,6 +51,12 @@ class TaskSet(BaseModel):
49
51
  """
50
52
  return len(self.tasks)
51
53
 
54
+ def __iter__(self) -> Iterator[Task]:
55
+ """
56
+ Returns an iterator over the tasks in the taskset.
57
+ """
58
+ return iter(self.tasks)
59
+
52
60
 
53
61
  async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
54
62
  """
hud/utils/common.py CHANGED
@@ -3,16 +3,43 @@ from __future__ import annotations
3
3
  import io
4
4
  import logging
5
5
  import tarfile
6
- from typing import TYPE_CHECKING, TypedDict
6
+ from typing import TYPE_CHECKING, Any, TypedDict
7
+
8
+ from pydantic import BaseModel
7
9
 
8
10
  from hud.server.requests import make_request
9
11
  from hud.settings import settings
10
12
 
11
13
  if TYPE_CHECKING:
14
+ from collections.abc import Iterator
12
15
  from pathlib import Path
13
16
 
14
17
  logger = logging.getLogger("hud.utils.common")
15
18
 
19
+ class HudStyleConfig(BaseModel):
20
+ function: str # Format: "x.y.z"
21
+ args: list[Any] # Must be json serializable
22
+
23
+ id: str | None = None # Optional id for remote execution
24
+
25
+ def __len__(self) -> int:
26
+ return len(self.args)
27
+
28
+ def __getitem__(self, index: int) -> Any:
29
+ return self.args[index]
30
+
31
+ def __iter__(self) -> Iterator[Any]:
32
+ return iter(self.args)
33
+
34
+ def __str__(self) -> str:
35
+ return f"{self.function}: {', '.join(str(arg) for arg in self.args)}"
36
+
37
+ # Type alias for the shorthand config, which just converts to function name and args
38
+ ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
39
+
40
+ # Type alias for multiple config formats
41
+ HudStyleConfigs = ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | dict[str, Any] | str
42
+
16
43
  class ExecuteResult(TypedDict):
17
44
  """
18
45
  Result of an execute command.
hud/utils/config.py CHANGED
@@ -2,14 +2,8 @@ from __future__ import annotations
2
2
 
3
3
  import logging
4
4
  import re
5
- from typing import TYPE_CHECKING, Any
6
5
 
7
- from pydantic import BaseModel
8
-
9
- if TYPE_CHECKING:
10
- from collections.abc import Iterator
11
-
12
- from hud.task import Task
6
+ from hud.utils.common import HudStyleConfig, HudStyleConfigs
13
7
 
14
8
  logger = logging.getLogger("hud.utils.config")
15
9
 
@@ -17,30 +11,6 @@ REMOTE_FUNCTION_PREFIX = "private_"
17
11
  REMOTE_SETUP = "setup"
18
12
  REMOTE_EVALUATE = "evaluate"
19
13
 
20
- class HudStyleConfig(BaseModel):
21
- function: str # Format: "x.y.z"
22
- args: list[Any] # Must be json serializable
23
-
24
- id: str | None = None # Optional id for remote execution
25
-
26
- def __len__(self) -> int:
27
- return len(self.args)
28
-
29
- def __getitem__(self, index: int) -> Any:
30
- return self.args[index]
31
-
32
- def __iter__(self) -> Iterator[Any]:
33
- return iter(self.args)
34
-
35
- def __str__(self) -> str:
36
- return f"{self.function}: {', '.join(str(arg) for arg in self.args)}"
37
-
38
- # Type alias for the shorthand config, which just converts to function name and args
39
- ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
40
-
41
- # Type alias for multiple config formats
42
- HudStyleConfigs = ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | dict[str, Any] | str
43
-
44
14
  def _is_valid_python_name(name: str) -> bool:
45
15
  """Check if a string is a valid Python identifier."""
46
16
  return bool(re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", name))
@@ -122,64 +92,3 @@ def expand_config(config: HudStyleConfigs) -> list[HudStyleConfig]:
122
92
  error_msg = f"Unknown configuration type: {type(config)}"
123
93
  logger.error(error_msg)
124
94
  raise ValueError(error_msg)
125
-
126
- def create_remote_config(
127
- task: Task | None = None,
128
- config: HudStyleConfigs | None = None,
129
- function: str | None = None,
130
- ) -> list[HudStyleConfig]:
131
- """
132
- Create a configuration based on provided inputs.
133
-
134
- Args:
135
- task: Task object with configuration
136
- config: Direct configuration (expanded or not)
137
- function: Function name to use
138
-
139
- Returns:
140
- list[HudStyleConfig]: List of standardized configurations
141
-
142
- Logic:
143
- 1) If explicit config: expand and return HudStyleConfig with func of the function,
144
- and args of expanded config
145
- 2) If task has the specified function defined: use that
146
- 3) If no task function: check for task._config and use that
147
- 4) If no _config: use task.id and create private_[function]
148
- """
149
- # If no function provided, just expand the config and return it directly
150
- if function is None:
151
- if config:
152
- return expand_config(config)
153
- raise ValueError("Either function or config must be provided")
154
-
155
- # Case 1: Explicit config provided
156
- if config:
157
- expanded_configs = expand_config(config)
158
- return [HudStyleConfig(function=function, args=expanded_configs)]
159
-
160
- # Must have a task for the remaining cases
161
- if task is None:
162
- raise ValueError("Either task or config must be provided")
163
-
164
- # Case 2: Task has the specified function attribute
165
- task_config = getattr(task, function, None)
166
- if task_config and len(task_config) > 0:
167
- expanded_configs = expand_config(task_config)
168
- if task.id:
169
- expanded_configs[0].id = task.id # for remote IDs
170
- return [HudStyleConfig(function=function, args=expanded_configs)]
171
-
172
- # Case 3: Check for _config
173
- if hasattr(task, "config") and task.config:
174
- if task.id:
175
- task.config["id"] = task.id # for remote IDs
176
- return [HudStyleConfig(function=function, args=[task.config])]
177
-
178
- # Case 4: Use task.id
179
- if task.id:
180
- return [HudStyleConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=[task.id])]
181
-
182
- # No valid configuration found
183
- #logger.warning("No valid configuration found for function: %s", function)
184
- return [HudStyleConfig(function=function, args=[])]
185
-
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: SDK for the HUD evaluation platform.
5
- Project-URL: Homepage, https://github.com/Human-Data/hud-sdk
6
- Project-URL: Bug Tracker, https://github.com/Human-Data/hud-sdk/issues
5
+ Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
+ Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
7
7
  Project-URL: Documentation, https://hud.so
8
8
  Author-email: Human Union Data SDK <founders@hud.so>
9
9
  License: MIT License
@@ -57,7 +57,7 @@ Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
57
57
  Requires-Dist: ruff==0.9.8; extra == 'dev'
58
58
  Description-Content-Type: text/markdown
59
59
 
60
- # HUD SDK - Human-Agent Interaction Toolkit
60
+ # HUD
61
61
 
62
62
  A Python SDK for creating, evaluating, and benchmarking agent interactions with web browsers and OS environments.
63
63
 
@@ -86,21 +86,20 @@ export HUD_API_KEY=your_api_key_here
86
86
  pip install hud-python
87
87
  ```
88
88
 
89
- ### Simple Browser Example with Operator
89
+ ### Simple Browser Example with Claude Computer Use
90
90
 
91
91
  > This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
92
92
 
93
+ Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
94
+
93
95
  ```python
94
- import os
95
96
  import asyncio
96
97
  from hud import gym, job
97
98
  from hud.task import Task
98
- from hud.utils import stream
99
- from hud.agent import OperatorAgent
99
+ from hud.agent import ClaudeAgent
100
100
 
101
101
  @job("test-run")
102
102
  async def main():
103
- # Define a simple task
104
103
  task = Task(
105
104
  prompt="Insert the text 'capybara' into the search bar",
106
105
  gym="hud-browser",
@@ -108,26 +107,20 @@ async def main():
108
107
  evaluate=("contains_text", "capybara")
109
108
  )
110
109
 
111
- # Create environment
110
+ # Create environment using the gym module
112
111
  env = await gym.make(task)
113
112
 
114
- # Get URLs and display live view (optional)
115
- # urls = await env.get_urls()
116
- # stream(urls["live_url"])
117
-
118
113
  # Initialize Claude agent (API key is loaded automatically)
119
- agent = OperatorAgent()
114
+ agent = ClaudeAgent()
120
115
 
121
- # Agent loop
122
- obs, _ = env.reset()
116
+ # Agent loop with predict and step functions
117
+ obs, _ = await env.reset() # Gets first observation
123
118
  for i in range(5):
124
119
  actions, done = await agent.predict(obs)
125
120
  if done:
126
121
  break
127
122
 
128
123
  obs, reward, terminated, info = await env.step(actions)
129
- if terminated:
130
- break
131
124
 
132
125
  # Evaluate and close
133
126
  result = await env.evaluate()
@@ -143,26 +136,26 @@ if __name__ == "__main__":
143
136
 
144
137
  Explore the core concepts and features of the SDK:
145
138
 
146
- * **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios.
139
+ * **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
147
140
  * **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
148
141
  * **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
149
142
  * **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
150
143
  * **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
151
144
  * **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
152
145
  * **Advanced Topics**:
146
+ * **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
153
147
  * **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
154
148
  * **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
155
- * **[CLA Action Details](/advanced/cla-details)**: Dive deeper into the standardized action format.
156
149
 
157
150
  * **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
158
151
 
159
152
  ## [Examples](examples/)
160
153
 
161
- We provide several example notebooks showing how to use the HUD SDK:
154
+ We recommend you first take a look at the example notebooks showing how to use the HUD SDK:
162
155
 
163
156
  1. [Browser Basics](examples/browser_use.ipynb) - Simple browser interaction with live view
164
157
  2. [Task Design](examples/tasks.ipynb) - Creating and customizing tasks
165
- 3. [OSWorld](examples/osworld.ipynb) - Working with OS environments
158
+ 3. [OSWorld](examples/osworld.ipynb) - Running the OSWorld benchmark
166
159
  4. [Local Development](examples/local.ipynb) - Setting up local custom environments
167
160
 
168
161
  ## Documentation
@@ -180,9 +173,9 @@ If you use this SDK in your research, please cite it as follows:
180
173
  ```bibtex
181
174
  @software{hud2025agentevalplatform,
182
175
  author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
183
- title = {{HUD: An Evaluation Platform for Computer Use Agents}},
184
- date = {2025-03},
185
- url = {https://github.com/Human-Data/hud-sdk},
176
+ title = {{HUD: An Evaluation Platform for Agents}},
177
+ date = {2025-04},
178
+ url = {https://github.com/hud-evals/hud-sdk},
186
179
  langid = {en}
187
180
  }
188
181
  ```
@@ -1,28 +1,28 @@
1
- hud/__init__.py,sha256=YX9zAqOSjAFZqHbDJGUVefOsxg7PhkH1ZDflRoiSgP8,464
1
+ hud/__init__.py,sha256=HFL1iwPhLZd7z--2QADzipur68XlekwGrOzU2vWL-Vw,464
2
2
  hud/gym.py,sha256=cKjIuJS7A0vJx4K7fctpUjIEv8TkW5x6aB_PRrODrDY,3651
3
3
  hud/job.py,sha256=E4RN1CkppRQVy46RWCUDjNIyhMa7lNlFfCgpky2vKFk,5463
4
4
  hud/settings.py,sha256=rv8TiZx4wmBzIoEEkOzoywC0nt8UZXlHxIa_LW4tWAg,1346
5
- hud/task.py,sha256=q1E_urMavnfsb87x2JHkRNMBzbkkaQI1skOulkpJ5DY,5132
6
- hud/taskset.py,sha256=fV4QgHf8tphDoMjTdBzkyCJT7pQBLEMoGu_Uxuji2DM,2226
5
+ hud/task.py,sha256=aNbHMlO7r1cm5DcO0QLU1SZ7EawOFw9W6DZwTNy72-4,5383
6
+ hud/taskset.py,sha256=xDPBXeDm4AlSOwl-MM98lN0x6PmGV8t9jv7sNyS_u0c,2426
7
7
  hud/trajectory.py,sha256=PA-sE2iyt2BctO2Dex-2ZaRmS95AkEXTicZjHCVCYqE,3749
8
8
  hud/types.py,sha256=fJZnzK3j3mq7G0gO5TbqRaN92qT4xAb4jUNOXIX8ZZ0,2395
9
9
  hud/adapters/__init__.py,sha256=0RNQgrzBCkhNBq1Q7JRESN1WfUVLs_99fR5g1re3APs,207
10
10
  hud/adapters/claude/__init__.py,sha256=i7QEF-29FLb9qxp1eYtXs-adIk_tG54tL-9g6d3xodk,100
11
- hud/adapters/claude/adapter.py,sha256=sgdgkCtNFjFPSSmfsUD1vx0Xz9xhG81A_it4BvRsOXE,5781
11
+ hud/adapters/claude/adapter.py,sha256=x0qQglWsg7n8DJ_NacsymlUQBnkpqNVguUlkQRpYX-A,5955
12
12
  hud/adapters/common/__init__.py,sha256=BjdZWJVs_AKtpFrt-tNsdQRjnz7D97DFEQirJ-r0mp8,118
13
13
  hud/adapters/common/adapter.py,sha256=ls-gXtg1N_SQc211rkDb3LL511HNZv6etm1nx2ZtrkQ,5808
14
- hud/adapters/common/types.py,sha256=ubnWlm4JMtCkTNonKZGb425p6oi8jZyIVcekp-pjTXQ,4905
14
+ hud/adapters/common/types.py,sha256=APxGEmoePwjF7OYXAKqBTVT73PJTFV0eBmbURbaT5xk,5091
15
15
  hud/adapters/operator/__init__.py,sha256=31vTRs268_TOLd-TeQRKau5bDYy78wxCNpJFhD5_l8U,104
16
- hud/adapters/operator/adapter.py,sha256=j2bBe_bwOhdbd7Qr6UvWUEkTkUTOA-ADvWYx0B1c_TU,3159
16
+ hud/adapters/operator/adapter.py,sha256=svHgjCdUeMyfgfGzRO3ItGWTKGkm3tmldO2zfjX_sGI,3301
17
17
  hud/agent/__init__.py,sha256=cI3bqfmG2_Lwzn2RjrxV0X9qIxCRDiffwd1UaWToct4,238
18
18
  hud/agent/base.py,sha256=RThJ_h4A3oU23zyvvKtxY2a_YM03Vd1XYDXdY3bAf8g,3881
19
- hud/agent/claude.py,sha256=ZPoged_sun2CmPgludfkV4uv-gjak_yyIlGgCIRcWx0,6583
20
- hud/agent/operator.py,sha256=zJaYW5kJ7rgvRQCufrjsoNCPn2Ra9EakmFFwut_v7Hk,7335
19
+ hud/agent/claude.py,sha256=tbDKAzGCLJPnUnHc8eV-zZmj3ZG6QQx0ukWKoO4Ekec,7445
20
+ hud/agent/operator.py,sha256=44t19TzcCrS1N3-rnD25ZLXx5s4Io8On27LomALuugs,8185
21
21
  hud/env/__init__.py,sha256=BHFY_N0kEI142pjWtMyqUb3BGnoiekY8evRCIbSbO2w,271
22
22
  hud/env/client.py,sha256=SPR6ct6NFxmIrgIi3K8tEC-vnqOmCbCBtuT81PaVjuY,869
23
- hud/env/docker_client.py,sha256=4G3OeFBCbIqg9zOXxreDekNvLNMhgtc2cMAjMbqB6Tk,10394
24
- hud/env/environment.py,sha256=h-Z7I_1Y8vXBL1oOYbC5xRIKwl28NZt0PJ4GmKcd0AM,5863
25
- hud/env/local_docker_client.py,sha256=9p2IHeSRmk9_lU7FRiHaCMWn0CjbtWLQjsT3x8x6qxY,7767
23
+ hud/env/docker_client.py,sha256=56_u3Ri4NulGcBumAg-7-KilmFmBKthOwEIM5bOLOZc,10418
24
+ hud/env/environment.py,sha256=Xyq4KQO9aWYPwZ0uESAetB5EEZgmlEnZVc7sA0DLz2c,13706
25
+ hud/env/local_docker_client.py,sha256=TCD9z1qjafxjwAWLatAL8d587_ioMDHjs8T5cBgusr8,7789
26
26
  hud/env/remote_client.py,sha256=iJiwueuf98xOx0_Y2ltu_63BwKIKNvohhim73Goq74E,5804
27
27
  hud/env/remote_docker_client.py,sha256=FwaO7NyygDt9oe3pDD7PwUS21pxzc465mwcXk-Cx-60,6838
28
28
  hud/evaluators/__init__.py,sha256=XophB666xPnurhQ_ygfW44h0Jh0BQGCgUzCXEOG2Q1g,158
@@ -34,11 +34,11 @@ hud/evaluators/remote.py,sha256=NVUJJvrpGQj2eL-aFxzTEnAWW7iuSI9eDWtar54dc6E,2174
34
34
  hud/server/__init__.py,sha256=cxDKTwMdGzhj7bYajtejN8XCt7K8Xq3eKB2No0qBpoY,169
35
35
  hud/server/requests.py,sha256=s8LZZYWT1wl7lPu2vwRaYPZs9_gjKwSg3LZLvS5-s6E,9085
36
36
  hud/utils/__init__.py,sha256=LnoI2tQUnd-mQ4eg-gpJJgmHBBIhggJ6c9ap7MBgrfs,260
37
- hud/utils/common.py,sha256=qTAgiqQqplfrCrll06SAYYr9TyT8gnV4mwDSxsj-W1s,1842
38
- hud/utils/config.py,sha256=x3F9Rg2lTGEG8_FcnEyymh4Y02qD1UWmcDlOSA1Xq0U,6476
37
+ hud/utils/common.py,sha256=XJZ-hKJkeaNmelG2QD5ybi9FpZQS1ErA40fAYzUSHVE,2742
38
+ hud/utils/config.py,sha256=ePi3GDo8mDUnOZ5G5HyMprqGRvxrxCMfixGNuTOA8rQ,3266
39
39
  hud/utils/telemetry.py,sha256=md7AuKxtDqsONMeeTOHen1XpmNds8CbXROX_PnkDxFc,1993
40
40
  hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
- hud_python-0.2.0.dist-info/METADATA,sha256=GbG7OHnQ8WqR3iXT6utC26PkCmgPKrOePTdCNZxuwK4,7222
42
- hud_python-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
43
- hud_python-0.2.0.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
44
- hud_python-0.2.0.dist-info/RECORD,,
41
+ hud_python-0.2.1.dist-info/METADATA,sha256=f2lyqGmu9L7_zgCOqrhZ6ZX1JUU6Z0e92bRTfmojSqQ,7219
42
+ hud_python-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
43
+ hud_python-0.2.1.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
44
+ hud_python-0.2.1.dist-info/RECORD,,