hud-python: hud_python-0.2.1-py3-none-any.whl → hud_python-0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (59)
  1. hud/__init__.py +5 -3
  2. hud/adapters/__init__.py +2 -1
  3. hud/adapters/claude/adapter.py +13 -17
  4. hud/adapters/common/adapter.py +3 -3
  5. hud/adapters/common/tests/__init__.py +0 -0
  6. hud/adapters/common/tests/test_adapter.py +277 -0
  7. hud/adapters/common/types.py +3 -6
  8. hud/adapters/operator/adapter.py +22 -29
  9. hud/agent/__init__.py +9 -1
  10. hud/agent/base.py +28 -28
  11. hud/agent/claude.py +69 -60
  12. hud/agent/langchain.py +204 -0
  13. hud/agent/operator.py +75 -67
  14. hud/env/__init__.py +5 -5
  15. hud/env/client.py +2 -2
  16. hud/env/docker_client.py +37 -39
  17. hud/env/environment.py +91 -66
  18. hud/env/local_docker_client.py +5 -7
  19. hud/env/remote_client.py +40 -29
  20. hud/env/remote_docker_client.py +13 -3
  21. hud/evaluators/__init__.py +2 -3
  22. hud/evaluators/base.py +4 -3
  23. hud/evaluators/inspect.py +3 -8
  24. hud/evaluators/judge.py +34 -58
  25. hud/evaluators/match.py +42 -49
  26. hud/evaluators/remote.py +13 -26
  27. hud/evaluators/tests/__init__.py +0 -0
  28. hud/evaluators/tests/test_inspect.py +12 -0
  29. hud/evaluators/tests/test_judge.py +231 -0
  30. hud/evaluators/tests/test_match.py +115 -0
  31. hud/evaluators/tests/test_remote.py +98 -0
  32. hud/exceptions.py +167 -0
  33. hud/gym.py +12 -10
  34. hud/job.py +525 -47
  35. hud/server/__init__.py +2 -2
  36. hud/server/requests.py +148 -186
  37. hud/server/tests/__init__.py +0 -0
  38. hud/server/tests/test_requests.py +275 -0
  39. hud/settings.py +3 -2
  40. hud/task.py +12 -22
  41. hud/taskset.py +44 -11
  42. hud/trajectory.py +6 -9
  43. hud/types.py +14 -9
  44. hud/utils/__init__.py +2 -2
  45. hud/utils/common.py +37 -13
  46. hud/utils/config.py +44 -29
  47. hud/utils/progress.py +149 -0
  48. hud/utils/telemetry.py +10 -11
  49. hud/utils/tests/__init__.py +0 -0
  50. hud/utils/tests/test_common.py +52 -0
  51. hud/utils/tests/test_config.py +129 -0
  52. hud/utils/tests/test_progress.py +225 -0
  53. hud/utils/tests/test_telemetry.py +37 -0
  54. hud/utils/tests/test_version.py +8 -0
  55. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/METADATA +44 -21
  56. hud_python-0.2.3.dist-info/RECORD +62 -0
  57. hud_python-0.2.1.dist-info/RECORD +0 -44
  58. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/WHEEL +0 -0
  59. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/licenses/LICENSE +0 -0
hud/env/environment.py CHANGED
@@ -10,25 +10,21 @@ from pydantic import BaseModel
10
10
  from hud.env.client import Client
11
11
  from hud.env.remote_client import RemoteClient
12
12
  from hud.task import Task
13
- from hud.utils.common import HudStyleConfig, HudStyleConfigs
14
- from hud.utils.config import REMOTE_EVALUATE, REMOTE_FUNCTION_PREFIX, REMOTE_SETUP, expand_config
13
+ from hud.utils.common import FunctionConfig, FunctionConfigs, Observation
14
+ from hud.utils.config import (
15
+ LOCAL_EVALUATORS,
16
+ REMOTE_EVALUATE,
17
+ REMOTE_FUNCTION_PREFIX,
18
+ REMOTE_SETUP,
19
+ expand_config,
20
+ )
21
+ from hud.utils.telemetry import stream
15
22
 
16
23
  logger = logging.getLogger("hud.environment")
17
24
 
18
25
  if TYPE_CHECKING:
19
26
  from hud.adapters.common import CLA
20
-
21
- class Observation(BaseModel):
22
- """
23
- Observation from the environment.
24
-
25
- Attributes:
26
- screenshot: Base64 encoded PNG string of the screen
27
- text: Text observation, if available
28
- """
29
-
30
- screenshot: str | None = None # base64 string png
31
- text: str | None = None
27
+ from hud.agent import Agent
32
28
 
33
29
 
34
30
  class Environment(BaseModel):
@@ -48,7 +44,7 @@ class Environment(BaseModel):
48
44
  # final response
49
45
  final_response: str | None = None
50
46
 
51
- async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]:
47
+ async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]:
52
48
  # Execute each config and collect results
53
49
  configs_all = [configs] if not isinstance(configs, list) else configs
54
50
  results = []
@@ -69,8 +65,8 @@ class Environment(BaseModel):
69
65
  stderr.decode(),
70
66
  )
71
67
  return results
72
-
73
- async def _setup(self, config: HudStyleConfigs | None = None) -> None:
68
+
69
+ async def _setup(self, config: FunctionConfigs | None = None) -> None:
74
70
  """
75
71
  Setup the environment.
76
72
 
@@ -87,7 +83,7 @@ class Environment(BaseModel):
87
83
  else:
88
84
  raise ValueError("No config or task provided for local environment")
89
85
 
90
- async def evaluate(self, config: HudStyleConfigs | None = None) -> Any:
86
+ async def evaluate(self, config: FunctionConfigs | None = None) -> Any:
91
87
  """
92
88
  Evaluate the environment.
93
89
 
@@ -98,8 +94,7 @@ class Environment(BaseModel):
98
94
  Any: Result of the evaluation
99
95
  """
100
96
  if isinstance(self.client, RemoteClient):
101
- results = await self._invoke_all(
102
- create_remote_config(self, config, REMOTE_EVALUATE))
97
+ results = await self._invoke_all(create_remote_config(self, config, REMOTE_EVALUATE))
103
98
  else:
104
99
  if config is not None:
105
100
  results = await self._invoke_all(config)
@@ -111,11 +106,10 @@ class Environment(BaseModel):
111
106
  return results[0]
112
107
  else:
113
108
  return results
114
-
115
109
 
116
- async def reset(self, configs: HudStyleConfigs | None = None) -> tuple[
117
- Observation, dict[str, Any]
118
- ]:
110
+ async def reset(
111
+ self, configs: FunctionConfigs | None = None
112
+ ) -> tuple[Observation, dict[str, Any]]:
119
113
  """
120
114
  Reset the environment.
121
115
 
@@ -126,15 +120,15 @@ class Environment(BaseModel):
126
120
  Observation: The first observation from the environment
127
121
  info: Dictionary of information about the environment
128
122
  """
129
- #await self._setup(configs)
123
+ # await self._setup(configs)
130
124
  obs, _, _, info = await self.step()
131
125
  if self.task and self.task.prompt:
132
126
  obs.text = self.task.prompt
133
127
  return obs, info
134
128
 
135
- async def step(self, actions: list[CLA] | None = None) -> tuple[
136
- Observation, float, bool, dict[str, Any]
137
- ]:
129
+ async def step(
130
+ self, actions: CLA | list[CLA] | None = None
131
+ ) -> tuple[Observation, float, bool, dict[str, Any]]:
138
132
  """Execute a step in the environment.
139
133
 
140
134
  Args:
@@ -143,6 +137,8 @@ class Environment(BaseModel):
143
137
  Returns:
144
138
  Any: Result of the step execution
145
139
  """
140
+ if not isinstance(actions, list) and actions is not None:
141
+ actions = [actions]
146
142
  if actions is None or len(actions) == 0:
147
143
  actions = []
148
144
  args = [[action.model_dump() for action in actions]]
@@ -150,20 +146,19 @@ class Environment(BaseModel):
150
146
  # TODO: Move this into the server side
151
147
  if self._maybe_store_response(actions):
152
148
  return Observation(text=self.final_response), 0, False, {}
153
-
149
+
154
150
  result, stdout, stderr = await self.client.invoke(
155
- HudStyleConfig(function="step", args=args)
151
+ FunctionConfig(function="step", args=args)
156
152
  )
157
153
  if stdout:
158
154
  logger.info("Step produced stdout: %s", stdout.decode())
159
155
  if stderr:
160
156
  logger.warning("Step produced stderr: %s", stderr.decode())
161
157
 
162
-
163
158
  observation = Observation.model_validate(result["observation"], strict=True)
164
159
 
165
160
  return observation, 0, False, {}
166
-
161
+
167
162
  def _maybe_store_response(self, actions: list[CLA]) -> bool:
168
163
  """Store the final response into the environment.
169
164
 
@@ -178,14 +173,13 @@ class Environment(BaseModel):
178
173
  return True
179
174
  return False
180
175
 
181
-
182
176
  async def get_urls(self) -> dict[str, Any]:
183
177
  """Get URLs for the environment.
184
178
 
185
179
  Returns:
186
180
  dict: Dictionary of URLs for accessing the environment
187
181
  """
188
- data, _, _ = await self.client.invoke(HudStyleConfig(function="get_urls", args=[]))
182
+ data, _, _ = await self.client.invoke(FunctionConfig(function="get_urls", args=[]))
189
183
 
190
184
  self.url = data.get("url")
191
185
  self.live_url = data.get("live_url")
@@ -202,11 +196,43 @@ class Environment(BaseModel):
202
196
  """
203
197
  await self.client.close()
204
198
 
199
+ async def stream(self) -> str | None:
200
+ urls = await self.get_urls()
201
+ if urls["live_url"] is None:
202
+ logger.warning("No live URL found")
203
+ return None
204
+ # Stream the live view
205
+ return stream(urls["live_url"])
206
+
207
+ async def run(self, agent: Agent, max_steps: int = 27, verbose: bool = True) -> Any:
208
+ """Run an agent in the environment.
209
+
210
+ Args:
211
+ agent: The agent to run
212
+ """
213
+ if verbose:
214
+ logger.info("[HUD] Running agent in environment...")
215
+ obs, _ = await self.reset()
216
+ for i in range(max_steps):
217
+ action, done = await agent.predict(obs)
218
+ if verbose:
219
+ logger.info("[HUD] Step %d: Action: %s", i, action)
220
+ obs, reward, terminated, info = await self.step(action)
221
+ if verbose:
222
+ logger.info("[HUD] Step %d: Observation: %s", i, obs)
223
+ if done or terminated:
224
+ break
225
+ result = await self.evaluate()
226
+ if verbose:
227
+ logger.info("[HUD] Evaluation result: %s", result)
228
+ return result
229
+
230
+
205
231
  def create_remote_config(
206
232
  env: Environment | None = None,
207
- config: HudStyleConfigs | None = None,
233
+ config: FunctionConfigs | None = None,
208
234
  function: str | None = None,
209
- ) -> list[HudStyleConfig]:
235
+ ) -> list[FunctionConfig]:
210
236
  """
211
237
  Create a remote configuration for setup or evaluate, determining the final
212
238
  function call structure based on the provided task or explicit config.
@@ -218,11 +244,11 @@ def create_remote_config(
218
244
  env: Environment object, potentially containing a task definition.
219
245
  Used to access `env.task` and `env.final_response`.
220
246
  config: Direct configuration override (e.g., passed to `env.evaluate(config=...)`).
221
- Can be in various HudStyleConfigs formats.
247
+ Can be in various FunctionConfigs formats.
222
248
  function: The top-level function context, typically "setup" or "evaluate".
223
249
 
224
250
  Returns:
225
- list[HudStyleConfig]: A list containing a single HudStyleConfig object
251
+ list[FunctionConfig]: A list containing a single FunctionConfig object
226
252
  ready for remote invocation via `client.invoke`.
227
253
  The specific function/arguments are chosen based on this priority:
228
254
  1. Explicit `config` parameter (if provided).
@@ -242,8 +268,8 @@ def create_remote_config(
242
268
  `config=("contains_text", "Paris")`
243
269
  `function="evaluate"`
244
270
  - Example Output:
245
- `[HudStyleConfig(function='evaluate', args=[
246
- HudStyleConfig(function='contains_text', args=['Paris', 'Paris'])
271
+ `[FunctionConfig(function='evaluate', args=[
272
+ FunctionConfig(function='contains_text', args=['Paris', 'Paris'])
247
273
  ])]`
248
274
 
249
275
  2) No explicit `config`, Task has the attribute (e.g., `task.evaluate`):
@@ -255,7 +281,7 @@ def create_remote_config(
255
281
  `config=None`
256
282
  `function="evaluate"`
257
283
  - Example Output:
258
- `[HudStyleConfig(function='evaluate', args=[HudStyleConfig(function='check_answer',
284
+ `[FunctionConfig(function='evaluate', args=[FunctionConfig(function='check_answer',
259
285
  args=['42'], id='t1')])]`
260
286
 
261
287
  3) No explicit `config`, no specific Task attribute, Task has `task.config`:
@@ -267,7 +293,7 @@ def create_remote_config(
267
293
  `config=None`
268
294
  `function="evaluate"`
269
295
  - Example Output:
270
- `[HudStyleConfig(function='evaluate', args=[{"expected": "val", "id": "t2"}])]`
296
+ `[FunctionConfig(function='evaluate', args=[{"expected": "val", "id": "t2"}])]`
271
297
 
272
298
  4) No explicit `config`, no specific Task attribute, no `task.config`, Task has `task.id`:
273
299
  Calls a private function (`private_<function>`) on the remote end, passing
@@ -277,7 +303,7 @@ def create_remote_config(
277
303
  `config=None`
278
304
  `function="evaluate"`
279
305
  - Example Output:
280
- `[HudStyleConfig(function='private_evaluate', args=['t3'])]`
306
+ `[FunctionConfig(function='private_evaluate', args=['t3'])]`
281
307
 
282
308
  5) No explicit `config` and no relevant Task info:
283
309
  Calls the top-level `function` with empty args.
@@ -286,50 +312,50 @@ def create_remote_config(
286
312
  `config=None`
287
313
  `function="evaluate"`
288
314
  - Example Output:
289
- `[HudStyleConfig(function='evaluate', args=[])]`
315
+ `[FunctionConfig(function='evaluate', args=[])]`
290
316
  """
291
317
  # If no function provided, just expand the config and return it directly
292
318
  if function is None:
293
319
  if config:
294
320
  return expand_config(config)
295
321
  raise ValueError("Either function or config must be provided")
296
-
322
+
297
323
  # Case 1: Explicit config provided
298
324
  if config:
299
325
  expanded_configs = expand_config(config)
300
- if env and env.final_response:
326
+ if env and env.final_response and expanded_configs[0].args[0] in LOCAL_EVALUATORS:
301
327
  # Ensure args is a list before appending
302
328
  if not isinstance(expanded_configs[0].args, list):
303
- expanded_configs[0].args = [expanded_configs[0].args]
304
- expanded_configs[0].args.append(env.final_response) # for remote responses
305
- return [HudStyleConfig(function=function, args=expanded_configs)]
306
-
329
+ expanded_configs[0].args = [expanded_configs[0].args]
330
+ expanded_configs[0].args.append(env.final_response) # for remote responses
331
+ return [FunctionConfig(function=function, args=expanded_configs)]
332
+
307
333
  # Otherwise, use the environment's task
308
334
  task = env.task if env else None
309
-
335
+
310
336
  # Must have a task for the remaining cases
311
337
  if task is None:
312
338
  raise ValueError("Either task or config must be provided")
313
-
339
+
314
340
  # Case 2: Task has the specified function attribute
315
341
  task_config = getattr(task, function, None)
316
342
  if task_config:
317
343
  expanded_configs = expand_config(task_config)
318
344
  if task.id:
319
- expanded_configs[0].id = task.id # for remote IDs
320
- elif env and env.final_response:
345
+ expanded_configs[0].id = task.id # for remote IDs
346
+ if env and env.final_response and expanded_configs[0].args[0] in LOCAL_EVALUATORS:
321
347
  # Ensure args is a list before appending
322
348
  if not isinstance(expanded_configs[0].args, list):
323
- expanded_configs[0].args = [expanded_configs[0].args]
324
- expanded_configs[0].args.append(env.final_response) # for remote responses
325
- return [HudStyleConfig(function=function, args=expanded_configs)]
326
-
349
+ expanded_configs[0].args = [expanded_configs[0].args]
350
+ expanded_configs[0].args.append(env.final_response) # for remote responses
351
+ return [FunctionConfig(function=function, args=expanded_configs)]
352
+
327
353
  # Case 3: Check for task.config
328
354
  if hasattr(task, "config") and task.config:
329
355
  # Ensure task.config is a dictionary before adding id
330
356
  final_args = task.config.copy() if isinstance(task.config, dict) else {}
331
357
  if task.id:
332
- final_args["id"] = task.id # for remote IDs
358
+ final_args["id"] = task.id # for remote IDs
333
359
  if env and env.final_response:
334
360
  # Append response, ensuring args exists and is a list
335
361
  if "args" not in final_args:
@@ -337,18 +363,17 @@ def create_remote_config(
337
363
  if not isinstance(final_args["args"], list):
338
364
  final_args["args"] = [final_args["args"]]
339
365
  final_args["args"].append(env.final_response)
340
- return [HudStyleConfig(function=function, args=[final_args])]
341
-
366
+ return [FunctionConfig(function=function, args=[final_args])]
367
+
342
368
  # Case 4: Use task.id
343
369
  if task.id:
344
370
  args_list = [task.id]
345
371
  if env and env.final_response:
346
- args_list.append(env.final_response) # Append final response
347
- return [HudStyleConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=args_list)]
348
-
372
+ args_list.append(env.final_response) # Append final response
373
+ return [FunctionConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=args_list)]
374
+
349
375
  # Case 5: No valid configuration found
350
376
  args_list = []
351
377
  if env and env.final_response:
352
378
  args_list.append(env.final_response)
353
- return [HudStyleConfig(function=function, args=args_list)]
354
-
379
+ return [FunctionConfig(function=function, args=args_list)]
hud/env/local_docker_client.py CHANGED
@@ -19,15 +19,16 @@ if TYPE_CHECKING:
19
19
 
20
20
  logger = logging.getLogger("hud.env.docker_env_client")
21
21
 
22
+
22
23
  class LocalDockerClient(DockerClient):
23
24
  """
24
25
  Docker-based environment client implementation.
25
26
  """
26
27
 
27
28
  @classmethod
28
- async def create(cls, dockerfile: str, ports: list[int] | None = None) -> tuple[
29
- LocalDockerClient, dict[str, Any]
30
- ]:
29
+ async def create(
30
+ cls, dockerfile: str, ports: list[int] | None = None
31
+ ) -> tuple[LocalDockerClient, dict[str, Any]]:
31
32
  """
32
33
  Creates a Docker environment client from a dockerfile.
33
34
 
@@ -86,9 +87,7 @@ class LocalDockerClient(DockerClient):
86
87
  "HostConfig": {
87
88
  "PublishAllPorts": True,
88
89
  },
89
- "ExposedPorts": {
90
- f"{port}/tcp": {} for port in ports
91
- },
90
+ "ExposedPorts": {f"{port}/tcp": {} for port in ports},
92
91
  }
93
92
 
94
93
  container = await docker_client.containers.create(config=container_config)
@@ -198,7 +197,6 @@ class LocalDockerClient(DockerClient):
198
197
  exit_code=0,
199
198
  )
200
199
 
201
-
202
200
  async def get_archive(self, path: str) -> bytes:
203
201
  """
204
202
  Get an archive of a path from the container.
hud/env/remote_client.py CHANGED
@@ -5,23 +5,25 @@ from base64 import b64decode
5
5
  from typing import TYPE_CHECKING, Any
6
6
 
7
7
  from hud.env.client import Client
8
+ from hud.exceptions import HudResponseError
8
9
  from hud.server import make_request
9
10
  from hud.settings import settings
10
11
  from hud.types import EnvironmentStatus
11
12
  from hud.utils import ExecuteResult
12
13
 
13
14
  if TYPE_CHECKING:
14
- from hud.utils.config import HudStyleConfig
15
+ from hud.utils.config import FunctionConfig
15
16
 
16
17
  logger = logging.getLogger("hud.env.remote_env_client")
17
18
 
19
+
18
20
  class RemoteClient(Client):
19
21
  """
20
22
  Remote environment client implementation.
21
-
23
+
22
24
  Uses the HUD API to manage a remote environment.
23
25
  """
24
-
26
+
25
27
  @classmethod
26
28
  async def create(
27
29
  cls,
@@ -33,21 +35,23 @@ class RemoteClient(Client):
33
35
  ) -> tuple[RemoteClient, dict[str, Any]]:
34
36
  """
35
37
  Creates a remote environment client from a dockerfile or gym_id.
36
-
38
+
37
39
  Args:
38
40
  dockerfile: The dockerfile content to build the environment
39
41
  gym_id: The gym_id of the environment to create
40
42
  metadata: Metadata to associate with the environment
41
-
43
+
42
44
  Returns:
43
- RemoteClient: An instance of the remote environment client
45
+ A tuple containing the remote environment client and the build metadata
46
+
47
+ Raises:
48
+ HudResponseError: If the environment creation is successful but the response is invalid.
44
49
  """
45
50
 
46
51
  # Validate arguments
47
52
  if metadata is None:
48
53
  metadata = {}
49
54
 
50
-
51
55
  request_data = {
52
56
  # still named run_id for backwards compatibility
53
57
  "run_id": job_id,
@@ -63,29 +67,38 @@ class RemoteClient(Client):
63
67
  json=request_data,
64
68
  api_key=settings.api_key,
65
69
  )
66
-
70
+
67
71
  # Get the environment ID from the response
68
72
  env_id = response.get("id")
69
73
  if not env_id:
70
- raise ValueError("Failed to create remote environment: No ID returned")
71
-
74
+ raise HudResponseError(
75
+ message="Failed to create remote environment: No ID returned in API response. "
76
+ "Please contact support if this issue persists.",
77
+ response_json=response,
78
+ )
79
+
72
80
  # Create the controller instance
73
81
  controller = cls(env_id)
74
-
82
+
75
83
  build_data = response.get("metadata", {})
76
-
84
+
85
+ if response.get("readme"):
86
+ logger.info(
87
+ "[HUD] %s gym created, see how to use it at %s", gym_id, response.get("readme")
88
+ )
89
+
77
90
  return controller, build_data
78
91
 
79
92
  def __init__(self, env_id: str) -> None:
80
93
  """
81
94
  Initialize the RemoteClient.
82
-
95
+
83
96
  Args:
84
97
  env_id: ID of the remote environment to control
85
98
  """
86
99
  super().__init__()
87
100
  self._env_id = env_id
88
-
101
+
89
102
  @property
90
103
  def env_id(self) -> str:
91
104
  """The ID of the remote environment."""
@@ -94,7 +107,7 @@ class RemoteClient(Client):
94
107
  async def get_status(self) -> EnvironmentStatus:
95
108
  """
96
109
  Get the current status of the remote environment.
97
-
110
+
98
111
  Returns:
99
112
  EnvironmentStatus: The current status of the environment
100
113
  """
@@ -107,7 +120,7 @@ class RemoteClient(Client):
107
120
  logger.debug("Environment status response: %s", response)
108
121
 
109
122
  status = response.get("state", "").lower()
110
-
123
+
111
124
  if status == "running":
112
125
  return EnvironmentStatus.RUNNING
113
126
  elif status == "initializing" or status == "pending":
@@ -118,12 +131,12 @@ class RemoteClient(Client):
118
131
  # Any other status is considered an error
119
132
  logger.warning("Abnormal environment status response: %s", response)
120
133
  return EnvironmentStatus.ERROR
121
-
134
+
122
135
  except Exception:
123
136
  # If we can't connect to the API or there's any other error
124
137
  logger.info("(potentially transient) Error getting environment status")
125
138
  return EnvironmentStatus.ERROR
126
-
139
+
127
140
  async def execute(
128
141
  self,
129
142
  command: list[str],
@@ -134,11 +147,11 @@ class RemoteClient(Client):
134
147
  """
135
148
  Execute a command in the environment.
136
149
  No-op in some environments (like browser use).
137
-
150
+
138
151
  Args:
139
152
  command: Command to execute
140
153
  workdir: Working directory for the command (ignored for remote environments)
141
-
154
+
142
155
  Returns:
143
156
  ExecuteResult: Result of the command execution
144
157
  """
@@ -146,21 +159,20 @@ class RemoteClient(Client):
146
159
  method="POST",
147
160
  url=f"{settings.base_url}/v2/environments/{self.env_id}/execute",
148
161
  json={
149
- "command": command,
150
- "workdir": workdir,
151
- "timeout": timeout,
162
+ "command": command,
163
+ "workdir": workdir,
164
+ "timeout": timeout,
152
165
  },
153
166
  api_key=settings.api_key,
154
167
  )
155
-
168
+
156
169
  return ExecuteResult(
157
170
  stdout=b64decode(data["stdout"]),
158
171
  stderr=b64decode(data["stderr"]),
159
- exit_code=data["exit_code"]
172
+ exit_code=data["exit_code"],
160
173
  )
161
174
 
162
-
163
- async def invoke(self, config: HudStyleConfig) -> tuple[Any, bytes, bytes]:
175
+ async def invoke(self, config: FunctionConfig) -> tuple[Any, bytes, bytes]:
164
176
  """
165
177
  Invoke a function in the environment.
166
178
  """
@@ -170,9 +182,8 @@ class RemoteClient(Client):
170
182
  json=config.model_dump(),
171
183
  api_key=settings.api_key,
172
184
  )
173
-
174
- return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])
175
185
 
186
+ return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])
176
187
 
177
188
  async def close(self) -> None:
178
189
  """
hud/env/remote_docker_client.py CHANGED
@@ -5,6 +5,7 @@ from base64 import b64decode, b64encode
5
5
  from typing import Any
6
6
 
7
7
  from hud.env.docker_client import DockerClient
8
+ from hud.exceptions import HudResponseError
8
9
  from hud.server import make_request
9
10
  from hud.settings import settings
10
11
  from hud.types import EnvironmentStatus
@@ -39,7 +40,10 @@ class RemoteDockerClient(DockerClient):
39
40
  metadata: Metadata to associate with the environment
40
41
 
41
42
  Returns:
42
- RemoteClient: An instance of the remote environment client
43
+ A tuple containing the remote environment client and the build metadata
44
+
45
+ Raises:
46
+ HudResponseError: If the environment creation fails.
43
47
  """
44
48
 
45
49
  # Validate arguments
@@ -48,7 +52,7 @@ class RemoteDockerClient(DockerClient):
48
52
 
49
53
  logger.info("Creating remote environment")
50
54
 
51
- true_gym_id = await get_gym_id("local-docker")
55
+ true_gym_id = await get_gym_id("docker")
52
56
 
53
57
  # augment metadata with dockerfile
54
58
  if "environment_config" not in metadata:
@@ -73,7 +77,13 @@ class RemoteDockerClient(DockerClient):
73
77
  # Get the environment ID from the response
74
78
  env_id = response.get("id")
75
79
  if not env_id:
76
- raise ValueError("Failed to create remote environment: No ID returned")
80
+ raise HudResponseError(
81
+ message=(
82
+ "Failed to create remote environment: No ID returned in API response. "
83
+ "Please contact support if this issue persists."
84
+ ),
85
+ response_json=response,
86
+ )
77
87
 
78
88
  # Create the controller instance
79
89
  controller = cls(env_id)
hud/evaluators/__init__.py CHANGED
@@ -1,10 +1,9 @@
1
1
  """
2
2
  Evaluators for assessing task responses.
3
3
  """
4
+
4
5
  from __future__ import annotations
5
6
 
6
7
  from hud.evaluators.base import Evaluator
7
8
 
8
- __all__ = [
9
- "Evaluator"
10
- ]
9
+ __all__ = ["Evaluator"]
hud/evaluators/base.py CHANGED
@@ -11,21 +11,22 @@ if TYPE_CHECKING:
11
11
 
12
12
  class EvaluationResult(BaseModel):
13
13
  """Result of an evaluation.
14
-
14
+
15
15
  Attributes:
16
16
  score: Float score between 0 and 1
17
17
  reason: Explanation of the evaluation
18
18
  mode: Mode used for matching, if applicable
19
19
  """
20
-
20
+
21
21
  score: float
22
22
  reason: str
23
23
  mode: str | None = None
24
24
  criteria_scores: dict[str, float] | None = Field(default_factory=dict)
25
25
 
26
+
26
27
  class Evaluator(ABC):
27
28
  """Abstract base class for evaluators."""
28
-
29
+
29
30
  @abstractmethod
30
31
  def evaluate(self, task: Task, response: str) -> EvaluationResult:
31
32
  """Evaluate a task and response."""
hud/evaluators/inspect.py CHANGED
@@ -10,20 +10,15 @@ def inspect_evaluate(
10
10
  answer: Any,
11
11
  ) -> EvaluationResult:
12
12
  """Evaluate using Inspect-ai's evaluation models.
13
-
13
+
14
14
  Args:
15
15
  response: The response to evaluate
16
16
  answer: The reference answer to compare against
17
17
  model_name: The Inspect model to use
18
18
  prompt: Optional custom prompt for evaluation
19
19
  metrics: Optional list of metrics to evaluate against
20
-
20
+
21
21
  Returns:
22
22
  EvaluationResult with the evaluation results
23
23
  """
24
- return EvaluationResult(
25
- score=0.0,
26
- reason="Inspect evaluation not implemented",
27
- mode="inspect"
28
- )
29
-
24
+ return EvaluationResult(score=0.0, reason="Inspect evaluation not implemented", mode="inspect")