hud-python 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +5 -3
- hud/adapters/__init__.py +2 -1
- hud/adapters/claude/adapter.py +13 -17
- hud/adapters/common/adapter.py +3 -3
- hud/adapters/common/tests/__init__.py +0 -0
- hud/adapters/common/tests/test_adapter.py +277 -0
- hud/adapters/common/types.py +3 -6
- hud/adapters/operator/adapter.py +22 -29
- hud/agent/__init__.py +9 -1
- hud/agent/base.py +28 -28
- hud/agent/claude.py +69 -60
- hud/agent/langchain.py +204 -0
- hud/agent/operator.py +75 -67
- hud/env/__init__.py +5 -5
- hud/env/client.py +2 -2
- hud/env/docker_client.py +37 -39
- hud/env/environment.py +91 -66
- hud/env/local_docker_client.py +5 -7
- hud/env/remote_client.py +40 -29
- hud/env/remote_docker_client.py +13 -3
- hud/evaluators/__init__.py +2 -3
- hud/evaluators/base.py +4 -3
- hud/evaluators/inspect.py +3 -8
- hud/evaluators/judge.py +34 -58
- hud/evaluators/match.py +42 -49
- hud/evaluators/remote.py +13 -26
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +12 -0
- hud/evaluators/tests/test_judge.py +231 -0
- hud/evaluators/tests/test_match.py +115 -0
- hud/evaluators/tests/test_remote.py +98 -0
- hud/exceptions.py +167 -0
- hud/gym.py +12 -10
- hud/job.py +525 -47
- hud/server/__init__.py +2 -2
- hud/server/requests.py +148 -186
- hud/server/tests/__init__.py +0 -0
- hud/server/tests/test_requests.py +275 -0
- hud/settings.py +3 -2
- hud/task.py +12 -22
- hud/taskset.py +44 -11
- hud/trajectory.py +6 -9
- hud/types.py +14 -9
- hud/utils/__init__.py +2 -2
- hud/utils/common.py +37 -13
- hud/utils/config.py +44 -29
- hud/utils/progress.py +149 -0
- hud/utils/telemetry.py +10 -11
- hud/utils/tests/__init__.py +0 -0
- hud/utils/tests/test_common.py +52 -0
- hud/utils/tests/test_config.py +129 -0
- hud/utils/tests/test_progress.py +225 -0
- hud/utils/tests/test_telemetry.py +37 -0
- hud/utils/tests/test_version.py +8 -0
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/METADATA +44 -21
- hud_python-0.2.3.dist-info/RECORD +62 -0
- hud_python-0.2.1.dist-info/RECORD +0 -44
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/WHEEL +0 -0
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/licenses/LICENSE +0 -0
hud/env/environment.py
CHANGED
|
@@ -10,25 +10,21 @@ from pydantic import BaseModel
|
|
|
10
10
|
from hud.env.client import Client
|
|
11
11
|
from hud.env.remote_client import RemoteClient
|
|
12
12
|
from hud.task import Task
|
|
13
|
-
from hud.utils.common import
|
|
14
|
-
from hud.utils.config import
|
|
13
|
+
from hud.utils.common import FunctionConfig, FunctionConfigs, Observation
|
|
14
|
+
from hud.utils.config import (
|
|
15
|
+
LOCAL_EVALUATORS,
|
|
16
|
+
REMOTE_EVALUATE,
|
|
17
|
+
REMOTE_FUNCTION_PREFIX,
|
|
18
|
+
REMOTE_SETUP,
|
|
19
|
+
expand_config,
|
|
20
|
+
)
|
|
21
|
+
from hud.utils.telemetry import stream
|
|
15
22
|
|
|
16
23
|
logger = logging.getLogger("hud.environment")
|
|
17
24
|
|
|
18
25
|
if TYPE_CHECKING:
|
|
19
26
|
from hud.adapters.common import CLA
|
|
20
|
-
|
|
21
|
-
class Observation(BaseModel):
|
|
22
|
-
"""
|
|
23
|
-
Observation from the environment.
|
|
24
|
-
|
|
25
|
-
Attributes:
|
|
26
|
-
screenshot: Base64 encoded PNG string of the screen
|
|
27
|
-
text: Text observation, if available
|
|
28
|
-
"""
|
|
29
|
-
|
|
30
|
-
screenshot: str | None = None # base64 string png
|
|
31
|
-
text: str | None = None
|
|
27
|
+
from hud.agent import Agent
|
|
32
28
|
|
|
33
29
|
|
|
34
30
|
class Environment(BaseModel):
|
|
@@ -48,7 +44,7 @@ class Environment(BaseModel):
|
|
|
48
44
|
# final response
|
|
49
45
|
final_response: str | None = None
|
|
50
46
|
|
|
51
|
-
async def _invoke_all(self, configs:
|
|
47
|
+
async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]:
|
|
52
48
|
# Execute each config and collect results
|
|
53
49
|
configs_all = [configs] if not isinstance(configs, list) else configs
|
|
54
50
|
results = []
|
|
@@ -69,8 +65,8 @@ class Environment(BaseModel):
|
|
|
69
65
|
stderr.decode(),
|
|
70
66
|
)
|
|
71
67
|
return results
|
|
72
|
-
|
|
73
|
-
async def _setup(self, config:
|
|
68
|
+
|
|
69
|
+
async def _setup(self, config: FunctionConfigs | None = None) -> None:
|
|
74
70
|
"""
|
|
75
71
|
Setup the environment.
|
|
76
72
|
|
|
@@ -87,7 +83,7 @@ class Environment(BaseModel):
|
|
|
87
83
|
else:
|
|
88
84
|
raise ValueError("No config or task provided for local environment")
|
|
89
85
|
|
|
90
|
-
async def evaluate(self, config:
|
|
86
|
+
async def evaluate(self, config: FunctionConfigs | None = None) -> Any:
|
|
91
87
|
"""
|
|
92
88
|
Evaluate the environment.
|
|
93
89
|
|
|
@@ -98,8 +94,7 @@ class Environment(BaseModel):
|
|
|
98
94
|
Any: Result of the evaluation
|
|
99
95
|
"""
|
|
100
96
|
if isinstance(self.client, RemoteClient):
|
|
101
|
-
results = await self._invoke_all(
|
|
102
|
-
create_remote_config(self, config, REMOTE_EVALUATE))
|
|
97
|
+
results = await self._invoke_all(create_remote_config(self, config, REMOTE_EVALUATE))
|
|
103
98
|
else:
|
|
104
99
|
if config is not None:
|
|
105
100
|
results = await self._invoke_all(config)
|
|
@@ -111,11 +106,10 @@ class Environment(BaseModel):
|
|
|
111
106
|
return results[0]
|
|
112
107
|
else:
|
|
113
108
|
return results
|
|
114
|
-
|
|
115
109
|
|
|
116
|
-
async def reset(
|
|
117
|
-
|
|
118
|
-
]:
|
|
110
|
+
async def reset(
|
|
111
|
+
self, configs: FunctionConfigs | None = None
|
|
112
|
+
) -> tuple[Observation, dict[str, Any]]:
|
|
119
113
|
"""
|
|
120
114
|
Reset the environment.
|
|
121
115
|
|
|
@@ -126,15 +120,15 @@ class Environment(BaseModel):
|
|
|
126
120
|
Observation: The first observation from the environment
|
|
127
121
|
info: Dictionary of information about the environment
|
|
128
122
|
"""
|
|
129
|
-
#await self._setup(configs)
|
|
123
|
+
# await self._setup(configs)
|
|
130
124
|
obs, _, _, info = await self.step()
|
|
131
125
|
if self.task and self.task.prompt:
|
|
132
126
|
obs.text = self.task.prompt
|
|
133
127
|
return obs, info
|
|
134
128
|
|
|
135
|
-
async def step(
|
|
136
|
-
|
|
137
|
-
]:
|
|
129
|
+
async def step(
|
|
130
|
+
self, actions: CLA | list[CLA] | None = None
|
|
131
|
+
) -> tuple[Observation, float, bool, dict[str, Any]]:
|
|
138
132
|
"""Execute a step in the environment.
|
|
139
133
|
|
|
140
134
|
Args:
|
|
@@ -143,6 +137,8 @@ class Environment(BaseModel):
|
|
|
143
137
|
Returns:
|
|
144
138
|
Any: Result of the step execution
|
|
145
139
|
"""
|
|
140
|
+
if not isinstance(actions, list) and actions is not None:
|
|
141
|
+
actions = [actions]
|
|
146
142
|
if actions is None or len(actions) == 0:
|
|
147
143
|
actions = []
|
|
148
144
|
args = [[action.model_dump() for action in actions]]
|
|
@@ -150,20 +146,19 @@ class Environment(BaseModel):
|
|
|
150
146
|
# TODO: Move this into the server side
|
|
151
147
|
if self._maybe_store_response(actions):
|
|
152
148
|
return Observation(text=self.final_response), 0, False, {}
|
|
153
|
-
|
|
149
|
+
|
|
154
150
|
result, stdout, stderr = await self.client.invoke(
|
|
155
|
-
|
|
151
|
+
FunctionConfig(function="step", args=args)
|
|
156
152
|
)
|
|
157
153
|
if stdout:
|
|
158
154
|
logger.info("Step produced stdout: %s", stdout.decode())
|
|
159
155
|
if stderr:
|
|
160
156
|
logger.warning("Step produced stderr: %s", stderr.decode())
|
|
161
157
|
|
|
162
|
-
|
|
163
158
|
observation = Observation.model_validate(result["observation"], strict=True)
|
|
164
159
|
|
|
165
160
|
return observation, 0, False, {}
|
|
166
|
-
|
|
161
|
+
|
|
167
162
|
def _maybe_store_response(self, actions: list[CLA]) -> bool:
|
|
168
163
|
"""Store the final response into the environment.
|
|
169
164
|
|
|
@@ -178,14 +173,13 @@ class Environment(BaseModel):
|
|
|
178
173
|
return True
|
|
179
174
|
return False
|
|
180
175
|
|
|
181
|
-
|
|
182
176
|
async def get_urls(self) -> dict[str, Any]:
|
|
183
177
|
"""Get URLs for the environment.
|
|
184
178
|
|
|
185
179
|
Returns:
|
|
186
180
|
dict: Dictionary of URLs for accessing the environment
|
|
187
181
|
"""
|
|
188
|
-
data, _, _ = await self.client.invoke(
|
|
182
|
+
data, _, _ = await self.client.invoke(FunctionConfig(function="get_urls", args=[]))
|
|
189
183
|
|
|
190
184
|
self.url = data.get("url")
|
|
191
185
|
self.live_url = data.get("live_url")
|
|
@@ -202,11 +196,43 @@ class Environment(BaseModel):
|
|
|
202
196
|
"""
|
|
203
197
|
await self.client.close()
|
|
204
198
|
|
|
199
|
+
async def stream(self) -> str | None:
|
|
200
|
+
urls = await self.get_urls()
|
|
201
|
+
if urls["live_url"] is None:
|
|
202
|
+
logger.warning("No live URL found")
|
|
203
|
+
return None
|
|
204
|
+
# Stream the live view
|
|
205
|
+
return stream(urls["live_url"])
|
|
206
|
+
|
|
207
|
+
async def run(self, agent: Agent, max_steps: int = 27, verbose: bool = True) -> Any:
|
|
208
|
+
"""Run an agent in the environment.
|
|
209
|
+
|
|
210
|
+
Args:
|
|
211
|
+
agent: The agent to run
|
|
212
|
+
"""
|
|
213
|
+
if verbose:
|
|
214
|
+
logger.info("[HUD] Running agent in environment...")
|
|
215
|
+
obs, _ = await self.reset()
|
|
216
|
+
for i in range(max_steps):
|
|
217
|
+
action, done = await agent.predict(obs)
|
|
218
|
+
if verbose:
|
|
219
|
+
logger.info("[HUD] Step %d: Action: %s", i, action)
|
|
220
|
+
obs, reward, terminated, info = await self.step(action)
|
|
221
|
+
if verbose:
|
|
222
|
+
logger.info("[HUD] Step %d: Observation: %s", i, obs)
|
|
223
|
+
if done or terminated:
|
|
224
|
+
break
|
|
225
|
+
result = await self.evaluate()
|
|
226
|
+
if verbose:
|
|
227
|
+
logger.info("[HUD] Evaluation result: %s", result)
|
|
228
|
+
return result
|
|
229
|
+
|
|
230
|
+
|
|
205
231
|
def create_remote_config(
|
|
206
232
|
env: Environment | None = None,
|
|
207
|
-
config:
|
|
233
|
+
config: FunctionConfigs | None = None,
|
|
208
234
|
function: str | None = None,
|
|
209
|
-
) -> list[
|
|
235
|
+
) -> list[FunctionConfig]:
|
|
210
236
|
"""
|
|
211
237
|
Create a remote configuration for setup or evaluate, determining the final
|
|
212
238
|
function call structure based on the provided task or explicit config.
|
|
@@ -218,11 +244,11 @@ def create_remote_config(
|
|
|
218
244
|
env: Environment object, potentially containing a task definition.
|
|
219
245
|
Used to access `env.task` and `env.final_response`.
|
|
220
246
|
config: Direct configuration override (e.g., passed to `env.evaluate(config=...)`).
|
|
221
|
-
Can be in various
|
|
247
|
+
Can be in various FunctionConfigs formats.
|
|
222
248
|
function: The top-level function context, typically "setup" or "evaluate".
|
|
223
249
|
|
|
224
250
|
Returns:
|
|
225
|
-
list[
|
|
251
|
+
list[FunctionConfig]: A list containing a single FunctionConfig object
|
|
226
252
|
ready for remote invocation via `client.invoke`.
|
|
227
253
|
The specific function/arguments are chosen based on this priority:
|
|
228
254
|
1. Explicit `config` parameter (if provided).
|
|
@@ -242,8 +268,8 @@ def create_remote_config(
|
|
|
242
268
|
`config=("contains_text", "Paris")`
|
|
243
269
|
`function="evaluate"`
|
|
244
270
|
- Example Output:
|
|
245
|
-
`[
|
|
246
|
-
|
|
271
|
+
`[FunctionConfig(function='evaluate', args=[
|
|
272
|
+
FunctionConfig(function='contains_text', args=['Paris', 'Paris'])
|
|
247
273
|
])]`
|
|
248
274
|
|
|
249
275
|
2) No explicit `config`, Task has the attribute (e.g., `task.evaluate`):
|
|
@@ -255,7 +281,7 @@ def create_remote_config(
|
|
|
255
281
|
`config=None`
|
|
256
282
|
`function="evaluate"`
|
|
257
283
|
- Example Output:
|
|
258
|
-
`[
|
|
284
|
+
`[FunctionConfig(function='evaluate', args=[FunctionConfig(function='check_answer',
|
|
259
285
|
args=['42'], id='t1')])]`
|
|
260
286
|
|
|
261
287
|
3) No explicit `config`, no specific Task attribute, Task has `task.config`:
|
|
@@ -267,7 +293,7 @@ def create_remote_config(
|
|
|
267
293
|
`config=None`
|
|
268
294
|
`function="evaluate"`
|
|
269
295
|
- Example Output:
|
|
270
|
-
`[
|
|
296
|
+
`[FunctionConfig(function='evaluate', args=[{"expected": "val", "id": "t2"}])]`
|
|
271
297
|
|
|
272
298
|
4) No explicit `config`, no specific Task attribute, no `task.config`, Task has `task.id`:
|
|
273
299
|
Calls a private function (`private_<function>`) on the remote end, passing
|
|
@@ -277,7 +303,7 @@ def create_remote_config(
|
|
|
277
303
|
`config=None`
|
|
278
304
|
`function="evaluate"`
|
|
279
305
|
- Example Output:
|
|
280
|
-
`[
|
|
306
|
+
`[FunctionConfig(function='private_evaluate', args=['t3'])]`
|
|
281
307
|
|
|
282
308
|
5) No explicit `config` and no relevant Task info:
|
|
283
309
|
Calls the top-level `function` with empty args.
|
|
@@ -286,50 +312,50 @@ def create_remote_config(
|
|
|
286
312
|
`config=None`
|
|
287
313
|
`function="evaluate"`
|
|
288
314
|
- Example Output:
|
|
289
|
-
`[
|
|
315
|
+
`[FunctionConfig(function='evaluate', args=[])]`
|
|
290
316
|
"""
|
|
291
317
|
# If no function provided, just expand the config and return it directly
|
|
292
318
|
if function is None:
|
|
293
319
|
if config:
|
|
294
320
|
return expand_config(config)
|
|
295
321
|
raise ValueError("Either function or config must be provided")
|
|
296
|
-
|
|
322
|
+
|
|
297
323
|
# Case 1: Explicit config provided
|
|
298
324
|
if config:
|
|
299
325
|
expanded_configs = expand_config(config)
|
|
300
|
-
if env and env.final_response:
|
|
326
|
+
if env and env.final_response and expanded_configs[0].args[0] in LOCAL_EVALUATORS:
|
|
301
327
|
# Ensure args is a list before appending
|
|
302
328
|
if not isinstance(expanded_configs[0].args, list):
|
|
303
|
-
|
|
304
|
-
expanded_configs[0].args.append(env.final_response)
|
|
305
|
-
return [
|
|
306
|
-
|
|
329
|
+
expanded_configs[0].args = [expanded_configs[0].args]
|
|
330
|
+
expanded_configs[0].args.append(env.final_response) # for remote responses
|
|
331
|
+
return [FunctionConfig(function=function, args=expanded_configs)]
|
|
332
|
+
|
|
307
333
|
# Otherwise, use the environment's task
|
|
308
334
|
task = env.task if env else None
|
|
309
|
-
|
|
335
|
+
|
|
310
336
|
# Must have a task for the remaining cases
|
|
311
337
|
if task is None:
|
|
312
338
|
raise ValueError("Either task or config must be provided")
|
|
313
|
-
|
|
339
|
+
|
|
314
340
|
# Case 2: Task has the specified function attribute
|
|
315
341
|
task_config = getattr(task, function, None)
|
|
316
342
|
if task_config:
|
|
317
343
|
expanded_configs = expand_config(task_config)
|
|
318
344
|
if task.id:
|
|
319
|
-
expanded_configs[0].id = task.id
|
|
320
|
-
|
|
345
|
+
expanded_configs[0].id = task.id # for remote IDs
|
|
346
|
+
if env and env.final_response and expanded_configs[0].args[0] in LOCAL_EVALUATORS:
|
|
321
347
|
# Ensure args is a list before appending
|
|
322
348
|
if not isinstance(expanded_configs[0].args, list):
|
|
323
|
-
|
|
324
|
-
expanded_configs[0].args.append(env.final_response)
|
|
325
|
-
return [
|
|
326
|
-
|
|
349
|
+
expanded_configs[0].args = [expanded_configs[0].args]
|
|
350
|
+
expanded_configs[0].args.append(env.final_response) # for remote responses
|
|
351
|
+
return [FunctionConfig(function=function, args=expanded_configs)]
|
|
352
|
+
|
|
327
353
|
# Case 3: Check for task.config
|
|
328
354
|
if hasattr(task, "config") and task.config:
|
|
329
355
|
# Ensure task.config is a dictionary before adding id
|
|
330
356
|
final_args = task.config.copy() if isinstance(task.config, dict) else {}
|
|
331
357
|
if task.id:
|
|
332
|
-
final_args["id"] = task.id
|
|
358
|
+
final_args["id"] = task.id # for remote IDs
|
|
333
359
|
if env and env.final_response:
|
|
334
360
|
# Append response, ensuring args exists and is a list
|
|
335
361
|
if "args" not in final_args:
|
|
@@ -337,18 +363,17 @@ def create_remote_config(
|
|
|
337
363
|
if not isinstance(final_args["args"], list):
|
|
338
364
|
final_args["args"] = [final_args["args"]]
|
|
339
365
|
final_args["args"].append(env.final_response)
|
|
340
|
-
return [
|
|
341
|
-
|
|
366
|
+
return [FunctionConfig(function=function, args=[final_args])]
|
|
367
|
+
|
|
342
368
|
# Case 4: Use task.id
|
|
343
369
|
if task.id:
|
|
344
370
|
args_list = [task.id]
|
|
345
371
|
if env and env.final_response:
|
|
346
|
-
|
|
347
|
-
return [
|
|
348
|
-
|
|
372
|
+
args_list.append(env.final_response) # Append final response
|
|
373
|
+
return [FunctionConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=args_list)]
|
|
374
|
+
|
|
349
375
|
# Case 5: No valid configuration found
|
|
350
376
|
args_list = []
|
|
351
377
|
if env and env.final_response:
|
|
352
378
|
args_list.append(env.final_response)
|
|
353
|
-
return [
|
|
354
|
-
|
|
379
|
+
return [FunctionConfig(function=function, args=args_list)]
|
hud/env/local_docker_client.py
CHANGED
|
@@ -19,15 +19,16 @@ if TYPE_CHECKING:
|
|
|
19
19
|
|
|
20
20
|
logger = logging.getLogger("hud.env.docker_env_client")
|
|
21
21
|
|
|
22
|
+
|
|
22
23
|
class LocalDockerClient(DockerClient):
|
|
23
24
|
"""
|
|
24
25
|
Docker-based environment client implementation.
|
|
25
26
|
"""
|
|
26
27
|
|
|
27
28
|
@classmethod
|
|
28
|
-
async def create(
|
|
29
|
-
|
|
30
|
-
|
|
29
|
+
async def create(
|
|
30
|
+
cls, dockerfile: str, ports: list[int] | None = None
|
|
31
|
+
) -> tuple[LocalDockerClient, dict[str, Any]]:
|
|
31
32
|
"""
|
|
32
33
|
Creates a Docker environment client from a dockerfile.
|
|
33
34
|
|
|
@@ -86,9 +87,7 @@ class LocalDockerClient(DockerClient):
|
|
|
86
87
|
"HostConfig": {
|
|
87
88
|
"PublishAllPorts": True,
|
|
88
89
|
},
|
|
89
|
-
"ExposedPorts": {
|
|
90
|
-
f"{port}/tcp": {} for port in ports
|
|
91
|
-
},
|
|
90
|
+
"ExposedPorts": {f"{port}/tcp": {} for port in ports},
|
|
92
91
|
}
|
|
93
92
|
|
|
94
93
|
container = await docker_client.containers.create(config=container_config)
|
|
@@ -198,7 +197,6 @@ class LocalDockerClient(DockerClient):
|
|
|
198
197
|
exit_code=0,
|
|
199
198
|
)
|
|
200
199
|
|
|
201
|
-
|
|
202
200
|
async def get_archive(self, path: str) -> bytes:
|
|
203
201
|
"""
|
|
204
202
|
Get an archive of a path from the container.
|
hud/env/remote_client.py
CHANGED
|
@@ -5,23 +5,25 @@ from base64 import b64decode
|
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
7
|
from hud.env.client import Client
|
|
8
|
+
from hud.exceptions import HudResponseError
|
|
8
9
|
from hud.server import make_request
|
|
9
10
|
from hud.settings import settings
|
|
10
11
|
from hud.types import EnvironmentStatus
|
|
11
12
|
from hud.utils import ExecuteResult
|
|
12
13
|
|
|
13
14
|
if TYPE_CHECKING:
|
|
14
|
-
from hud.utils.config import
|
|
15
|
+
from hud.utils.config import FunctionConfig
|
|
15
16
|
|
|
16
17
|
logger = logging.getLogger("hud.env.remote_env_client")
|
|
17
18
|
|
|
19
|
+
|
|
18
20
|
class RemoteClient(Client):
|
|
19
21
|
"""
|
|
20
22
|
Remote environment client implementation.
|
|
21
|
-
|
|
23
|
+
|
|
22
24
|
Uses the HUD API to manage a remote environment.
|
|
23
25
|
"""
|
|
24
|
-
|
|
26
|
+
|
|
25
27
|
@classmethod
|
|
26
28
|
async def create(
|
|
27
29
|
cls,
|
|
@@ -33,21 +35,23 @@ class RemoteClient(Client):
|
|
|
33
35
|
) -> tuple[RemoteClient, dict[str, Any]]:
|
|
34
36
|
"""
|
|
35
37
|
Creates a remote environment client from a dockerfile or gym_id.
|
|
36
|
-
|
|
38
|
+
|
|
37
39
|
Args:
|
|
38
40
|
dockerfile: The dockerfile content to build the environment
|
|
39
41
|
gym_id: The gym_id of the environment to create
|
|
40
42
|
metadata: Metadata to associate with the environment
|
|
41
|
-
|
|
43
|
+
|
|
42
44
|
Returns:
|
|
43
|
-
|
|
45
|
+
A tuple containing the remote environment client and the build metadata
|
|
46
|
+
|
|
47
|
+
Raises:
|
|
48
|
+
HudResponseError: If the environment creation is successful but the response is invalid.
|
|
44
49
|
"""
|
|
45
50
|
|
|
46
51
|
# Validate arguments
|
|
47
52
|
if metadata is None:
|
|
48
53
|
metadata = {}
|
|
49
54
|
|
|
50
|
-
|
|
51
55
|
request_data = {
|
|
52
56
|
# still named run_id for backwards compatibility
|
|
53
57
|
"run_id": job_id,
|
|
@@ -63,29 +67,38 @@ class RemoteClient(Client):
|
|
|
63
67
|
json=request_data,
|
|
64
68
|
api_key=settings.api_key,
|
|
65
69
|
)
|
|
66
|
-
|
|
70
|
+
|
|
67
71
|
# Get the environment ID from the response
|
|
68
72
|
env_id = response.get("id")
|
|
69
73
|
if not env_id:
|
|
70
|
-
raise
|
|
71
|
-
|
|
74
|
+
raise HudResponseError(
|
|
75
|
+
message="Failed to create remote environment: No ID returned in API response. "
|
|
76
|
+
"Please contact support if this issue persists.",
|
|
77
|
+
response_json=response,
|
|
78
|
+
)
|
|
79
|
+
|
|
72
80
|
# Create the controller instance
|
|
73
81
|
controller = cls(env_id)
|
|
74
|
-
|
|
82
|
+
|
|
75
83
|
build_data = response.get("metadata", {})
|
|
76
|
-
|
|
84
|
+
|
|
85
|
+
if response.get("readme"):
|
|
86
|
+
logger.info(
|
|
87
|
+
"[HUD] %s gym created, see how to use it at %s", gym_id, response.get("readme")
|
|
88
|
+
)
|
|
89
|
+
|
|
77
90
|
return controller, build_data
|
|
78
91
|
|
|
79
92
|
def __init__(self, env_id: str) -> None:
|
|
80
93
|
"""
|
|
81
94
|
Initialize the RemoteClient.
|
|
82
|
-
|
|
95
|
+
|
|
83
96
|
Args:
|
|
84
97
|
env_id: ID of the remote environment to control
|
|
85
98
|
"""
|
|
86
99
|
super().__init__()
|
|
87
100
|
self._env_id = env_id
|
|
88
|
-
|
|
101
|
+
|
|
89
102
|
@property
|
|
90
103
|
def env_id(self) -> str:
|
|
91
104
|
"""The ID of the remote environment."""
|
|
@@ -94,7 +107,7 @@ class RemoteClient(Client):
|
|
|
94
107
|
async def get_status(self) -> EnvironmentStatus:
|
|
95
108
|
"""
|
|
96
109
|
Get the current status of the remote environment.
|
|
97
|
-
|
|
110
|
+
|
|
98
111
|
Returns:
|
|
99
112
|
EnvironmentStatus: The current status of the environment
|
|
100
113
|
"""
|
|
@@ -107,7 +120,7 @@ class RemoteClient(Client):
|
|
|
107
120
|
logger.debug("Environment status response: %s", response)
|
|
108
121
|
|
|
109
122
|
status = response.get("state", "").lower()
|
|
110
|
-
|
|
123
|
+
|
|
111
124
|
if status == "running":
|
|
112
125
|
return EnvironmentStatus.RUNNING
|
|
113
126
|
elif status == "initializing" or status == "pending":
|
|
@@ -118,12 +131,12 @@ class RemoteClient(Client):
|
|
|
118
131
|
# Any other status is considered an error
|
|
119
132
|
logger.warning("Abnormal environment status response: %s", response)
|
|
120
133
|
return EnvironmentStatus.ERROR
|
|
121
|
-
|
|
134
|
+
|
|
122
135
|
except Exception:
|
|
123
136
|
# If we can't connect to the API or there's any other error
|
|
124
137
|
logger.info("(potentially transient) Error getting environment status")
|
|
125
138
|
return EnvironmentStatus.ERROR
|
|
126
|
-
|
|
139
|
+
|
|
127
140
|
async def execute(
|
|
128
141
|
self,
|
|
129
142
|
command: list[str],
|
|
@@ -134,11 +147,11 @@ class RemoteClient(Client):
|
|
|
134
147
|
"""
|
|
135
148
|
Execute a command in the environment.
|
|
136
149
|
No-op in some environments (like browser use).
|
|
137
|
-
|
|
150
|
+
|
|
138
151
|
Args:
|
|
139
152
|
command: Command to execute
|
|
140
153
|
workdir: Working directory for the command (ignored for remote environments)
|
|
141
|
-
|
|
154
|
+
|
|
142
155
|
Returns:
|
|
143
156
|
ExecuteResult: Result of the command execution
|
|
144
157
|
"""
|
|
@@ -146,21 +159,20 @@ class RemoteClient(Client):
|
|
|
146
159
|
method="POST",
|
|
147
160
|
url=f"{settings.base_url}/v2/environments/{self.env_id}/execute",
|
|
148
161
|
json={
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
162
|
+
"command": command,
|
|
163
|
+
"workdir": workdir,
|
|
164
|
+
"timeout": timeout,
|
|
152
165
|
},
|
|
153
166
|
api_key=settings.api_key,
|
|
154
167
|
)
|
|
155
|
-
|
|
168
|
+
|
|
156
169
|
return ExecuteResult(
|
|
157
170
|
stdout=b64decode(data["stdout"]),
|
|
158
171
|
stderr=b64decode(data["stderr"]),
|
|
159
|
-
exit_code=data["exit_code"]
|
|
172
|
+
exit_code=data["exit_code"],
|
|
160
173
|
)
|
|
161
174
|
|
|
162
|
-
|
|
163
|
-
async def invoke(self, config: HudStyleConfig) -> tuple[Any, bytes, bytes]:
|
|
175
|
+
async def invoke(self, config: FunctionConfig) -> tuple[Any, bytes, bytes]:
|
|
164
176
|
"""
|
|
165
177
|
Invoke a function in the environment.
|
|
166
178
|
"""
|
|
@@ -170,9 +182,8 @@ class RemoteClient(Client):
|
|
|
170
182
|
json=config.model_dump(),
|
|
171
183
|
api_key=settings.api_key,
|
|
172
184
|
)
|
|
173
|
-
|
|
174
|
-
return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])
|
|
175
185
|
|
|
186
|
+
return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])
|
|
176
187
|
|
|
177
188
|
async def close(self) -> None:
|
|
178
189
|
"""
|
hud/env/remote_docker_client.py
CHANGED
|
@@ -5,6 +5,7 @@ from base64 import b64decode, b64encode
|
|
|
5
5
|
from typing import Any
|
|
6
6
|
|
|
7
7
|
from hud.env.docker_client import DockerClient
|
|
8
|
+
from hud.exceptions import HudResponseError
|
|
8
9
|
from hud.server import make_request
|
|
9
10
|
from hud.settings import settings
|
|
10
11
|
from hud.types import EnvironmentStatus
|
|
@@ -39,7 +40,10 @@ class RemoteDockerClient(DockerClient):
|
|
|
39
40
|
metadata: Metadata to associate with the environment
|
|
40
41
|
|
|
41
42
|
Returns:
|
|
42
|
-
|
|
43
|
+
A tuple containing the remote environment client and the build metadata
|
|
44
|
+
|
|
45
|
+
Raises:
|
|
46
|
+
HudResponseError: If the environment creation fails.
|
|
43
47
|
"""
|
|
44
48
|
|
|
45
49
|
# Validate arguments
|
|
@@ -48,7 +52,7 @@ class RemoteDockerClient(DockerClient):
|
|
|
48
52
|
|
|
49
53
|
logger.info("Creating remote environment")
|
|
50
54
|
|
|
51
|
-
true_gym_id = await get_gym_id("
|
|
55
|
+
true_gym_id = await get_gym_id("docker")
|
|
52
56
|
|
|
53
57
|
# augment metadata with dockerfile
|
|
54
58
|
if "environment_config" not in metadata:
|
|
@@ -73,7 +77,13 @@ class RemoteDockerClient(DockerClient):
|
|
|
73
77
|
# Get the environment ID from the response
|
|
74
78
|
env_id = response.get("id")
|
|
75
79
|
if not env_id:
|
|
76
|
-
raise
|
|
80
|
+
raise HudResponseError(
|
|
81
|
+
message=(
|
|
82
|
+
"Failed to create remote environment: No ID returned in API response. "
|
|
83
|
+
"Please contact support if this issue persists."
|
|
84
|
+
),
|
|
85
|
+
response_json=response,
|
|
86
|
+
)
|
|
77
87
|
|
|
78
88
|
# Create the controller instance
|
|
79
89
|
controller = cls(env_id)
|
hud/evaluators/__init__.py
CHANGED
hud/evaluators/base.py
CHANGED
|
@@ -11,21 +11,22 @@ if TYPE_CHECKING:
|
|
|
11
11
|
|
|
12
12
|
class EvaluationResult(BaseModel):
|
|
13
13
|
"""Result of an evaluation.
|
|
14
|
-
|
|
14
|
+
|
|
15
15
|
Attributes:
|
|
16
16
|
score: Float score between 0 and 1
|
|
17
17
|
reason: Explanation of the evaluation
|
|
18
18
|
mode: Mode used for matching, if applicable
|
|
19
19
|
"""
|
|
20
|
-
|
|
20
|
+
|
|
21
21
|
score: float
|
|
22
22
|
reason: str
|
|
23
23
|
mode: str | None = None
|
|
24
24
|
criteria_scores: dict[str, float] | None = Field(default_factory=dict)
|
|
25
25
|
|
|
26
|
+
|
|
26
27
|
class Evaluator(ABC):
|
|
27
28
|
"""Abstract base class for evaluators."""
|
|
28
|
-
|
|
29
|
+
|
|
29
30
|
@abstractmethod
|
|
30
31
|
def evaluate(self, task: Task, response: str) -> EvaluationResult:
|
|
31
32
|
"""Evaluate a task and response."""
|
hud/evaluators/inspect.py
CHANGED
|
@@ -10,20 +10,15 @@ def inspect_evaluate(
|
|
|
10
10
|
answer: Any,
|
|
11
11
|
) -> EvaluationResult:
|
|
12
12
|
"""Evaluate using Inspect-ai's evaluation models.
|
|
13
|
-
|
|
13
|
+
|
|
14
14
|
Args:
|
|
15
15
|
response: The response to evaluate
|
|
16
16
|
answer: The reference answer to compare against
|
|
17
17
|
model_name: The Inspect model to use
|
|
18
18
|
prompt: Optional custom prompt for evaluation
|
|
19
19
|
metrics: Optional list of metrics to evaluate against
|
|
20
|
-
|
|
20
|
+
|
|
21
21
|
Returns:
|
|
22
22
|
EvaluationResult with the evaluation results
|
|
23
23
|
"""
|
|
24
|
-
return EvaluationResult(
|
|
25
|
-
score=0.0,
|
|
26
|
-
reason="Inspect evaluation not implemented",
|
|
27
|
-
mode="inspect"
|
|
28
|
-
)
|
|
29
|
-
|
|
24
|
+
return EvaluationResult(score=0.0, reason="Inspect evaluation not implemented", mode="inspect")
|