hud-python 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +4 -3
- hud/adapters/claude/adapter.py +5 -14
- hud/adapters/common/adapter.py +3 -3
- hud/adapters/common/tests/__init__.py +0 -0
- hud/adapters/common/tests/test_adapter.py +277 -0
- hud/adapters/common/types.py +3 -3
- hud/adapters/operator/adapter.py +16 -23
- hud/agent/__init__.py +8 -1
- hud/agent/base.py +28 -28
- hud/agent/claude.py +69 -60
- hud/agent/langchain.py +32 -26
- hud/agent/operator.py +75 -67
- hud/env/__init__.py +5 -5
- hud/env/client.py +2 -2
- hud/env/docker_client.py +37 -39
- hud/env/environment.py +91 -66
- hud/env/local_docker_client.py +5 -7
- hud/env/remote_client.py +39 -32
- hud/env/remote_docker_client.py +13 -3
- hud/evaluators/__init__.py +2 -3
- hud/evaluators/base.py +4 -3
- hud/evaluators/inspect.py +3 -8
- hud/evaluators/judge.py +34 -58
- hud/evaluators/match.py +42 -49
- hud/evaluators/remote.py +13 -26
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +12 -0
- hud/evaluators/tests/test_judge.py +231 -0
- hud/evaluators/tests/test_match.py +115 -0
- hud/evaluators/tests/test_remote.py +98 -0
- hud/exceptions.py +167 -0
- hud/gym.py +9 -7
- hud/job.py +179 -109
- hud/server/__init__.py +2 -2
- hud/server/requests.py +148 -186
- hud/server/tests/__init__.py +0 -0
- hud/server/tests/test_requests.py +275 -0
- hud/settings.py +3 -2
- hud/task.py +9 -19
- hud/taskset.py +44 -11
- hud/trajectory.py +6 -9
- hud/types.py +12 -9
- hud/utils/__init__.py +2 -2
- hud/utils/common.py +36 -15
- hud/utils/config.py +45 -30
- hud/utils/progress.py +34 -21
- hud/utils/telemetry.py +10 -11
- hud/utils/tests/__init__.py +0 -0
- hud/utils/tests/test_common.py +52 -0
- hud/utils/tests/test_config.py +129 -0
- hud/utils/tests/test_progress.py +225 -0
- hud/utils/tests/test_telemetry.py +37 -0
- hud/utils/tests/test_version.py +8 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/METADATA +9 -6
- hud_python-0.2.4.dist-info/RECORD +62 -0
- hud_python-0.2.2.dist-info/RECORD +0 -46
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/WHEEL +0 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/licenses/LICENSE +0 -0
hud/env/__init__.py
CHANGED
|
@@ -3,9 +3,9 @@ from __future__ import annotations
|
|
|
3
3
|
from . import docker_client, environment, local_docker_client, remote_client, remote_docker_client
|
|
4
4
|
|
|
5
5
|
__all__ = [
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
6
|
+
"docker_client",
|
|
7
|
+
"environment",
|
|
8
|
+
"local_docker_client",
|
|
9
|
+
"remote_client",
|
|
10
|
+
"remote_docker_client",
|
|
11
11
|
]
|
hud/env/client.py
CHANGED
|
@@ -7,7 +7,7 @@ from pydantic import BaseModel
|
|
|
7
7
|
|
|
8
8
|
if TYPE_CHECKING:
|
|
9
9
|
from hud.types import EnvironmentStatus
|
|
10
|
-
from hud.utils.config import
|
|
10
|
+
from hud.utils.config import FunctionConfig
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class Client(BaseModel, ABC):
|
|
@@ -16,7 +16,7 @@ class Client(BaseModel, ABC):
|
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
18
|
@abstractmethod
|
|
19
|
-
async def invoke(self, config:
|
|
19
|
+
async def invoke(self, config: FunctionConfig) -> Any:
|
|
20
20
|
"""
|
|
21
21
|
Invoke the environment with the given config.
|
|
22
22
|
"""
|
hud/env/docker_client.py
CHANGED
|
@@ -16,7 +16,7 @@ from hud.utils.common import directory_to_tar_bytes
|
|
|
16
16
|
|
|
17
17
|
if TYPE_CHECKING:
|
|
18
18
|
from hud.utils import ExecuteResult
|
|
19
|
-
from hud.utils.config import
|
|
19
|
+
from hud.utils.config import FunctionConfig
|
|
20
20
|
|
|
21
21
|
logger = logging.getLogger("hud.env.docker_client")
|
|
22
22
|
|
|
@@ -33,7 +33,7 @@ class InvokeError(Exception):
|
|
|
33
33
|
"""
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
def invoke_template(config:
|
|
36
|
+
def invoke_template(config: FunctionConfig, package_name: str, divider: str) -> str:
|
|
37
37
|
"""
|
|
38
38
|
Return a python script to run the given config.
|
|
39
39
|
"""
|
|
@@ -51,16 +51,17 @@ print("{divider}")
|
|
|
51
51
|
print(result_str)
|
|
52
52
|
"""
|
|
53
53
|
|
|
54
|
+
|
|
54
55
|
class DockerClient(Client):
|
|
55
56
|
"""
|
|
56
57
|
Base class for environment clients.
|
|
57
|
-
|
|
58
|
+
|
|
58
59
|
Handles updating the environment when local files change.
|
|
59
60
|
"""
|
|
60
|
-
|
|
61
|
+
|
|
61
62
|
_last_pyproject_toml_str: str | None = None
|
|
62
63
|
_last_update_time: int = 0
|
|
63
|
-
_last_file_mtimes: dict[str, float] = {}
|
|
64
|
+
_last_file_mtimes: dict[str, float] = {} # noqa: RUF012 - Not recognized as Pydantic model
|
|
64
65
|
_source_path: Path | None = None
|
|
65
66
|
_package_name: str | None = None
|
|
66
67
|
|
|
@@ -68,47 +69,46 @@ class DockerClient(Client):
|
|
|
68
69
|
def source_path(self) -> Path | None:
|
|
69
70
|
"""Get the source path."""
|
|
70
71
|
return self._source_path
|
|
71
|
-
|
|
72
|
+
|
|
72
73
|
@property
|
|
73
74
|
def package_name(self) -> str:
|
|
74
75
|
"""Get the package name."""
|
|
75
76
|
if not self._package_name:
|
|
76
77
|
raise ValueError("Package name not set")
|
|
77
78
|
return self._package_name
|
|
78
|
-
|
|
79
79
|
|
|
80
80
|
def set_source_path(self, source_path: Path) -> None:
|
|
81
81
|
"""
|
|
82
82
|
Set the source path for this environment controller.
|
|
83
83
|
Can only be set once, and cannot be set if source_path is already set.
|
|
84
|
-
|
|
84
|
+
|
|
85
85
|
Args:
|
|
86
86
|
source_path: Path to the source code to use in the environment
|
|
87
|
-
|
|
87
|
+
|
|
88
88
|
Raises:
|
|
89
89
|
ValueError: If source_path has already been set
|
|
90
90
|
"""
|
|
91
91
|
if self._source_path:
|
|
92
92
|
raise ValueError("Source path has already been set")
|
|
93
|
-
|
|
93
|
+
|
|
94
94
|
# Validate source path
|
|
95
95
|
if not source_path.exists():
|
|
96
96
|
raise FileNotFoundError(f"Source path {source_path} does not exist")
|
|
97
97
|
if not source_path.is_dir():
|
|
98
98
|
raise NotADirectoryError(f"Source path {source_path} is not a directory")
|
|
99
|
-
|
|
99
|
+
|
|
100
100
|
# Parse pyproject.toml to get package name
|
|
101
101
|
pyproject_path = source_path / "pyproject.toml"
|
|
102
102
|
if not pyproject_path.exists():
|
|
103
103
|
raise FileNotFoundError(f"pyproject.toml not found in {source_path}")
|
|
104
|
-
|
|
104
|
+
|
|
105
105
|
pyproject_data = toml.load(pyproject_path)
|
|
106
106
|
self._package_name = pyproject_data.get("project", {}).get("name")
|
|
107
107
|
if not self._package_name:
|
|
108
108
|
raise ValueError("Could not find package name in pyproject.toml")
|
|
109
|
-
|
|
109
|
+
|
|
110
110
|
self._source_path = source_path
|
|
111
|
-
|
|
111
|
+
|
|
112
112
|
@classmethod
|
|
113
113
|
@abc.abstractmethod
|
|
114
114
|
async def create(cls, dockerfile: str) -> DockerClient:
|
|
@@ -121,26 +121,26 @@ class DockerClient(Client):
|
|
|
121
121
|
Returns:
|
|
122
122
|
EnvClient: An instance of the environment client
|
|
123
123
|
"""
|
|
124
|
-
|
|
124
|
+
|
|
125
125
|
@abc.abstractmethod
|
|
126
126
|
async def get_status(self) -> EnvironmentStatus:
|
|
127
127
|
"""
|
|
128
128
|
Get the current status of the environment.
|
|
129
|
-
|
|
129
|
+
|
|
130
130
|
Returns:
|
|
131
131
|
EnvironmentStatus: A status enum indicating the current state of the environment
|
|
132
132
|
"""
|
|
133
|
-
|
|
133
|
+
|
|
134
134
|
def _get_all_file_mtimes(self) -> dict[str, float]:
|
|
135
135
|
"""
|
|
136
136
|
Get modification times for all files in the source path.
|
|
137
|
-
|
|
137
|
+
|
|
138
138
|
Returns:
|
|
139
139
|
Dict[str, float]: Dictionary mapping file paths to modification times
|
|
140
140
|
"""
|
|
141
141
|
if not self._source_path:
|
|
142
142
|
return {}
|
|
143
|
-
|
|
143
|
+
|
|
144
144
|
file_mtimes = {}
|
|
145
145
|
for root, _, files in os.walk(self._source_path):
|
|
146
146
|
for file in files:
|
|
@@ -151,12 +151,12 @@ class DockerClient(Client):
|
|
|
151
151
|
# Skip files that can't be accessed
|
|
152
152
|
continue
|
|
153
153
|
return file_mtimes
|
|
154
|
-
|
|
154
|
+
|
|
155
155
|
async def needs_update(self) -> bool:
|
|
156
156
|
"""
|
|
157
157
|
Check if the environment needs an update by:
|
|
158
158
|
1. Checking if any file has been modified since the last update
|
|
159
|
-
|
|
159
|
+
|
|
160
160
|
Returns:
|
|
161
161
|
bool: True if the environment needs an update, False otherwise.
|
|
162
162
|
"""
|
|
@@ -166,18 +166,18 @@ class DockerClient(Client):
|
|
|
166
166
|
|
|
167
167
|
# Check if any file has been modified since the last update
|
|
168
168
|
current_mtimes = self._get_all_file_mtimes()
|
|
169
|
-
|
|
169
|
+
|
|
170
170
|
# If we don't have previous modification times, we need an update
|
|
171
171
|
if not self._last_file_mtimes:
|
|
172
172
|
return True
|
|
173
|
-
|
|
173
|
+
|
|
174
174
|
# Check for new or modified files
|
|
175
175
|
for file_path, mtime in current_mtimes.items():
|
|
176
176
|
if file_path not in self._last_file_mtimes or mtime > self._last_file_mtimes[file_path]:
|
|
177
177
|
return True
|
|
178
|
-
|
|
178
|
+
|
|
179
179
|
return False
|
|
180
|
-
|
|
180
|
+
|
|
181
181
|
async def update(self) -> None:
|
|
182
182
|
"""
|
|
183
183
|
Base update method for environment controllers.
|
|
@@ -186,22 +186,22 @@ class DockerClient(Client):
|
|
|
186
186
|
# If no source path, nothing to update
|
|
187
187
|
if not self._source_path:
|
|
188
188
|
return
|
|
189
|
-
|
|
189
|
+
|
|
190
190
|
logger.info("Updating environment")
|
|
191
191
|
|
|
192
192
|
# Save current file modification times
|
|
193
193
|
self._last_file_mtimes = self._get_all_file_mtimes()
|
|
194
|
-
|
|
194
|
+
|
|
195
195
|
# Create tar archive of the source code and send it to the container
|
|
196
196
|
tar_bytes = directory_to_tar_bytes(self._source_path)
|
|
197
197
|
await self.execute(["mkdir", "-p", "/root/controller"], timeout=5)
|
|
198
198
|
await self.put_archive("/root/controller", tar_bytes)
|
|
199
|
-
|
|
199
|
+
|
|
200
200
|
# Check if pyproject.toml exists and parse it
|
|
201
201
|
pyproject_path = self._source_path / "pyproject.toml"
|
|
202
202
|
if not pyproject_path.exists():
|
|
203
203
|
raise FileNotFoundError(f"pyproject.toml not found in {self._source_path}")
|
|
204
|
-
|
|
204
|
+
|
|
205
205
|
# Read and parse the current content of pyproject.toml
|
|
206
206
|
current_pyproject_content = pyproject_path.read_text()
|
|
207
207
|
if (
|
|
@@ -224,8 +224,7 @@ class DockerClient(Client):
|
|
|
224
224
|
logger.warning("STDERR:\n%s", result["stderr"])
|
|
225
225
|
# Save current pyproject.toml content
|
|
226
226
|
self._last_pyproject_toml_str = current_pyproject_content
|
|
227
|
-
|
|
228
|
-
|
|
227
|
+
|
|
229
228
|
@abc.abstractmethod
|
|
230
229
|
async def execute(
|
|
231
230
|
self,
|
|
@@ -235,20 +234,20 @@ class DockerClient(Client):
|
|
|
235
234
|
) -> ExecuteResult:
|
|
236
235
|
"""
|
|
237
236
|
Execute a command in the environment. May not be supported by all environments.
|
|
238
|
-
|
|
237
|
+
|
|
239
238
|
Args:
|
|
240
239
|
command: The command to execute
|
|
241
240
|
workdir: The working directory to execute the command in
|
|
242
241
|
timeout: The timeout for the command
|
|
243
|
-
|
|
242
|
+
|
|
244
243
|
Returns:
|
|
245
244
|
ExecuteResult: The result of the command
|
|
246
245
|
"""
|
|
247
|
-
|
|
248
|
-
async def invoke(self, config:
|
|
246
|
+
|
|
247
|
+
async def invoke(self, config: FunctionConfig) -> tuple[Any, bytes, bytes]:
|
|
249
248
|
"""
|
|
250
249
|
Invoke a function in the environment. Supported by all environments.
|
|
251
|
-
|
|
250
|
+
|
|
252
251
|
Args:
|
|
253
252
|
config: The configuration to invoke
|
|
254
253
|
|
|
@@ -289,11 +288,11 @@ class DockerClient(Client):
|
|
|
289
288
|
May not be supported by all environments. (notably browser environments)
|
|
290
289
|
Args:
|
|
291
290
|
path: The path to get the archive of
|
|
292
|
-
|
|
291
|
+
|
|
293
292
|
Returns:
|
|
294
293
|
bytes: The archive of the path
|
|
295
294
|
"""
|
|
296
|
-
|
|
295
|
+
|
|
297
296
|
@abc.abstractmethod
|
|
298
297
|
async def put_archive(self, path: str, data: bytes) -> bool:
|
|
299
298
|
"""
|
|
@@ -303,4 +302,3 @@ class DockerClient(Client):
|
|
|
303
302
|
path: The path to put the archive at
|
|
304
303
|
data: The data to put in the archive
|
|
305
304
|
"""
|
|
306
|
-
|
hud/env/environment.py
CHANGED
|
@@ -10,25 +10,21 @@ from pydantic import BaseModel
|
|
|
10
10
|
from hud.env.client import Client
|
|
11
11
|
from hud.env.remote_client import RemoteClient
|
|
12
12
|
from hud.task import Task
|
|
13
|
-
from hud.utils.common import
|
|
14
|
-
from hud.utils.config import
|
|
13
|
+
from hud.utils.common import FunctionConfig, FunctionConfigs, Observation
|
|
14
|
+
from hud.utils.config import (
|
|
15
|
+
LOCAL_EVALUATORS,
|
|
16
|
+
REMOTE_EVALUATE,
|
|
17
|
+
REMOTE_FUNCTION_PREFIX,
|
|
18
|
+
REMOTE_SETUP,
|
|
19
|
+
expand_config,
|
|
20
|
+
)
|
|
21
|
+
from hud.utils.telemetry import stream
|
|
15
22
|
|
|
16
23
|
logger = logging.getLogger("hud.environment")
|
|
17
24
|
|
|
18
25
|
if TYPE_CHECKING:
|
|
19
26
|
from hud.adapters.common import CLA
|
|
20
|
-
|
|
21
|
-
class Observation(BaseModel):
|
|
22
|
-
"""
|
|
23
|
-
Observation from the environment.
|
|
24
|
-
|
|
25
|
-
Attributes:
|
|
26
|
-
screenshot: Base64 encoded PNG string of the screen
|
|
27
|
-
text: Text observation, if available
|
|
28
|
-
"""
|
|
29
|
-
|
|
30
|
-
screenshot: str | None = None # base64 string png
|
|
31
|
-
text: str | None = None
|
|
27
|
+
from hud.agent import Agent
|
|
32
28
|
|
|
33
29
|
|
|
34
30
|
class Environment(BaseModel):
|
|
@@ -48,7 +44,7 @@ class Environment(BaseModel):
|
|
|
48
44
|
# final response
|
|
49
45
|
final_response: str | None = None
|
|
50
46
|
|
|
51
|
-
async def _invoke_all(self, configs:
|
|
47
|
+
async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]:
|
|
52
48
|
# Execute each config and collect results
|
|
53
49
|
configs_all = [configs] if not isinstance(configs, list) else configs
|
|
54
50
|
results = []
|
|
@@ -69,8 +65,8 @@ class Environment(BaseModel):
|
|
|
69
65
|
stderr.decode(),
|
|
70
66
|
)
|
|
71
67
|
return results
|
|
72
|
-
|
|
73
|
-
async def _setup(self, config:
|
|
68
|
+
|
|
69
|
+
async def _setup(self, config: FunctionConfigs | None = None) -> None:
|
|
74
70
|
"""
|
|
75
71
|
Setup the environment.
|
|
76
72
|
|
|
@@ -87,7 +83,7 @@ class Environment(BaseModel):
|
|
|
87
83
|
else:
|
|
88
84
|
raise ValueError("No config or task provided for local environment")
|
|
89
85
|
|
|
90
|
-
async def evaluate(self, config:
|
|
86
|
+
async def evaluate(self, config: FunctionConfigs | None = None) -> Any:
|
|
91
87
|
"""
|
|
92
88
|
Evaluate the environment.
|
|
93
89
|
|
|
@@ -98,8 +94,7 @@ class Environment(BaseModel):
|
|
|
98
94
|
Any: Result of the evaluation
|
|
99
95
|
"""
|
|
100
96
|
if isinstance(self.client, RemoteClient):
|
|
101
|
-
results = await self._invoke_all(
|
|
102
|
-
create_remote_config(self, config, REMOTE_EVALUATE))
|
|
97
|
+
results = await self._invoke_all(create_remote_config(self, config, REMOTE_EVALUATE))
|
|
103
98
|
else:
|
|
104
99
|
if config is not None:
|
|
105
100
|
results = await self._invoke_all(config)
|
|
@@ -111,11 +106,10 @@ class Environment(BaseModel):
|
|
|
111
106
|
return results[0]
|
|
112
107
|
else:
|
|
113
108
|
return results
|
|
114
|
-
|
|
115
109
|
|
|
116
|
-
async def reset(
|
|
117
|
-
|
|
118
|
-
]:
|
|
110
|
+
async def reset(
|
|
111
|
+
self, configs: FunctionConfigs | None = None
|
|
112
|
+
) -> tuple[Observation, dict[str, Any]]:
|
|
119
113
|
"""
|
|
120
114
|
Reset the environment.
|
|
121
115
|
|
|
@@ -126,15 +120,15 @@ class Environment(BaseModel):
|
|
|
126
120
|
Observation: The first observation from the environment
|
|
127
121
|
info: Dictionary of information about the environment
|
|
128
122
|
"""
|
|
129
|
-
#await self._setup(configs)
|
|
123
|
+
# await self._setup(configs)
|
|
130
124
|
obs, _, _, info = await self.step()
|
|
131
125
|
if self.task and self.task.prompt:
|
|
132
126
|
obs.text = self.task.prompt
|
|
133
127
|
return obs, info
|
|
134
128
|
|
|
135
|
-
async def step(
|
|
136
|
-
|
|
137
|
-
]:
|
|
129
|
+
async def step(
|
|
130
|
+
self, actions: CLA | list[CLA] | None = None
|
|
131
|
+
) -> tuple[Observation, float, bool, dict[str, Any]]:
|
|
138
132
|
"""Execute a step in the environment.
|
|
139
133
|
|
|
140
134
|
Args:
|
|
@@ -143,6 +137,8 @@ class Environment(BaseModel):
|
|
|
143
137
|
Returns:
|
|
144
138
|
Any: Result of the step execution
|
|
145
139
|
"""
|
|
140
|
+
if not isinstance(actions, list) and actions is not None:
|
|
141
|
+
actions = [actions]
|
|
146
142
|
if actions is None or len(actions) == 0:
|
|
147
143
|
actions = []
|
|
148
144
|
args = [[action.model_dump() for action in actions]]
|
|
@@ -150,20 +146,19 @@ class Environment(BaseModel):
|
|
|
150
146
|
# TODO: Move this into the server side
|
|
151
147
|
if self._maybe_store_response(actions):
|
|
152
148
|
return Observation(text=self.final_response), 0, False, {}
|
|
153
|
-
|
|
149
|
+
|
|
154
150
|
result, stdout, stderr = await self.client.invoke(
|
|
155
|
-
|
|
151
|
+
FunctionConfig(function="step", args=args)
|
|
156
152
|
)
|
|
157
153
|
if stdout:
|
|
158
154
|
logger.info("Step produced stdout: %s", stdout.decode())
|
|
159
155
|
if stderr:
|
|
160
156
|
logger.warning("Step produced stderr: %s", stderr.decode())
|
|
161
157
|
|
|
162
|
-
|
|
163
158
|
observation = Observation.model_validate(result["observation"], strict=True)
|
|
164
159
|
|
|
165
160
|
return observation, 0, False, {}
|
|
166
|
-
|
|
161
|
+
|
|
167
162
|
def _maybe_store_response(self, actions: list[CLA]) -> bool:
|
|
168
163
|
"""Store the final response into the environment.
|
|
169
164
|
|
|
@@ -178,14 +173,13 @@ class Environment(BaseModel):
|
|
|
178
173
|
return True
|
|
179
174
|
return False
|
|
180
175
|
|
|
181
|
-
|
|
182
176
|
async def get_urls(self) -> dict[str, Any]:
|
|
183
177
|
"""Get URLs for the environment.
|
|
184
178
|
|
|
185
179
|
Returns:
|
|
186
180
|
dict: Dictionary of URLs for accessing the environment
|
|
187
181
|
"""
|
|
188
|
-
data, _, _ = await self.client.invoke(
|
|
182
|
+
data, _, _ = await self.client.invoke(FunctionConfig(function="get_urls", args=[]))
|
|
189
183
|
|
|
190
184
|
self.url = data.get("url")
|
|
191
185
|
self.live_url = data.get("live_url")
|
|
@@ -202,11 +196,43 @@ class Environment(BaseModel):
|
|
|
202
196
|
"""
|
|
203
197
|
await self.client.close()
|
|
204
198
|
|
|
199
|
+
async def stream(self) -> str | None:
|
|
200
|
+
urls = await self.get_urls()
|
|
201
|
+
if urls["live_url"] is None:
|
|
202
|
+
logger.warning("No live URL found")
|
|
203
|
+
return None
|
|
204
|
+
# Stream the live view
|
|
205
|
+
return stream(urls["live_url"])
|
|
206
|
+
|
|
207
|
+
async def run(self, agent: Agent, max_steps: int = 27, verbose: bool = True) -> Any:
|
|
208
|
+
"""Run an agent in the environment.
|
|
209
|
+
|
|
210
|
+
Args:
|
|
211
|
+
agent: The agent to run
|
|
212
|
+
"""
|
|
213
|
+
if verbose:
|
|
214
|
+
logger.info("[HUD] Running agent in environment...")
|
|
215
|
+
obs, _ = await self.reset()
|
|
216
|
+
for i in range(max_steps):
|
|
217
|
+
action, done = await agent.predict(obs)
|
|
218
|
+
if verbose:
|
|
219
|
+
logger.info("[HUD] Step %d: Action: %s", i, action)
|
|
220
|
+
obs, reward, terminated, info = await self.step(action)
|
|
221
|
+
if verbose:
|
|
222
|
+
logger.info("[HUD] Step %d: Observation: %s", i, obs)
|
|
223
|
+
if done or terminated:
|
|
224
|
+
break
|
|
225
|
+
result = await self.evaluate()
|
|
226
|
+
if verbose:
|
|
227
|
+
logger.info("[HUD] Evaluation result: %s", result)
|
|
228
|
+
return result
|
|
229
|
+
|
|
230
|
+
|
|
205
231
|
def create_remote_config(
|
|
206
232
|
env: Environment | None = None,
|
|
207
|
-
config:
|
|
233
|
+
config: FunctionConfigs | None = None,
|
|
208
234
|
function: str | None = None,
|
|
209
|
-
) -> list[
|
|
235
|
+
) -> list[FunctionConfig]:
|
|
210
236
|
"""
|
|
211
237
|
Create a remote configuration for setup or evaluate, determining the final
|
|
212
238
|
function call structure based on the provided task or explicit config.
|
|
@@ -218,11 +244,11 @@ def create_remote_config(
|
|
|
218
244
|
env: Environment object, potentially containing a task definition.
|
|
219
245
|
Used to access `env.task` and `env.final_response`.
|
|
220
246
|
config: Direct configuration override (e.g., passed to `env.evaluate(config=...)`).
|
|
221
|
-
Can be in various
|
|
247
|
+
Can be in various FunctionConfigs formats.
|
|
222
248
|
function: The top-level function context, typically "setup" or "evaluate".
|
|
223
249
|
|
|
224
250
|
Returns:
|
|
225
|
-
list[
|
|
251
|
+
list[FunctionConfig]: A list containing a single FunctionConfig object
|
|
226
252
|
ready for remote invocation via `client.invoke`.
|
|
227
253
|
The specific function/arguments are chosen based on this priority:
|
|
228
254
|
1. Explicit `config` parameter (if provided).
|
|
@@ -242,8 +268,8 @@ def create_remote_config(
|
|
|
242
268
|
`config=("contains_text", "Paris")`
|
|
243
269
|
`function="evaluate"`
|
|
244
270
|
- Example Output:
|
|
245
|
-
`[
|
|
246
|
-
|
|
271
|
+
`[FunctionConfig(function='evaluate', args=[
|
|
272
|
+
FunctionConfig(function='contains_text', args=['Paris', 'Paris'])
|
|
247
273
|
])]`
|
|
248
274
|
|
|
249
275
|
2) No explicit `config`, Task has the attribute (e.g., `task.evaluate`):
|
|
@@ -255,7 +281,7 @@ def create_remote_config(
|
|
|
255
281
|
`config=None`
|
|
256
282
|
`function="evaluate"`
|
|
257
283
|
- Example Output:
|
|
258
|
-
`[
|
|
284
|
+
`[FunctionConfig(function='evaluate', args=[FunctionConfig(function='check_answer',
|
|
259
285
|
args=['42'], id='t1')])]`
|
|
260
286
|
|
|
261
287
|
3) No explicit `config`, no specific Task attribute, Task has `task.config`:
|
|
@@ -267,7 +293,7 @@ def create_remote_config(
|
|
|
267
293
|
`config=None`
|
|
268
294
|
`function="evaluate"`
|
|
269
295
|
- Example Output:
|
|
270
|
-
`[
|
|
296
|
+
`[FunctionConfig(function='evaluate', args=[{"expected": "val", "id": "t2"}])]`
|
|
271
297
|
|
|
272
298
|
4) No explicit `config`, no specific Task attribute, no `task.config`, Task has `task.id`:
|
|
273
299
|
Calls a private function (`private_<function>`) on the remote end, passing
|
|
@@ -277,7 +303,7 @@ def create_remote_config(
|
|
|
277
303
|
`config=None`
|
|
278
304
|
`function="evaluate"`
|
|
279
305
|
- Example Output:
|
|
280
|
-
`[
|
|
306
|
+
`[FunctionConfig(function='private_evaluate', args=['t3'])]`
|
|
281
307
|
|
|
282
308
|
5) No explicit `config` and no relevant Task info:
|
|
283
309
|
Calls the top-level `function` with empty args.
|
|
@@ -286,50 +312,50 @@ def create_remote_config(
|
|
|
286
312
|
`config=None`
|
|
287
313
|
`function="evaluate"`
|
|
288
314
|
- Example Output:
|
|
289
|
-
`[
|
|
315
|
+
`[FunctionConfig(function='evaluate', args=[])]`
|
|
290
316
|
"""
|
|
291
317
|
# If no function provided, just expand the config and return it directly
|
|
292
318
|
if function is None:
|
|
293
319
|
if config:
|
|
294
320
|
return expand_config(config)
|
|
295
321
|
raise ValueError("Either function or config must be provided")
|
|
296
|
-
|
|
322
|
+
|
|
297
323
|
# Case 1: Explicit config provided
|
|
298
324
|
if config:
|
|
299
325
|
expanded_configs = expand_config(config)
|
|
300
|
-
if env and env.final_response:
|
|
326
|
+
if env and env.final_response and expanded_configs[0].args[0] in LOCAL_EVALUATORS:
|
|
301
327
|
# Ensure args is a list before appending
|
|
302
328
|
if not isinstance(expanded_configs[0].args, list):
|
|
303
|
-
|
|
304
|
-
expanded_configs[0].args.append(env.final_response)
|
|
305
|
-
return [
|
|
306
|
-
|
|
329
|
+
expanded_configs[0].args = [expanded_configs[0].args]
|
|
330
|
+
expanded_configs[0].args.append(env.final_response) # for remote responses
|
|
331
|
+
return [FunctionConfig(function=function, args=expanded_configs)]
|
|
332
|
+
|
|
307
333
|
# Otherwise, use the environment's task
|
|
308
334
|
task = env.task if env else None
|
|
309
|
-
|
|
335
|
+
|
|
310
336
|
# Must have a task for the remaining cases
|
|
311
337
|
if task is None:
|
|
312
338
|
raise ValueError("Either task or config must be provided")
|
|
313
|
-
|
|
339
|
+
|
|
314
340
|
# Case 2: Task has the specified function attribute
|
|
315
341
|
task_config = getattr(task, function, None)
|
|
316
342
|
if task_config:
|
|
317
343
|
expanded_configs = expand_config(task_config)
|
|
318
344
|
if task.id:
|
|
319
|
-
expanded_configs[0].id = task.id
|
|
320
|
-
|
|
345
|
+
expanded_configs[0].id = task.id # for remote IDs
|
|
346
|
+
if env and env.final_response and expanded_configs[0].function in LOCAL_EVALUATORS:
|
|
321
347
|
# Ensure args is a list before appending
|
|
322
348
|
if not isinstance(expanded_configs[0].args, list):
|
|
323
|
-
|
|
324
|
-
expanded_configs[0].args.append(env.final_response)
|
|
325
|
-
return [
|
|
326
|
-
|
|
349
|
+
expanded_configs[0].args = [expanded_configs[0].args]
|
|
350
|
+
expanded_configs[0].args.append(env.final_response) # for remote responses
|
|
351
|
+
return [FunctionConfig(function=function, args=expanded_configs)]
|
|
352
|
+
|
|
327
353
|
# Case 3: Check for task.config
|
|
328
354
|
if hasattr(task, "config") and task.config:
|
|
329
355
|
# Ensure task.config is a dictionary before adding id
|
|
330
356
|
final_args = task.config.copy() if isinstance(task.config, dict) else {}
|
|
331
357
|
if task.id:
|
|
332
|
-
final_args["id"] = task.id
|
|
358
|
+
final_args["id"] = task.id # for remote IDs
|
|
333
359
|
if env and env.final_response:
|
|
334
360
|
# Append response, ensuring args exists and is a list
|
|
335
361
|
if "args" not in final_args:
|
|
@@ -337,18 +363,17 @@ def create_remote_config(
|
|
|
337
363
|
if not isinstance(final_args["args"], list):
|
|
338
364
|
final_args["args"] = [final_args["args"]]
|
|
339
365
|
final_args["args"].append(env.final_response)
|
|
340
|
-
return [
|
|
341
|
-
|
|
366
|
+
return [FunctionConfig(function=function, args=[final_args])]
|
|
367
|
+
|
|
342
368
|
# Case 4: Use task.id
|
|
343
369
|
if task.id:
|
|
344
370
|
args_list = [task.id]
|
|
345
371
|
if env and env.final_response:
|
|
346
|
-
|
|
347
|
-
return [
|
|
348
|
-
|
|
372
|
+
args_list.append(env.final_response) # Append final response
|
|
373
|
+
return [FunctionConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=args_list)]
|
|
374
|
+
|
|
349
375
|
# Case 5: No valid configuration found
|
|
350
376
|
args_list = []
|
|
351
377
|
if env and env.final_response:
|
|
352
378
|
args_list.append(env.final_response)
|
|
353
|
-
return [
|
|
354
|
-
|
|
379
|
+
return [FunctionConfig(function=function, args=args_list)]
|
hud/env/local_docker_client.py
CHANGED
|
@@ -19,15 +19,16 @@ if TYPE_CHECKING:
|
|
|
19
19
|
|
|
20
20
|
logger = logging.getLogger("hud.env.docker_env_client")
|
|
21
21
|
|
|
22
|
+
|
|
22
23
|
class LocalDockerClient(DockerClient):
|
|
23
24
|
"""
|
|
24
25
|
Docker-based environment client implementation.
|
|
25
26
|
"""
|
|
26
27
|
|
|
27
28
|
@classmethod
|
|
28
|
-
async def create(
|
|
29
|
-
|
|
30
|
-
|
|
29
|
+
async def create(
|
|
30
|
+
cls, dockerfile: str, ports: list[int] | None = None
|
|
31
|
+
) -> tuple[LocalDockerClient, dict[str, Any]]:
|
|
31
32
|
"""
|
|
32
33
|
Creates a Docker environment client from a dockerfile.
|
|
33
34
|
|
|
@@ -86,9 +87,7 @@ class LocalDockerClient(DockerClient):
|
|
|
86
87
|
"HostConfig": {
|
|
87
88
|
"PublishAllPorts": True,
|
|
88
89
|
},
|
|
89
|
-
"ExposedPorts": {
|
|
90
|
-
f"{port}/tcp": {} for port in ports
|
|
91
|
-
},
|
|
90
|
+
"ExposedPorts": {f"{port}/tcp": {} for port in ports},
|
|
92
91
|
}
|
|
93
92
|
|
|
94
93
|
container = await docker_client.containers.create(config=container_config)
|
|
@@ -198,7 +197,6 @@ class LocalDockerClient(DockerClient):
|
|
|
198
197
|
exit_code=0,
|
|
199
198
|
)
|
|
200
199
|
|
|
201
|
-
|
|
202
200
|
async def get_archive(self, path: str) -> bytes:
|
|
203
201
|
"""
|
|
204
202
|
Get an archive of a path from the container.
|