hud-python 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (46)
  1. hud/__init__.py +16 -12
  2. hud/adapters/__init__.py +4 -2
  3. hud/adapters/claude/adapter.py +9 -2
  4. hud/adapters/common/adapter.py +11 -10
  5. hud/adapters/common/types.py +34 -13
  6. hud/adapters/operator/__init__.py +5 -0
  7. hud/adapters/operator/adapter.py +97 -0
  8. hud/agent/__init__.py +7 -0
  9. hud/agent/base.py +109 -0
  10. hud/agent/claude.py +207 -0
  11. hud/agent/operator.py +208 -0
  12. hud/env/__init__.py +11 -0
  13. hud/env/client.py +35 -0
  14. hud/env/docker_client.py +306 -0
  15. hud/env/environment.py +354 -0
  16. hud/env/local_docker_client.py +251 -0
  17. hud/env/remote_client.py +185 -0
  18. hud/env/remote_docker_client.py +221 -0
  19. hud/evaluators/__init__.py +10 -0
  20. hud/evaluators/base.py +31 -0
  21. hud/evaluators/inspect.py +29 -0
  22. hud/evaluators/judge.py +213 -0
  23. hud/evaluators/match.py +163 -0
  24. hud/evaluators/remote.py +78 -0
  25. hud/gym.py +101 -15
  26. hud/job.py +185 -0
  27. hud/server/__init__.py +2 -2
  28. hud/server/requests.py +87 -0
  29. hud/settings.py +13 -2
  30. hud/task.py +144 -0
  31. hud/taskset.py +103 -0
  32. hud/trajectory.py +90 -0
  33. hud/types.py +65 -0
  34. hud/utils/__init__.py +4 -2
  35. hud/utils/common.py +96 -0
  36. hud/utils/config.py +91 -4
  37. hud/utils/telemetry.py +67 -0
  38. hud_python-0.2.1.dist-info/METADATA +181 -0
  39. hud_python-0.2.1.dist-info/RECORD +44 -0
  40. {hud_python-0.1.5.dist-info → hud_python-0.2.1.dist-info}/licenses/LICENSE +1 -1
  41. hud/client.py +0 -200
  42. hud/environment.py +0 -318
  43. hud/run.py +0 -208
  44. hud_python-0.1.5.dist-info/METADATA +0 -125
  45. hud_python-0.1.5.dist-info/RECORD +0 -21
  46. {hud_python-0.1.5.dist-info → hud_python-0.2.1.dist-info}/WHEEL +0 -0
hud/env/environment.py ADDED
@@ -0,0 +1,354 @@
1
+ """Base classes for environment implementations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ from pydantic import BaseModel
9
+
10
+ from hud.env.client import Client
11
+ from hud.env.remote_client import RemoteClient
12
+ from hud.task import Task
13
+ from hud.utils.common import HudStyleConfig, HudStyleConfigs
14
+ from hud.utils.config import REMOTE_EVALUATE, REMOTE_FUNCTION_PREFIX, REMOTE_SETUP, expand_config
15
+
16
+ logger = logging.getLogger("hud.environment")
17
+
18
+ if TYPE_CHECKING:
19
+ from hud.adapters.common import CLA
20
+
21
class Observation(BaseModel):
    """
    A single observation emitted by an environment.

    Both fields are optional and default to ``None``.

    Attributes:
        screenshot: Screen capture as a base64-encoded PNG string.
        text: Textual observation, when one is available.
    """

    # Base64-encoded PNG of the screen.
    screenshot: str | None = None
    # Free-form text observation.
    text: str | None = None
32
+
33
+
34
class Environment(BaseModel):
    """
    Environment base class shared by all environment implementations.

    It layers setup, evaluation, stepping and teardown on top of the two
    primitives used from ``self.client``: ``invoke`` and ``close``.

    Attributes:
        metadata: Arbitrary metadata describing the environment.
        client: Client used to invoke functions inside the environment.
        url: Environment URL, populated by ``get_urls``.
        live_url: Live-view URL, populated by ``get_urls``.
        task: Task associated with this environment, if any.
        build_data: Data produced while building the environment.
        final_response: Text of the last "response" action passed to ``step``.
    """

    metadata: dict[str, Any]
    client: Client
    url: str | None = None
    live_url: str | None = None
    # The task to use for the environment reset
    task: Task | None = None
    build_data: dict[str, Any]

    # final response
    final_response: str | None = None

    async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]:
        """
        Invoke every function described by ``configs`` and collect the results.

        Args:
            configs: A single config or a list of configs; each one is expanded
                with ``expand_config`` before being invoked in order.

        Returns:
            list[Any]: One result per expanded config, in invocation order.
        """
        configs_all = configs if isinstance(configs, list) else [configs]
        results = []
        for config in configs_all:
            for expanded_config in expand_config(config):
                result, stdout, stderr = await self.client.invoke(expanded_config)
                results.append(result)
                # Decode leniently: the output is only logged, and a strict decode
                # would raise on non-UTF-8 bytes after the call already ran.
                if stdout:
                    logger.info(
                        "%s produced stdout:\n%s",
                        expanded_config.function,
                        stdout.decode(errors="replace"),
                    )
                if stderr:
                    logger.warning(
                        "%s produced stderr:\n%s",
                        expanded_config.function,
                        stderr.decode(errors="replace"),
                    )
        return results

    async def _setup(self, config: HudStyleConfigs | None = None) -> None:
        """
        Set up the environment.

        With a ``RemoteClient`` the call is wrapped by ``create_remote_config``
        using ``REMOTE_SETUP``. Otherwise ``config`` is invoked directly, falling
        back to ``self.task.config``.

        Args:
            config: The configuration to use for the setup.

        Raises:
            ValueError: For a non-remote client when neither ``config`` nor a
                task config is available.
        """
        if isinstance(self.client, RemoteClient):
            await self._invoke_all(create_remote_config(self, config, REMOTE_SETUP))
            return

        if config is not None:
            await self._invoke_all(config)
        elif self.task and self.task.config is not None:
            await self._invoke_all(self.task.config)
        else:
            raise ValueError("No config or task provided for local environment")

    async def evaluate(self, config: HudStyleConfigs | None = None) -> Any:
        """
        Evaluate the environment.

        With a ``RemoteClient`` the call is wrapped by ``create_remote_config``
        using ``REMOTE_EVALUATE``. Otherwise ``config`` is invoked directly,
        falling back to ``self.task.config``.

        Args:
            config: The configuration to use for the evaluation.

        Returns:
            Any: The single result when exactly one function was invoked,
            otherwise the list of all results.

        Raises:
            ValueError: For a non-remote client when neither ``config`` nor a
                task config is available.
        """
        if isinstance(self.client, RemoteClient):
            results = await self._invoke_all(
                create_remote_config(self, config, REMOTE_EVALUATE)
            )
        elif config is not None:
            results = await self._invoke_all(config)
        elif self.task and self.task.config is not None:
            results = await self._invoke_all(self.task.config)
        else:
            raise ValueError("No config or task provided for local environment")

        if len(results) == 1:
            return results[0]
        return results

    async def reset(self, configs: HudStyleConfigs | None = None) -> tuple[
        Observation, dict[str, Any]
    ]:
        """
        Reset the environment and return the first observation.

        Args:
            configs: The configuration to use for the reset.
                NOTE(review): currently unused because the ``_setup`` call
                below is disabled.

        Returns:
            Observation: The first observation, obtained from an empty ``step``;
                its ``text`` is replaced by the task prompt when one is set.
            info: Dictionary of information about the environment.
        """
        #await self._setup(configs)
        obs, _, _, info = await self.step()
        if self.task and self.task.prompt:
            obs.text = self.task.prompt
        return obs, info

    async def step(self, actions: list[CLA] | None = None) -> tuple[
        Observation, float, bool, dict[str, Any]
    ]:
        """
        Execute a step in the environment.

        If the last action is a "response" action, its text is stored as
        ``final_response`` and returned as the observation without contacting
        the client.

        Args:
            actions: The actions to execute; ``None`` or empty sends an empty
                action list.

        Returns:
            tuple: ``(observation, reward, done, info)``. This implementation
            always returns ``0`` for the reward, ``False`` for done and an
            empty info dict.
        """
        if actions is None or len(actions) == 0:
            actions = []
        args = [[action.model_dump() for action in actions]]

        # TODO: Move this into the server side
        if self._maybe_store_response(actions):
            return Observation(text=self.final_response), 0, False, {}

        result, stdout, stderr = await self.client.invoke(
            HudStyleConfig(function="step", args=args)
        )
        # Lenient decode: output is only logged and must not abort the step.
        if stdout:
            logger.info("Step produced stdout: %s", stdout.decode(errors="replace"))
        if stderr:
            logger.warning("Step produced stderr: %s", stderr.decode(errors="replace"))

        observation = Observation.model_validate(result["observation"], strict=True)

        return observation, 0, False, {}

    def _maybe_store_response(self, actions: list[CLA]) -> bool:
        """
        Store the final response into the environment.

        Only the last action is inspected.

        Args:
            actions: The action(s) to check.

        Returns:
            bool: True if the last action was a "response" action and its text
            was stored in ``final_response``, False otherwise.
        """
        if len(actions) > 0 and actions[-1].type == "response":
            self.final_response = actions[-1].text
            return True
        return False

    async def get_urls(self) -> dict[str, Any]:
        """
        Fetch the environment URLs and cache them on the instance.

        Invokes ``get_urls`` through the client and stores the ``url`` and
        ``live_url`` entries (``None`` when absent) on ``self``.

        Returns:
            dict: ``{"url": ..., "live_url": ...}``.
        """
        data, _, _ = await self.client.invoke(HudStyleConfig(function="get_urls", args=[]))

        self.url = data.get("url")
        self.live_url = data.get("live_url")

        return {
            "url": self.url,
            "live_url": self.live_url,
        }

    async def close(self) -> None:
        """
        Close the environment by closing the underlying client.
        """
        await self.client.close()
204
+
205
def _append_final_response(args: Any, response: str) -> list[Any]:
    """Return a new list made of ``args`` (wrapped in a list if needed) plus ``response``."""
    existing = args if isinstance(args, list) else [args]
    # Build a fresh list so a list shared with the caller's config is not mutated.
    return [*existing, response]


def create_remote_config(
    env: Environment | None = None,
    config: HudStyleConfigs | None = None,
    function: str | None = None,
) -> list[HudStyleConfig]:
    """
    Build the config list used to run a setup or evaluate call remotely.

    Args:
        env: Environment object; ``env.task`` and ``env.final_response`` are read.
        config: Direct configuration override (e.g. from ``env.evaluate(config=...)``).
        function: The top-level function name, typically the setup or evaluate
            function. When ``None``, ``config`` is expanded and returned as is.

    Returns:
        list[HudStyleConfig]: When ``function`` is given, a list holding a single
        ``HudStyleConfig`` chosen by the first matching case below. When
        ``function`` is ``None``, the result of ``expand_config(config)``.

    Raises:
        ValueError: If both ``function`` and ``config`` are missing, or if
            ``config`` is missing and no task is available on ``env``.

    Cases (examples assume ``function="evaluate"``):

    1) Explicit ``config``: the expanded config becomes the args of the
       top-level call; ``final_response`` is appended to the first expanded
       config's args when set.
       ``config=("contains_text", "Paris")``, ``final_response="Paris"`` ->
       ``[HudStyleConfig(function='evaluate', args=[
           HudStyleConfig(function='contains_text', args=['Paris', 'Paris'])])]``

    2) Task attribute named after ``function`` (e.g. ``task.evaluate``): it is
       expanded and becomes the args; the task id is set on the first expanded
       config if present, and ``final_response`` is appended if present.
       ``Task(id="t1", evaluate=("check_answer",))``, ``final_response="42"`` ->
       ``[HudStyleConfig(function='evaluate', args=[
           HudStyleConfig(function='check_answer', args=['42'], id='t1')])]``

    3) ``task.config``: a copy of the dict becomes the single argument, with
       ``id`` added if present and ``final_response`` appended to its ``args``.
       ``Task(id="t2", config={"expected": "val"})`` ->
       ``[HudStyleConfig(function='evaluate', args=[{"expected": "val", "id": "t2"}])]``

    4) Only ``task.id``: calls the prefixed private function with the id
       (and ``final_response`` when set).
       ``Task(id="t3")`` -> ``[HudStyleConfig(function='private_evaluate', args=['t3'])]``

    5) Nothing else applies: calls ``function`` with ``final_response`` as the
       only argument when set, otherwise with empty args.
    """
    # If no function provided, just expand the config and return it directly
    if function is None:
        if config:
            return expand_config(config)
        raise ValueError("Either function or config must be provided")

    final_response = env.final_response if env else None

    # Case 1: Explicit config provided
    if config:
        expanded_configs = expand_config(config)
        if final_response:
            # for remote responses
            expanded_configs[0].args = _append_final_response(
                expanded_configs[0].args, final_response
            )
        return [HudStyleConfig(function=function, args=expanded_configs)]

    # Otherwise, use the environment's task
    task = env.task if env else None

    # Must have a task for the remaining cases
    if task is None:
        raise ValueError("Either task or config must be provided")

    # Case 2: Task has the specified function attribute
    task_config = getattr(task, function, None)
    if task_config:
        expanded_configs = expand_config(task_config)
        if task.id:
            expanded_configs[0].id = task.id  # for remote IDs
        # Independent of the id: a task with an id must still get the response.
        if final_response:
            expanded_configs[0].args = _append_final_response(
                expanded_configs[0].args, final_response
            )
        return [HudStyleConfig(function=function, args=expanded_configs)]

    # Case 3: Check for task.config
    if hasattr(task, "config") and task.config:
        # Only a dict config is carried over; anything else starts from an empty dict.
        final_args = task.config.copy() if isinstance(task.config, dict) else {}
        if task.id:
            final_args["id"] = task.id  # for remote IDs
        if final_response:
            # The dict copy is shallow, so replace "args" with a new list instead
            # of appending to the list still referenced by task.config.
            final_args["args"] = _append_final_response(
                final_args.get("args", []), final_response
            )
        return [HudStyleConfig(function=function, args=[final_args])]

    # Case 4: Use task.id
    if task.id:
        args_list = [task.id]
        if final_response:
            args_list.append(final_response)
        return [HudStyleConfig(function=f"{REMOTE_FUNCTION_PREFIX}{function}", args=args_list)]

    # Case 5: No valid configuration found
    args_list = []
    if final_response:
        args_list.append(final_response)
    return [HudStyleConfig(function=function, args=args_list)]
354
+
hud/env/local_docker_client.py ADDED
@@ -0,0 +1,251 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import logging
5
+ import tarfile
6
+ import tempfile
7
+ import uuid
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ import aiodocker
11
+ from aiohttp import ClientTimeout
12
+
13
+ from hud.env.docker_client import DockerClient, EnvironmentStatus
14
+ from hud.utils import ExecuteResult
15
+
16
+ if TYPE_CHECKING:
17
+ from aiodocker.containers import DockerContainer
18
+ from aiodocker.stream import Stream
19
+
20
+ logger = logging.getLogger("hud.env.docker_env_client")
21
+
22
class LocalDockerClient(DockerClient):
    """
    Docker-based environment client that talks to Docker through ``aiodocker``.
    """

    @classmethod
    async def create(cls, dockerfile: str, ports: list[int] | None = None) -> tuple[
        LocalDockerClient, dict[str, Any]
    ]:
        """
        Build an image from a Dockerfile, start a container and wrap it in a client.

        Args:
            dockerfile: The dockerfile content to build the Docker image.
            ports: TCP ports to expose on the container; all exposed ports are
                published. Defaults to no ports.

        Returns:
            tuple: The client instance and a dict holding the build log under
            ``"build_output"``.
        """
        # Create a unique image tag
        image_tag = f"hud-env-{uuid.uuid4().hex[:8]}"

        docker_client = aiodocker.Docker()
        try:
            dockerfile_bytes = dockerfile.encode("utf-8")

            if ports is None:
                ports = []

            # The build call takes a gzipped tar archive containing the Dockerfile.
            with tempfile.NamedTemporaryFile() as f:
                with tarfile.open(mode="w:gz", fileobj=f) as t:
                    dfinfo = tarfile.TarInfo("Dockerfile")
                    dfinfo.size = len(dockerfile_bytes)
                    t.addfile(dfinfo, io.BytesIO(dockerfile_bytes))

                # Rewind so the build reads the archive from the start
                f.seek(0)

                build_stream = await docker_client.images.build(
                    fileobj=f,
                    encoding="gzip",
                    tag=image_tag,
                    rm=True,
                    pull=True,
                    forcerm=True,
                )

            # Log the build output and keep it for the caller
            output_parts = []
            for chunk in build_stream:
                if "stream" in chunk:
                    logger.info(chunk["stream"])
                    output_parts.append(chunk["stream"])
            output = "".join(output_parts)

            # Create and start the container
            container_config = {
                "Image": image_tag,
                "Tty": True,
                "OpenStdin": True,
                "Cmd": None,
                "HostConfig": {
                    "PublishAllPorts": True,
                },
                "ExposedPorts": {
                    f"{port}/tcp": {} for port in ports
                },
            }

            container = await docker_client.containers.create(config=container_config)
            await container.start()
        except BaseException:
            # No client instance will own the connection on failure, so close it
            # here instead of leaking the session.
            await docker_client.close()
            raise

        return cls(docker_client, container.id), {"build_output": output}

    def __init__(self, docker_conn: aiodocker.Docker, container_id: str) -> None:
        """
        Initialize the client.

        Args:
            docker_conn: Docker client connection; it is closed by ``close``.
            container_id: ID of the Docker container to control.
        """
        super().__init__()

        # Store container ID instead of container object
        self._container_id = container_id

        self._docker = docker_conn

    @property
    def container_id(self) -> str:
        """Get the container ID."""
        return self._container_id

    @container_id.setter
    def container_id(self, value: str) -> None:
        """Set the container ID."""
        self._container_id = value

    async def _get_container(self) -> DockerContainer:
        """Get the container object from aiodocker."""
        return await self._docker.containers.get(self.container_id)

    async def get_status(self) -> EnvironmentStatus:
        """
        Get the current status of the Docker environment.

        Returns:
            EnvironmentStatus: ``RUNNING`` for a running container,
            ``INITIALIZING`` for created/starting, ``COMPLETED`` for
            exited/dead/removing/paused, and ``ERROR`` for any other state or
            when the container cannot be inspected.
        """
        try:
            container = await self._get_container()
            container_data = await container.show()

            # Check the container state
            state = container_data.get("State", {})
            status = state.get("Status", "").lower()

            if status == "running":
                return EnvironmentStatus.RUNNING
            if status in ("created", "starting"):
                return EnvironmentStatus.INITIALIZING
            if status in ("exited", "dead", "removing", "paused"):
                return EnvironmentStatus.COMPLETED
            # Any other state is considered an error
            return EnvironmentStatus.ERROR

        except Exception:
            # If we can't connect to the container or there's any other error
            return EnvironmentStatus.ERROR

    async def execute(
        self,
        command: list[str],
        *,
        timeout: int | None = None,
    ) -> ExecuteResult:
        """
        Execute a command in the container.

        Args:
            command: Command to execute.
            timeout: Total timeout passed to ``ClientTimeout`` for the exec
                stream; ``None`` means no timeout.

        Returns:
            ExecuteResult: Captured stdout and stderr plus the exit code
            reported by Docker (``0`` if Docker reports none).
        """
        container = await self._get_container()

        exec_result = await container.exec(
            cmd=command,
        )

        stdout_data = bytearray()
        stderr_data = bytearray()

        # The context manager closes the stream even if reading fails.
        async with exec_result.start(timeout=ClientTimeout(timeout), detach=False) as output:
            while True:
                message = await output.read_out()
                if message is None:
                    break
                if message.stream == 1:  # stdout
                    stdout_data.extend(message.data)
                elif message.stream == 2:  # stderr
                    stderr_data.extend(message.data)

        # The stream carries no exit status; ask Docker for it once output ends.
        exec_info = await exec_result.inspect()
        exit_code = exec_info.get("ExitCode")

        return ExecuteResult(
            stdout=bytes(stdout_data),
            stderr=bytes(stderr_data),
            exit_code=exit_code if exit_code is not None else 0,
        )

    async def get_archive(self, path: str) -> bytes:
        """
        Get an archive of a path from the container.

        Args:
            path: Path in the container to archive.

        Returns:
            bytes: Tar archive containing the path contents.

        Raises:
            TypeError: If the archive returned by aiodocker is not backed by a
                ``BytesIO`` object.
        """
        container = await self._get_container()

        # Named "archive" so the imported tarfile module is not shadowed.
        archive = await container.get_archive(path)
        # The raw bytes are read from the archive's backing file object.
        fileobj = archive.fileobj
        if not isinstance(fileobj, io.BytesIO):
            raise TypeError("fileobj is not a BytesIO object")
        return fileobj.getvalue()

    async def put_archive(self, path: str, data: bytes) -> None:
        """
        Put an archive of data at a path in the container.

        Args:
            path: Path in the container to extract the archive to.
            data: Bytes of the tar archive to extract.
        """
        container = await self._get_container()

        # Convert bytes to a file-like object for aiodocker
        file_obj = io.BytesIO(data)
        await container.put_archive(path=path, data=file_obj)

    async def close(self) -> None:
        """
        Close the Docker environment by stopping and removing the container.

        Cleanup errors are logged rather than raised; the Docker connection is
        closed in all cases.
        """
        try:
            container = await self._get_container()
            await container.stop()
            await container.delete()
        except Exception as e:
            # Log the error but don't raise it since this is cleanup
            logger.warning("Error during Docker container cleanup: %s", e)
        finally:
            await self._docker.close()